Unverified commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Syling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -11,21 +11,22 @@ from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrained
InputDataClass = NewType("InputDataClass", Any)
"""
A DataCollator is a function that takes a list of samples from a Dataset and collates them into a batch, as a dictionary
of Tensors.
"""
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, torch.Tensor]])
def default_data_collator(features: List[InputDataClass]) -> Dict[str, torch.Tensor]:
"""
Very simple data collator that simply collates batches of dict-like objects and performs special handling for
potential keys named:
- ``label``: handles a single value (int or float) per object
- ``label_ids``: handles a list of values per object
Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs to
the model. See glue and ner for example of how it's useful.
"""
# In this function we'll make the assumption that all `features` in the batch
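A minimal usage sketch of the collation behaviour described above, with a toy list of feature dicts (the field values are illustrative, not taken from the diff)::

    from transformers import default_data_collator

    # Three dict-like examples with a single int label each (made-up token ids).
    features = [
        {"input_ids": [101, 2023, 102], "attention_mask": [1, 1, 1], "label": 0},
        {"input_ids": [101, 2129, 102], "attention_mask": [1, 1, 1], "label": 1},
        {"input_ids": [101, 2054, 102], "attention_mask": [1, 1, 1], "label": 0},
    ]

    batch = default_data_collator(features)
    # Each key becomes a stacked tensor; the ``label`` key is collected under ``labels``.
    print({k: v.shape for k, v in batch.items()})
    # e.g. {'labels': torch.Size([3]), 'input_ids': torch.Size([3, 3]), 'attention_mask': torch.Size([3, 3])}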
@@ -73,11 +74,11 @@ class DataCollatorWithPadding:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
@@ -87,8 +88,8 @@ class DataCollatorWithPadding:
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
"""
tokenizer: PreTrainedTokenizerBase
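A short sketch of how the options above are typically combined, assuming a tokenizer has already been loaded (the checkpoint name is illustrative)::

    from transformers import AutoTokenizer, DataCollatorWithPadding

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Pad dynamically to the longest sequence in each batch, rounded up to a multiple
    # of 8 so Tensor Cores can be used on compute capability >= 7.5 (Volta) GPUs.
    collator = DataCollatorWithPadding(tokenizer, padding="longest", pad_to_multiple_of=8)

    features = [tokenizer("a short sentence"), tokenizer("a slightly longer example sentence")]
    batch = collator(features)  # dict of tensors, all padded to the same length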
@@ -117,6 +118,7 @@ class DataCollatorWithPadding:
class DataCollatorForLanguageModeling:
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
@@ -198,6 +200,7 @@ class DataCollatorForLanguageModeling:
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
"""
Data collator used for language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for masked language modeling
"""
@@ -275,8 +278,8 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
def mask_tokens(self, inputs: torch.Tensor, mask_labels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Setting
'mask_labels' means we use whole word masking (wwm): we directly mask indices according to its ref.
"""
if self.tokenizer.mask_token is None:
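The 80% / 10% / 10% split described above follows the standard BERT masking recipe; a minimal standalone sketch of that recipe (not the collator's actual implementation)::

    import torch

    def bert_style_mask(inputs, mask_positions, mask_token_id, vocab_size):
        """Apply 80% [MASK] / 10% random token / 10% keep-original to the selected positions."""
        labels = inputs.clone()
        labels[~mask_positions] = -100  # compute the MLM loss only on masked positions

        # 80% of the selected positions are replaced by the mask token
        replaced = torch.bernoulli(torch.full(inputs.shape, 0.8)).bool() & mask_positions
        inputs[replaced] = mask_token_id

        # half of the remaining 20% get a random token, the rest keep the original token
        randomized = torch.bernoulli(torch.full(inputs.shape, 0.5)).bool() & mask_positions & ~replaced
        inputs[randomized] = torch.randint(vocab_size, inputs.shape)[randomized]
        return inputs, labels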
@@ -316,6 +319,7 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
class DataCollatorForSOP(DataCollatorForLanguageModeling):
"""
Data collator used for sentence order prediction task.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for both masked language modeling and sentence order prediction
"""
@@ -342,8 +346,8 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
original. N-gram not applied yet.
"""
if self.tokenizer.mask_token is None:
raise ValueError(
@@ -385,6 +389,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
class DataCollatorForPermutationLanguageModeling:
"""
Data collator used for permutation language modeling.
- collates batches of tensors, honoring their tokenizer's pad_token
- preprocesses batches for permutation language modeling with procedures specific to XLNet
"""
@@ -425,10 +430,14 @@ class DataCollatorForPermutationLanguageModeling:
The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be masked)
2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be masked
3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length - span_length]`` and mask tokens ``start_index:start_index + span_length``
4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in the sequence to be processed), repeat from Step 1.
"""
if self.tokenizer.mask_token is None:
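A compact sketch of the span-selection loop described in steps 0-4 above (a standalone illustration, not the collator's exact implementation; names mirror the docstring)::

    import torch

    def sample_masked_spans(seq_len, max_span_length=5, plm_probability=1 / 6):
        """Return a boolean mask selecting the spans to predict, following steps 0-4."""
        masked = torch.zeros(seq_len, dtype=torch.bool)
        cur_len = 0
        while cur_len < seq_len:
            # 1. length of the span of tokens to be masked
            span_length = torch.randint(1, max_span_length + 1, (1,)).item()
            # 2. context reserved around the span
            context_length = int(span_length / plm_probability)
            # 3. starting point of the span inside the context
            start_index = cur_len + torch.randint(context_length - span_length + 1, (1,)).item()
            masked[start_index : start_index + span_length] = True
            # 4. advance past the context and repeat
            cur_len += context_length
        return masked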
@@ -517,8 +526,7 @@ class DataCollatorForPermutationLanguageModeling:
@dataclass
class DataCollatorForNextSentencePrediction:
"""
Data collator used for next sentence prediction.
- collates examples which contain pre-generated negative examples
- preprocesses batches for masked language modeling
"""
@@ -531,9 +539,12 @@ class DataCollatorForNextSentencePrediction:
def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
"""
The input should contain negative examples; :class:`~transformers.DataCollatorForNextSentencePrediction` will
not generate any negative examples.
Args:
examples (:obj:`List[Dict]`): Each dictionary should have the following keys:
- ``tokens_a``: A sequence of tokens, which should appear before ``tokens_b`` in the text.
- ``tokens_b``: A sequence of tokens, which should appear after ``tokens_a`` in the text.
- ``is_random_next``: 1 if this pair is generated randomly, else 0.
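An illustrative entry for the ``examples`` argument described above (the token ids are made up; values are shown as tensors to match the type hint)::

    import torch

    example = {
        "tokens_a": torch.tensor([2023, 2003, 1996, 2034, 6251]),  # ids of the first segment
        "tokens_b": torch.tensor([1998, 2182, 2003, 1996, 2117]),  # ids of the segment that follows (or a random one)
        "is_random_next": torch.tensor(1),  # 1 if tokens_b was sampled randomly, else 0
    }
    # A batch is simply a list of such dictionaries: collator([example, ...])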
......
@@ -23,9 +23,8 @@ class GlueDataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify them on the command
line.
"""
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
@@ -55,8 +54,7 @@ class Split(Enum):
class GlueDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
args: GlueDataTrainingArguments
......
@@ -19,8 +19,7 @@ logger = logging.get_logger(__name__)
class TextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
@@ -91,8 +90,7 @@ class TextDataset(Dataset):
class LineByLineTextDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
@@ -118,8 +116,7 @@ class LineByLineTextDataset(Dataset):
class LineByLineWithRefDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
@@ -294,8 +291,7 @@ class LineByLineWithSOPTextDataset(Dataset):
class TextDatasetForNextSentencePrediction(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
def __init__(
......
@@ -86,8 +86,7 @@ class Split(Enum):
class SquadDataset(Dataset):
"""
This will be superseded by a framework-agnostic approach soon.
"""
args: SquadDataTrainingArguments
......
""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was """
modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to
update `find_best_threshold` scripts for SQuAD V2.0
In addition to basic functionality, we also compute additional statistics and In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an
plot precision-recall curves if an additional na_prob.json file is provided. additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted
This file is expected to map question ID's to the model's predicted probability probability that a question is unanswerable.
that a question is unanswerable.
""" """
...@@ -589,8 +589,9 @@ def compute_predictions_log_probs( ...@@ -589,8 +589,9 @@ def compute_predictions_log_probs(
tokenizer, tokenizer,
verbose_logging, verbose_logging,
): ):
"""XLNet write prediction logic (more complex than Bert's). """
Write final predictions to the json file and log-odds of null if needed. XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of
null if needed.
Requires utils_squad_evaluate.py Requires utils_squad_evaluate.py
""" """
......
@@ -52,9 +52,9 @@ def glue_convert_examples_to_features(
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
``InputFeatures`` which can be fed to the model.
"""
if is_tf_available() and isinstance(examples, tf.data.Dataset):
......
@@ -314,8 +314,8 @@ def squad_convert_examples_to_features(
tqdm_enabled=True,
):
"""
Converts a list of examples into a list of features that can be directly given as input to a model. It is
model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
Args:
examples: list of :class:`~transformers.data.processors.squad.SquadExample`
@@ -326,8 +326,7 @@ def squad_convert_examples_to_features(
is_training: whether to create features for model evaluation or model training.
padding_strategy: Default to "max_length". Which padding strategy to use
return_dataset: Default False. Either 'pt' or 'tf'.
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
threads: multiple processing threads
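A hedged usage sketch built from the arguments listed above (the data path and size limits are illustrative)::

    from transformers import AutoTokenizer, SquadV2Processor, squad_convert_examples_to_features

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    examples = SquadV2Processor().get_dev_examples("path/to/squad_data")  # illustrative path

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",  # 'pt' -> torch TensorDataset, 'tf' -> tf.data.Dataset
    )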
@@ -528,8 +527,8 @@ def squad_convert_examples_to_features(
class SquadProcessor(DataProcessor):
"""
Processor for the SQuAD data set. Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
version 2.0 of SQuAD, respectively.
"""
train_file = None
@@ -745,9 +744,9 @@ class SquadExample:
class SquadFeatures:
"""
Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
:class:`~transformers.data.processors.squad.SquadExample` using the
:method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
......
@@ -55,14 +55,13 @@ class InputExample:
@dataclass(frozen=True)
class InputFeatures:
"""
A single set of features of data. Property names are the same names as the corresponding inputs to a model.
Args:
input_ids: Indices of input sequence tokens in the vocabulary.
attention_mask: Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded)
tokens.
token_type_ids: (Optional) Segment token indices to indicate first and second
portions of the inputs. Only some models use them.
label: (Optional) Label corresponding to the input. Int for classification problems,
@@ -83,7 +82,8 @@ class DataProcessor:
"""Base class for data converters for sequence classification data sets."""
def get_example_from_tensor_dict(self, tensor_dict):
"""
Gets an example from a dict with tensorflow tensors.
Args:
tensor_dict: Keys and values should match the corresponding Glue
@@ -108,8 +108,10 @@ class DataProcessor:
raise NotImplementedError()
def tfds_map(self, example):
"""
Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
examples to the correct format.
"""
if len(self.get_labels()) > 1:
example.label = self.get_labels()[int(example.label)]
return example
@@ -253,9 +255,9 @@ class SingleSentenceClassificationProcessor(DataProcessor):
actual values)
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
``InputFeatures`` which can be fed to the model.
"""
if max_length is None:
......
@@ -26,8 +26,10 @@ logger = logging.get_logger(__name__)
class XnliProcessor(DataProcessor):
"""
Processor for the XNLI dataset. Adapted from
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
"""
def __init__(self, language, train_language=None):
self.language = language
......
""" """
Utilities for working with the local dataset cache. Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp https://github.com/allenai/allennlp Copyright by the AllenNLP authors.
Copyright by the AllenNLP authors.
""" """
import fnmatch import fnmatch
@@ -433,10 +432,9 @@ def add_start_docstrings_to_callable(*docstr):
note = r"""
.. note::
Although the recipe for forward pass needs to be defined within this function, one should call the
:class:`Module` instance afterwards instead of this since the former takes care of running the pre and post
processing steps while the latter silently ignores them.
"""
fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
@@ -454,20 +452,18 @@ def add_end_docstrings(*docstr):
PT_RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` (if
``return_dict=True`` is passed or when ``config.return_dict=True``) or a tuple of :obj:`torch.FloatTensor`
comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
"""
TF_RETURN_INTRODUCTION = r"""
Returns:
:class:`~{full_output_type}` or :obj:`tuple(tf.Tensor)`: A :class:`~{full_output_type}` (if
``return_dict=True`` is passed or when ``config.return_dict=True``) or a tuple of :obj:`tf.Tensor` comprising
various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs.
"""
@@ -831,19 +827,16 @@ def is_remote_url(url_or_filename):
def hf_bucket_url(model_id: str, filename: str, use_cdn=True, mirror=None) -> str:
"""
Resolve a model identifier, and a file name, to a HF-hosted url on either S3 or Cloudfront (a Content Delivery
Network, or CDN).
Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
bandwidth costs). However, it is more aggressively cached by default, so may not always reflect the latest changes
to the underlying file (default TTL is 24 hours).
In terms of client-side caching from this library, even though Cloudfront relays the ETags from S3, using one or
the other (or switching from one to the other) will affect caching: cached files are not shared between the two
because the cached file's name contains a hash of the url.
"""
endpoint = (
PRESET_MIRROR_DICT.get(mirror, mirror)
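A simplified sketch of the resolution logic described above; the prefix constants below are illustrative stand-ins, not the library's actual values::

    # Illustrative endpoints only -- the real constants live in file_utils.py.
    S3_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
    CDN_PREFIX = "https://cdn.huggingface.co"

    def resolve_url(model_id: str, filename: str, use_cdn: bool = True) -> str:
        """Prefer the CDN for speed; use S3 when the freshest copy matters (the CDN caches ~24h)."""
        endpoint = CDN_PREFIX if use_cdn else S3_PREFIX
        return f"{endpoint}/{model_id}/{filename}"

    # resolve_url("bert-base-uncased", "config.json")
    # -> 'https://cdn.huggingface.co/bert-base-uncased/config.json'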
@@ -861,12 +854,10 @@ def hf_bucket_url(model_id: str, filename: str, use_cdn=True, mirror=None) -> st
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can
identify it as a HDF5 file (see
https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
"""
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
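A condensed sketch of the naming scheme the docstring describes (hashing logic only, not the full library function)::

    from hashlib import sha256

    def to_cached_filename(url: str, etag: str = None) -> str:
        filename = sha256(url.encode("utf-8")).hexdigest()
        if etag:
            filename += "." + sha256(etag.encode("utf-8")).hexdigest()
        if url.endswith(".h5"):
            filename += ".h5"  # lets TF 2.0 recognise Keras HDF5 weights
        return filename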
@@ -885,8 +876,8 @@ def url_to_filename(url, etag=None):
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`. Raise ``EnvironmentError`` if `filename` or
its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
@@ -921,10 +912,10 @@ def cached_path(
local_files_only=False,
) -> Optional[str]:
"""
Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file
and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and
then return the path.
Args:
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
force_download: if True, re-download the file even if it's already cached in the cache dir.
@@ -936,8 +927,8 @@ def cached_path(
re-extract the archive and override the folder where it was extracted.
Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
otherwise
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
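A brief usage sketch combining the helpers above (the model id and filename are illustrative)::

    from transformers.file_utils import cached_path, hf_bucket_url

    url = hf_bucket_url("bert-base-uncased", "config.json")
    local_path = cached_path(url)        # downloads on first call, reuses the cache afterwards
    same_path = cached_path(local_path)  # an existing local path is simply validated and returned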
@@ -1045,12 +1036,12 @@ def get_from_cache(
local_files_only=False,
) -> Optional[str]:
"""
Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
path to the cached file.
Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
otherwise
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
@@ -1213,8 +1204,8 @@ def is_tensor(x):
class ModelOutput(OrderedDict):
"""
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a regular
python dictionary.
.. warning::
You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple`
......
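An illustrative interaction with the ``ModelOutput`` indexing behaviour described in the excerpt above, using one concrete output class (the tensor values are made up)::

    import torch
    from transformers.modeling_outputs import SequenceClassifierOutput

    out = SequenceClassifierOutput(logits=torch.randn(1, 2))  # loss/hidden_states/attentions stay None

    out["logits"]    # dict-style access
    out[0]           # integer indexing skips the None fields, so this is the logits as well
    out.to_tuple()   # (logits,) -- use this instead of unpacking the output directly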
@@ -84,8 +84,8 @@ class TFGenerationMixin:
Parameters:
input_ids (:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size, sequence_length)`, `optional`):
The sequence used as a prompt for the generation. If :obj:`None` the method initializes it as an empty
:obj:`tf.Tensor` of shape :obj:`(1,)`.
max_length (:obj:`int`, `optional`, defaults to 20):
The maximum length of the sequence to be generated.
min_length (:obj:`int`, `optional`, defaults to 10):
@@ -141,9 +141,9 @@ class TFGenerationMixin:
Return:
:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size * num_return_sequences,
sequence_length)`: The generated sequences. The second dimension (sequence_length) is either equal to
:obj:`max_length` or shorter if all batches finished early due to the :obj:`eos_token_id`.
Examples::
@@ -428,8 +428,9 @@ class TFGenerationMixin:
attention_mask,
use_cache,
):
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated
independently.
"""
# length of generated sentences / unfinished sentences
@@ -976,7 +977,9 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
"""
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
@@ -1044,9 +1047,8 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
def sample_without_replacement(logits, num_samples):
"""
Categorical sampling without replacement is currently not implemented; the Gumbel-max trick will do for now. See
https://github.com/tensorflow/tensorflow/issues/9260 for more info
"""
z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
_, indices = tf.nn.top_k(logits + z, num_samples)
@@ -1094,8 +1096,8 @@ class BeamHypotheses(object):
def is_done(self, best_sum_logprobs, cur_len):
"""
If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
one in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
......
@@ -150,8 +150,8 @@ class GenerationMixin:
Parameters:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
The sequence used as a prompt for the generation. If :obj:`None` the method initializes it as an empty
:obj:`torch.LongTensor` of shape :obj:`(1,)`.
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
initial input_ids for the decoder of encoder-decoder type models. If :obj:`None` then only
decoder_start_token_id is passed as the first token to the decoder.
@@ -210,9 +210,9 @@ class GenerationMixin:
Return:
:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
batches finished early due to the :obj:`eos_token_id`.
Examples::
@@ -531,8 +531,9 @@ class GenerationMixin:
use_cache,
model_kwargs,
):
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated
independently.
"""
# length of generated sentences / unfinished sentences
unfinished_sents = input_ids.new(batch_size).fill_(1)
@@ -935,8 +936,10 @@ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iter
def set_scores_to_inf_for_banned_tokens(scores: torch.Tensor, banned_tokens: List[List[int]]) -> None:
"""
Modifies the scores in place by setting the banned token positions to `-inf`. Banned tokens are expected to be a
list of lists in the format [[batch index, vocabulary position],...]
Args:
scores: logits distribution of shape (batch size, vocabulary size)
banned_tokens: list of list of tokens to ban of length (batch_size)
@@ -965,7 +968,9 @@ def top_k_top_p_filtering(
filter_value: float = -float("Inf"),
min_tokens_to_keep: int = 1,
) -> Tensor:
"""
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
Args:
logits: logits distribution shape (batch size, vocabulary size)
if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
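A minimal standalone sketch of the top-k / top-p filtering idea described above (not the library's exact implementation)::

    import torch
    import torch.nn.functional as F

    def filter_logits(logits, top_k=0, top_p=1.0, filter_value=-float("Inf")):
        if top_k > 0:
            # keep only the top_k highest-scoring tokens
            kth_best = torch.topk(logits, top_k).values[..., -1, None]
            logits = logits.masked_fill(logits < kth_best, filter_value)
        if top_p < 1.0:
            # keep the smallest set of tokens whose cumulative probability exceeds top_p
            sorted_logits, sorted_idx = torch.sort(logits, descending=True)
            cum_probs = F.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
            to_remove = cum_probs > top_p
            to_remove[..., 1:] = to_remove[..., :-1].clone()  # shift right so the token crossing the threshold is kept
            to_remove[..., 0] = False
            logits = logits.masked_fill(to_remove.scatter(-1, sorted_idx, to_remove), filter_value)
        return logits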
@@ -1033,8 +1038,8 @@ class BeamHypotheses(object):
def is_done(self, best_sum_logprobs, cur_len):
"""
If there are enough hypotheses and none of the hypotheses being generated can become better than the worst
one in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
......
@@ -104,11 +104,9 @@ class HfApi:
"""
Call HF API to sign in a user and get a token if credentials are valid.
Outputs: token if credentials are valid
Throws: requests.exceptions.HTTPError if credentials are invalid
"""
path = "{}/api/login".format(self.endpoint)
r = requests.post(path, json={"username": username, "password": password})
@@ -152,8 +150,7 @@ class HfApi:
"""
Get a presigned url, then upload file to S3.
Outputs: url: Read-only url for the stored file on S3.
"""
urls = self.presign(token, filename=filename, organization=organization)
# streaming upload:
@@ -206,11 +203,10 @@ class HfApi:
class TqdmProgressFileReader:
"""
Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) and override `f.read()` so as to display a
tqdm progress bar.
see github.com/huggingface/transformers/pull/2078#discussion_r354739608 for implementation details.
"""
def __init__(self, f: io.BufferedReader):
@@ -254,8 +250,7 @@ class HfFolder:
@classmethod
def delete_token(cls):
"""
Delete token. Do not fail if token does not exist.
"""
try:
os.remove(cls.path_token)
......
@@ -13,12 +13,11 @@ DataClassType = NewType("DataClassType", Any)
class HfArgumentParser(ArgumentParser):
"""
This subclass of `argparse.ArgumentParser` uses type hints on dataclasses to generate arguments.
The class is designed to play well with the native argparse. In particular, you can add more (non-dataclass backed)
arguments to the parser after initialization and you'll get the output back after parsing as an additional
namespace.
"""
dataclass_types: Iterable[DataClassType]
@@ -27,8 +26,7 @@ class HfArgumentParser(ArgumentParser):
"""
Args:
dataclass_types:
Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args.
kwargs:
(Optional) Passed to `argparse.ArgumentParser()` in the regular way.
"""
@@ -94,33 +92,27 @@ class HfArgumentParser(ArgumentParser):
"""
Parse command-line args into instances of the specified dataclass types.
This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at:
docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args
Args:
args:
List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser)
return_remaining_strings:
If true, also return a list of remaining argument strings.
look_for_args_file:
If true, will look for a ".args" file with the same base name as the entry point script for this
process, and will append its potential content to the command line args.
args_filename:
If not None, will use this file instead of the ".args" file specified in the previous argument.
Returns:
Tuple consisting of:
- the dataclass instances in the same order as they were passed to the initializer.
- if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser
after initialization.
- The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args)
"""
if args_filename or (look_for_args_file and len(sys.argv)):
if args_filename:
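A short usage sketch of the parser described above (the dataclass and its fields are invented for illustration)::

    from dataclasses import dataclass, field
    from transformers import HfArgumentParser

    @dataclass
    class TrainingConfig:
        # Hypothetical arguments, only for the sake of the example.
        model_name: str = field(metadata={"help": "Model identifier to fine-tune."})
        learning_rate: float = field(default=5e-5, metadata={"help": "Initial learning rate."})

    parser = HfArgumentParser(TrainingConfig)
    (config,) = parser.parse_args_into_dataclasses(["--model_name", "bert-base-uncased"])
    print(config.model_name, config.learning_rate)  # bert-base-uncased 5e-05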
@@ -155,8 +147,8 @@ class HfArgumentParser(ArgumentParser):
def parse_json_file(self, json_file: str) -> Tuple[DataClass, ...]:
"""
Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the
dataclass types.
"""
data = json.loads(Path(json_file).read_text())
outputs = []
@@ -169,8 +161,8 @@ class HfArgumentParser(ArgumentParser):
def parse_dict(self, args: dict) -> Tuple[DataClass, ...]:
"""
Alternative helper method that does not use `argparse` at all, instead using a dict to populate the dataclass
types.
"""
outputs = []
for dtype in self.dataclass_types:
......
@@ -298,8 +298,7 @@ class TensorBoardCallback(TrainerCallback):
class WandbCallback(TrainerCallback):
"""
A :class:`~transformers.TrainerCallback` that sends the logs to `Weights & Biases <https://www.wandb.com/>`__.
"""
def __init__(self):
@@ -310,8 +309,8 @@ class WandbCallback(TrainerCallback):
"""
Setup the optional Weights & Biases (`wandb`) integration.
One can subclass and override this method to customize the setup if needed. Find more information `here
<https://docs.wandb.com/huggingface>`__. You can also override the following environment variables:
Environment:
WANDB_WATCH (:obj:`str`, `optional` defaults to :obj:`"gradients"`):
@@ -368,8 +367,7 @@ class WandbCallback(TrainerCallback):
class CometCallback(TrainerCallback):
"""
A :class:`~transformers.TrainerCallback` that sends the logs to `Comet ML <https://www.comet.ml/site/>`__.
"""
def __init__(self):
@@ -388,8 +386,8 @@ class CometCallback(TrainerCallback):
COMET_OFFLINE_DIRECTORY (:obj:`str`, `optional`):
Folder to use for saving offline experiments when :obj:`COMET_MODE` is "OFFLINE"
For a number of configurable items in the environment, see `here
<https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables>`__.
"""
self._initialized = True
if state.is_world_process_zero:
@@ -424,8 +422,7 @@ class CometCallback(TrainerCallback):
class MLflowCallback(TrainerCallback):
"""
A :class:`~transformers.TrainerCallback` that sends the logs to `MLflow <https://www.mlflow.org/>`__.
"""
MAX_LOG_SIZE = 100
@@ -443,10 +440,9 @@ class MLflowCallback(TrainerCallback):
HF_MLFLOW_LOG_ARTIFACTS (:obj:`str`, `optional`):
Whether to use MLflow .log_artifact() facility to log artifacts.
This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, will copy
whatever is in TrainerArgument's output_dir to the local or remote artifact storage. Using it without a
remote storage will just copy the files to your artifact location.
"""
log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper()
if log_artifacts in {"TRUE", "1"}:
......
...@@ -36,18 +36,14 @@ logger = logging.get_logger(__name__)
class ModelCard:
r"""
Structured Model Card class. Stores a model card as well as methods for loading/downloading/saving model cards.
Please read the paper "Model Cards for Model Reporting" by Margaret Mitchell, Simone Wu, Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer, Inioluwa Deborah Raji and Timnit Gebru for details and explanation of the sections, and for the proposal behind model cards. Link: https://arxiv.org/abs/1810.03993
Note: A model card can be loaded and saved to disk.
Parameters:
"""
...@@ -85,37 +81,46 @@ class ModelCard:
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a :class:`~transformers.ModelCard` from a pre-trained model's model card.
Parameters:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model card should be cached if the standard cache should not be used.
kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
- The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
find_from_standard_name: (`optional`) boolean, default True:
If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename. Can be used to directly feed a model/config url and access the colocated modelcard.
return_unused_kwargs: (`optional`) bool:
- If False, then this function returns just the final model card object.
- If True, then this function returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: i.e. the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
Examples::
...
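A minimal sketch of the loading paths above (the shortcut name is only an example, and ``extra_field`` is a hypothetical kwarg used only to illustrate ``return_unused_kwargs``)::

    from transformers import ModelCard

    card = ModelCard.from_pretrained("bert-base-uncased")  # shortcut name, downloaded and cached
    card, unused = ModelCard.from_pretrained(
        "bert-base-uncased", return_unused_kwargs=True, extra_field="value"
    )
    # `unused` holds the kwargs that are not model card attributes, here {"extra_field": "value"}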
...@@ -453,8 +453,9 @@ class AlbertTransformer(nn.Module):
class AlbertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
"""
config_class = AlbertConfig
...@@ -486,16 +487,16 @@ class AlbertForPreTrainingOutput(ModelOutput):
prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
sop_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
...@@ -514,14 +515,15 @@ ALBERT_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior.
Args:
config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
ALBERT_INPUTS_DOCSTRING = r"""
...@@ -529,35 +531,33 @@ ALBERT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.AlbertTokenizer`. See :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
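A minimal sketch of producing the tensors described above with the matching tokenizer (the checkpoint name is only an example and is downloaded on first use)::

    from transformers import AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    encoding = tokenizer("Sentence A", "Sentence B", return_tensors="pt")
    # input_ids, attention_mask and token_type_ids as described above
    print(encoding["input_ids"], encoding["attention_mask"], encoding["token_type_ids"])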
...@@ -615,17 +615,16 @@ class AlbertModel(AlbertPreTrainedModel):
return self.embeddings.word_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}.
ALBERT has a different architecture in that its layers are shared across groups, which then have inner groups. If an ALBERT model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers.
These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden group, while [2,3] correspond to the two inner groups of the second hidden group. Any layer with an index other than [0,1,2,3] will result in an error. See the base class PreTrainedModel for more information about head pruning.
"""
for layer, heads in heads_to_prune.items():
group_idx = int(layer / self.config.inner_group_num)
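A minimal sketch of the flattened index scheme described in the docstring above, using a small hypothetical configuration (the sizes are chosen only to keep the model light)::

    from transformers import AlbertConfig, AlbertModel

    config = AlbertConfig(
        hidden_size=768, num_attention_heads=12, intermediate_size=3072,
        num_hidden_layers=12, num_hidden_groups=2, inner_group_num=2,
    )
    model = AlbertModel(config)
    # flattened index 0 -> group 0, inner layer 0; flattened index 2 -> group 1, inner layer 0
    model.prune_heads({0: [0], 2: [1]})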
...@@ -706,8 +705,10 @@ class AlbertModel(AlbertPreTrainedModel):
@add_start_docstrings(
"""
Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `sentence order prediction (classification)` head.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForPreTraining(AlbertPreTrainedModel):
...@@ -745,15 +746,13 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
):
r"""
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring). Indices should be in ``[0, 1]``: ``0`` indicates original order (sequence A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A).
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
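A minimal sketch of building masked language modeling labels that follow the ``-100`` convention above (the token ids and the random masking rate are purely illustrative)::

    import torch

    input_ids = torch.tensor([[2, 145, 98, 3241, 3]])
    labels = input_ids.clone()
    mask = torch.rand(labels.shape) < 0.15  # select ~15% of the positions
    labels[~mask] = -100                    # everything else is ignored by the loss
    # only the positions kept in `labels` contribute to the masked language modeling loss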
...@@ -903,10 +902,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
...@@ -952,8 +950,10 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
@add_start_docstrings(
"""
Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForSequenceClassification(AlbertPreTrainedModel):
...@@ -989,9 +989,8 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), if ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1036,8 +1035,10 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
@add_start_docstrings(
"""
Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForTokenClassification(AlbertPreTrainedModel):
...@@ -1076,8 +1077,8 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1123,8 +1124,10 @@ class AlbertForTokenClassification(AlbertPreTrainedModel):
@add_start_docstrings(
"""
Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ALBERT_START_DOCSTRING,
)
class AlbertForQuestionAnswering(AlbertPreTrainedModel):
...@@ -1164,12 +1167,12 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1223,8 +1226,10 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
@add_start_docstrings(
"""
Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
""",
ALBERT_START_DOCSTRING,
)
class AlbertForMultipleChoice(AlbertPreTrainedModel):
...@@ -1259,9 +1264,9 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors (see `input_ids` above).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
...
...@@ -462,9 +462,9 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
List options
...@@ -517,12 +517,10 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
Whether or not to delete incompletely received files. Will attempt to resume the download if such a file exists.
proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to only look at local files (e.g., not try downloading the model).
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
...@@ -546,8 +544,8 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
class AutoModel:
r"""
This is a generic model class that will be instantiated as one of the base model classes of the library when created with the :meth:`~transformers.AutoModel.from_pretrained` class method or the :meth:`~transformers.AutoModel.from_config` class method.
This class cannot be instantiated directly using ``__init__()`` (throws an error).
...@@ -567,9 +565,8 @@ class AutoModel:
Instantiates one of the base model classes of the library from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModel.from_pretrained` to load the model weights.
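A minimal sketch contrasting the two paths described in the note above (the checkpoint name is only an example; ``from_config`` builds the architecture with randomly initialized weights)::

    from transformers import AutoConfig, AutoModel

    config = AutoConfig.from_pretrained("bert-base-uncased")
    model_untrained = AutoModel.from_config(config)                    # architecture only, no pretrained weights
    model_pretrained = AutoModel.from_pretrained("bert-base-uncased")  # downloads and loads the weights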
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -661,9 +658,9 @@ class AutoModelForPreTraining:
model---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForPreTraining.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -761,9 +758,9 @@ class AutoModelWithLMHead:
Instantiates one of the model classes of the library---with a language modeling head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelWithLMHead.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -844,8 +841,8 @@ class AutoModelWithLMHead:
class AutoModelForCausalLM:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a causal language modeling head---when created with the :meth:`~transformers.AutoModelForCausalLM.from_pretrained` class method or the :meth:`~transformers.AutoModelForCausalLM.from_config` class method.
...@@ -867,9 +864,9 @@ class AutoModelForCausalLM:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForCausalLM.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -938,8 +935,8 @@ class AutoModelForCausalLM:
class AutoModelForMaskedLM:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a masked language modeling head---when created with the :meth:`~transformers.AutoModelForMaskedLM.from_pretrained` class method or the :meth:`~transformers.AutoModelForMaskedLM.from_config` class method.
...@@ -961,9 +958,9 @@ class AutoModelForMaskedLM:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForMaskedLM.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -1055,9 +1052,9 @@ class AutoModelForSeq2SeqLM:
head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForSeq2SeqLM.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -1153,9 +1150,9 @@ class AutoModelForSequenceClassification:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForSequenceClassification.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -1250,9 +1247,9 @@ class AutoModelForQuestionAnswering:
Instantiates one of the model classes of the library---with a question answering head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForQuestionAnswering.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -1327,8 +1324,8 @@ class AutoModelForQuestionAnswering:
class AutoModelForTokenClassification:
r"""
This is a generic model class that will be instantiated as one of the model classes of the library---with a token classification head---when created with the :meth:`~transformers.AutoModelForTokenClassification.from_pretrained` class method or the :meth:`~transformers.AutoModelForTokenClassification.from_config` class method.
...@@ -1349,9 +1346,9 @@ class AutoModelForTokenClassification:
Instantiates one of the model classes of the library---with a token classification head---from a configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForTokenClassification.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...@@ -1449,9 +1446,9 @@ class AutoModelForMultipleChoice:
configuration.
Note:
Loading a model from its configuration file does **not** load the model weights. It only affects the model's configuration. Use :meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` to load the model weights.
Args:
config (:class:`~transformers.PretrainedConfig`):
...
...@@ -68,14 +68,15 @@ BART_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior.
Parameters:
config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
...@@ -103,14 +104,13 @@ BART_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
Indices can be obtained using :class:`~transformers.BartTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
...@@ -127,16 +127,16 @@ BART_INPUTS_DOCSTRING = r"""
modify to your needs. See diagram 1 in `the paper <https://arxiv.org/abs/1910.13461>`__ for more information on the default strategy.
encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`):
Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)` is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
past_key_values (:obj:`Tuple[Dict[str: torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids`` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`).
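A minimal sketch of summarization with BART, which exercises the cached ``past_key_values`` path internally during decoding (the checkpoint name is only an example and is downloaded on first use)::

    from transformers import BartForConditionalGeneration, BartTokenizer

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
    inputs = tokenizer("My friends are cool but they eat too many carbs.", return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=20, use_cache=True)
    print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))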
...@@ -160,9 +160,10 @@ def invert_mask(attention_mask):
def _prepare_bart_decoder_inputs(
config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32
):
"""
Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if none are provided. This mimics the default behavior in fairseq. To override it, pass in masks. Note: this is not called during generation.
"""
pad_token_id = config.pad_token_id
if decoder_input_ids is None:
...@@ -292,8 +293,8 @@ class EncoderLayer(nn.Module):
class BartEncoder(nn.Module):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a :class:`EncoderLayer`.
Args:
config: BartConfig
...@@ -334,14 +335,14 @@ class BartEncoder(nn.Module):
Args:
input_ids (LongTensor): tokens in the source language of shape `(batch, src_len)`
attention_mask (torch.LongTensor): indicating which indices are padding tokens.
Returns:
BaseModelOutput or Tuple comprised of:
- **x** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)`
- **encoder_states** (tuple(torch.FloatTensor)): all intermediate hidden states of shape `(src_len, batch, embed_dim)`. Only populated if *output_hidden_states* is True.
- **all_attentions** (tuple(torch.FloatTensor)): Attention weights for each layer. During training might not be of length n_layers because of layer dropout.
"""
...@@ -482,8 +483,8 @@ class DecoderLayer(nn.Module):
class BartDecoder(nn.Module):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`DecoderLayer`.
Args:
config: BartConfig
embed_tokens (torch.nn.Embedding): output embedding
...@@ -530,8 +531,8 @@ class BartDecoder(nn.Module):
**unused,
):
"""
Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al., EMNLP 2019).
Args:
input_ids (LongTensor): previous decoder outputs of shape
...@@ -543,6 +544,7 @@ class BartDecoder(nn.Module):
Returns:
BaseModelOutputWithPast or tuple:
- the decoder's features of shape `(batch, tgt_len, embed_dim)`
- the cache
- hidden states
...@@ -783,10 +785,9 @@ class BartClassificationHead(nn.Module):
class LearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to the forward function.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset):
...@@ -1000,10 +1001,9 @@ class BartForConditionalGeneration(PretrainedBartModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
Returns:
...@@ -1128,7 +1128,10 @@ class BartForConditionalGeneration(PretrainedBartModel):
@add_start_docstrings(
"""
Bart model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks.
""",
BART_START_DOCSTRING,
)
class BartForSequenceClassification(PretrainedBartModel):
...@@ -1166,9 +1169,8 @@ class BartForSequenceClassification(PretrainedBartModel):
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
...@@ -1214,8 +1216,10 @@ class BartForSequenceClassification(PretrainedBartModel):
@add_start_docstrings(
"""
BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BART_START_DOCSTRING,
)
class BartForQuestionAnswering(PretrainedBartModel):
...@@ -1254,12 +1258,12 @@ class BartForQuestionAnswering(PretrainedBartModel):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if start_positions is not None and end_positions is not None:
...@@ -1332,8 +1336,9 @@ class SinusoidalPositionalEmbedding(nn.Embedding):
@staticmethod
def _init_weight(out: nn.Parameter):
"""
Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in the 2nd half of the vector. [dim // 2:]
"""
n_pos, dim = out.shape
position_enc = np.array(
...
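A minimal numpy sketch of the layout described in ``_init_weight`` above: sine features fill the first half of each vector and cosine features the second half (the sizes are arbitrary)::

    import numpy as np

    n_pos, dim = 6, 8
    position_enc = np.array(
        [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
    )
    out = np.zeros((n_pos, dim))
    sentinel = dim // 2
    out[:, :sentinel] = np.sin(position_enc[:, 0::2])  # sin features, first half
    out[:, sentinel:] = np.cos(position_enc[:, 1::2])  # cos features, second half of the vector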
...@@ -580,8 +580,9 @@ class BertPreTrainingHeads(nn.Module): ...@@ -580,8 +580,9 @@ class BertPreTrainingHeads(nn.Module):
class BertPreTrainedModel(PreTrainedModel): class BertPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and """
a simple interface for downloading and loading pretrained models. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
""" """
config_class = BertConfig config_class = BertConfig
...@@ -614,16 +615,16 @@ class BertForPreTrainingOutput(ModelOutput): ...@@ -614,16 +615,16 @@ class BertForPreTrainingOutput(ModelOutput):
prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
continuation before SoftMax). before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`. of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
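To make the tuple shapes above concrete: the same ``hidden_states``/``attentions`` fields appear on most model outputs, so a plain :class:`BertModel` is enough to inspect them. The checkpoint name is only an assumption::

    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    inputs = tokenizer("Hello world", return_tensors="pt")
    outputs = model(**inputs, output_hidden_states=True, output_attentions=True, return_dict=True)

    # embeddings + 12 layers -> 13 entries of shape (batch_size, sequence_length, hidden_size)
    print(len(outputs.hidden_states), outputs.hidden_states[0].shape)
    # 12 layers of shape (batch_size, num_heads, sequence_length, sequence_length)
    print(len(outputs.attentions), outputs.attentions[0].shape)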
...@@ -642,14 +643,15 @@ BERT_START_DOCSTRING = r""" ...@@ -642,14 +643,15 @@ BERT_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
BERT_INPUTS_DOCSTRING = r""" BERT_INPUTS_DOCSTRING = r"""
...@@ -657,35 +659,33 @@ BERT_INPUTS_DOCSTRING = r""" ...@@ -657,35 +659,33 @@ BERT_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.BertTokenizer`. Indices can be obtained using :class:`~transformers.BertTokenizer`. See
See :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
:meth:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`_ `What are token type IDs? <../glossary.html#token-type-ids>`_
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence token in the position embeddings. Indices of positions of each input sequence token in the position embeddings. Selected in the range ``[0,
Selected in the range ``[0, config.max_position_embeddings - 1]``. config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_ `What are position IDs? <../glossary.html#position-ids>`_
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
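The ``input_ids``/``attention_mask``/``token_type_ids`` tensors described above are exactly what the tokenizer returns; a short sketch (the checkpoint name and sentences are placeholders)::

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    encoded = tokenizer("sentence A", "sentence B", return_tensors="pt")

    print(encoded["input_ids"])       # vocabulary indices, shape (1, sequence_length)
    print(encoded["attention_mask"])  # 1 = not masked (attend), 0 = padding
    print(encoded["token_type_ids"])  # 0 = sentence A tokens, 1 = sentence B tokens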
...@@ -712,17 +712,15 @@ BERT_INPUTS_DOCSTRING = r""" ...@@ -712,17 +712,15 @@ BERT_INPUTS_DOCSTRING = r"""
class BertModel(BertPreTrainedModel): class BertModel(BertPreTrainedModel):
""" """
The model can behave as an encoder (with only self-attention) as well The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
as a decoder, in which case a layer of cross-attention is added between cross-attention is added between the self-attention layers, following the architecture described in `Attention is
the self-attention layers, following the architecture described in `Attention is all you need all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
<https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration
To behave as a decoder the model needs to be initialized with the set to :obj:`True`. To be used in a Seq2Seq model, the model needs to be initialized with both :obj:`is_decoder`
:obj:`is_decoder` argument of the configuration set to :obj:`True`. argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
To be used in a Seq2Seq model, the model needs to be initialized with both :obj:`is_decoder` input to the forward pass.
argument and :obj:`add_cross_attention` set to :obj:`True`; an
:obj:`encoder_hidden_states` is then expected as an input to the forward pass.
""" """
def __init__(self, config, add_pooling_layer=True): def __init__(self, config, add_pooling_layer=True):
...@@ -743,9 +741,9 @@ class BertModel(BertPreTrainedModel): ...@@ -743,9 +741,9 @@ class BertModel(BertPreTrainedModel):
self.embeddings.word_embeddings = value self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune): def _prune_heads(self, heads_to_prune):
"""Prunes heads of the model. """
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
See base class PreTrainedModel. class PreTrainedModel.
""" """
for layer, heads in heads_to_prune.items(): for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads) self.encoder.layer[layer].attention.prune_heads(heads)
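For instance, pruning heads 0 and 2 of layer 0 and head 1 of layer 2 can go through the public :meth:`~transformers.PreTrainedModel.prune_heads` wrapper, which ends up calling the loop above. A sketch, with the checkpoint name assumed::

    from transformers import BertModel

    model = BertModel.from_pretrained("bert-base-uncased")
    model.prune_heads({0: [0, 2], 2: [1]})  # {layer_num: [head indices to prune in this layer]}

    # the pruned layers now run with fewer attention heads
    print(model.encoder.layer[0].attention.self.num_attention_heads)  # 10
    print(model.encoder.layer[2].attention.self.num_attention_heads)  # 11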
...@@ -773,12 +771,11 @@ class BertModel(BertPreTrainedModel): ...@@ -773,12 +771,11 @@ class BertModel(BertPreTrainedModel):
): ):
r""" r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
if the model is configured as a decoder. the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
is used in the cross-attention if the model is configured as a decoder. the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
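A rough sketch of the decoder setup these two arguments are meant for, wiring ``encoder_hidden_states`` into a :class:`BertModel` that has ``is_decoder`` and ``add_cross_attention`` enabled; the checkpoint, text and shapes below are just an illustration::

    import torch
    from transformers import BertConfig, BertModel, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    config = BertConfig.from_pretrained("bert-base-uncased", is_decoder=True, add_cross_attention=True)
    decoder = BertModel.from_pretrained("bert-base-uncased", config=config)

    inputs = tokenizer("Target side text", return_tensors="pt")
    # stand-in for a real encoder output: (batch_size, source_length, hidden_size)
    encoder_hidden_states = torch.randn(1, 7, config.hidden_size)
    encoder_attention_mask = torch.ones(1, 7, dtype=torch.long)  # 1 = not masked

    outputs = decoder(
        **inputs,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        return_dict=True,
    )
    print(outputs.last_hidden_state.shape)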
...@@ -855,8 +852,10 @@ class BertModel(BertPreTrainedModel): ...@@ -855,8 +852,10 @@ class BertModel(BertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and """
a `next sentence prediction (classification)` head. """, Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next
sentence prediction (classification)` head.
""",
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForPreTraining(BertPreTrainedModel): class BertForPreTraining(BertPreTrainedModel):
...@@ -890,13 +889,12 @@ class BertForPreTraining(BertPreTrainedModel): ...@@ -890,13 +889,12 @@ class BertForPreTraining(BertPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
in ``[0, ..., config.vocab_size]``
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
Indices should be in ``[0, 1]``: (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
- 0 indicates sequence B is a continuation of sequence A, - 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence. - 1 indicates sequence B is a random sequence.
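A hedged sketch of how those two label tensors are typically built: masked-LM ``labels`` use ``-100`` everywhere except the positions being predicted, and ``next_sentence_label`` is ``0`` for a true continuation. Checkpoint, texts and the masked position are placeholders::

    import torch
    from transformers import BertTokenizer, BertForPreTraining

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")

    inputs = tokenizer("The capital of France is Paris.", "It is a large city.", return_tensors="pt")

    # MLM labels: -100 everywhere except the single position we mask out
    labels = torch.full_like(inputs["input_ids"], -100)
    masked_position = 6
    labels[0, masked_position] = inputs["input_ids"][0, masked_position]
    inputs["input_ids"][0, masked_position] = tokenizer.mask_token_id

    next_sentence_label = torch.tensor([0])  # 0 = sentence B really follows sentence A

    outputs = model(**inputs, labels=labels, next_sentence_label=next_sentence_label, return_dict=True)
    print(outputs.loss)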
...@@ -1004,20 +1002,18 @@ class BertLMHeadModel(BertPreTrainedModel): ...@@ -1004,20 +1002,18 @@ class BertLMHeadModel(BertPreTrainedModel):
): ):
r""" r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
if the model is configured as a decoder. the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
is used in the cross-attention if the model is configured as a decoder. the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the left-to-right language modeling loss (next word prediction). Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
in ``[0, ..., config.vocab_size]``
Returns: Returns:
...@@ -1132,10 +1128,9 @@ class BertForMaskedLM(BertPreTrainedModel): ...@@ -1132,10 +1128,9 @@ class BertForMaskedLM(BertPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated. Used to hide legacy arguments that have been deprecated.
""" """
...@@ -1229,7 +1224,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): ...@@ -1229,7 +1224,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
r""" r"""
next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see ``input_ids`` docstring). Indices should be in ``[0, 1]``: (see ``input_ids`` docstring). Indices should be in ``[0, 1]``:
- 0 indicates sequence B is a continuation of sequence A, - 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence. - 1 indicates sequence B is a random sequence.
...@@ -1288,8 +1283,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel): ...@@ -1288,8 +1283,10 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of """
the pooled output) e.g. for GLUE tasks. """, Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForSequenceClassification(BertPreTrainedModel): class BertForSequenceClassification(BertPreTrainedModel):
...@@ -1325,9 +1322,8 @@ class BertForSequenceClassification(BertPreTrainedModel): ...@@ -1325,9 +1322,8 @@ class BertForSequenceClassification(BertPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss).
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss).
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
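The ``num_labels`` switch above decides between regression and classification; a sketch of the classification case, with the checkpoint name and label value as placeholders::

    import torch
    from transformers import BertTokenizer, BertForSequenceClassification

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

    inputs = tokenizer("A short review to classify.", return_tensors="pt")
    labels = torch.tensor([2])  # class index in [0, num_labels - 1]

    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_labels)
    # with num_labels=1 the same call would compute an MSE regression loss instead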
...@@ -1372,8 +1368,10 @@ class BertForSequenceClassification(BertPreTrainedModel): ...@@ -1372,8 +1368,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with a multiple choice classification head on top (a linear layer on top of """
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
""",
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForMultipleChoice(BertPreTrainedModel): class BertForMultipleChoice(BertPreTrainedModel):
...@@ -1408,9 +1406,9 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1408,9 +1406,9 @@ class BertForMultipleChoice(BertPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
of the input tensors. (See :obj:`input_ids` above) :obj:`input_ids` above)
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
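The second dimension of the inputs is the number of choices, which is what the line above reads back; a hedged sketch with two candidate endings (checkpoint and texts are placeholders)::

    import torch
    from transformers import BertTokenizer, BertForMultipleChoice

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMultipleChoice.from_pretrained("bert-base-uncased")

    prompt = "She opened the door and"
    choices = ["walked inside.", "the moon is made of cheese."]

    # encode each (prompt, choice) pair, then add a batch axis: (1, num_choices=2, seq_len)
    encoded = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
    inputs = {k: v.unsqueeze(0) for k, v in encoded.items()}
    labels = torch.tensor([0])  # index of the correct choice

    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_choices)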
...@@ -1461,8 +1459,10 @@ class BertForMultipleChoice(BertPreTrainedModel): ...@@ -1461,8 +1459,10 @@ class BertForMultipleChoice(BertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForTokenClassification(BertPreTrainedModel): class BertForTokenClassification(BertPreTrainedModel):
...@@ -1501,8 +1501,8 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1501,8 +1501,8 @@ class BertForTokenClassification(BertPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
Indices should be in ``[0, ..., config.num_labels - 1]``. 1]``.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
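A small NER-style sketch for the per-token labels described above; the label ids are made up and the checkpoint name is an assumption::

    import torch
    from transformers import BertTokenizer, BertForTokenClassification

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=5)

    inputs = tokenizer("Jean lives in Paris", return_tensors="pt")
    seq_len = inputs["input_ids"].shape[1]
    labels = torch.zeros(1, seq_len, dtype=torch.long)  # one label in [0, num_labels - 1] per token

    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, sequence_length, num_labels)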
...@@ -1550,8 +1550,10 @@ class BertForTokenClassification(BertPreTrainedModel): ...@@ -1550,8 +1550,10 @@ class BertForTokenClassification(BertPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear """
layer on top of the hidden-states output to compute `span start logits` and `span end logits`). """, Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
BERT_START_DOCSTRING, BERT_START_DOCSTRING,
) )
class BertForQuestionAnswering(BertPreTrainedModel): class BertForQuestionAnswering(BertPreTrainedModel):
...@@ -1591,12 +1593,12 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1591,12 +1593,12 @@ class BertForQuestionAnswering(BertPreTrainedModel):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......