Unverified Commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Styling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -13,8 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl.
"""
@@ -87,15 +87,13 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
    """
    Params:
        hidden :: [len*bsz x d_proj]
        labels :: [len*bsz]

    Return:
        if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
        else: out :: [(len-1)*bsz] negative log likelihood.

    We could replace this implementation by the native PyTorch one if theirs had an option to set bias on all
    clusters. See:
    https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
    """
    if labels is not None:
@@ -191,15 +189,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
        return out

    def log_prob(self, hidden):
        r"""
        Computes log probabilities for all :math:`n\_classes`. From:
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py

        Args:
            hidden (Tensor): a minibatch of examples

        Returns:
            log-probabilities for each class :math:`c` in range :math:`0 <= c <= n\_classes`, where
            :math:`n\_classes` is a parameter passed to the ``AdaptiveLogSoftmaxWithLoss`` constructor.

        Shape:
            - Input: :math:`(N, in\_features)`
            - Output: :math:`(N, n\_classes)`
        """
...
@@ -287,8 +287,8 @@ class ModuleUtilsMixin:
            Whether or not the attention scores are computed by chunks.

    Returns:
        :obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
        list with :obj:`[None]` for each layer.
    """
    if head_mask is not None:
        head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
@@ -358,9 +358,9 @@ class ModuleUtilsMixin:
    """
    Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
    batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
    tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper
    <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
    re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.

    Args:
        batch_size (:obj:`int`):
@@ -390,23 +390,24 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
    * prune heads in the self-attention modules.

    Class attributes (overridden by derived classes):

        - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
          :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
        - **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch
          model, taking as arguments:

            - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the
              TensorFlow checkpoint.
            - **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to
              the model.
            - **path** (:obj:`str`) -- A path to the TensorFlow checkpoint.

        - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
          derived classes of the same architecture adding modules on top of the base model.
        - **authorized_missing_keys** (:obj:`Optional[List[str]]`) -- A list of regex patterns of tensor names to
          ignore when loading the model (and avoid unnecessary warnings).
        - **keys_to_never_save** (:obj:`Optional[List[str]]`) -- A list of tensor names to ignore when saving the
          model (useful for keys that aren't trained, but which are deterministic).
    """

    config_class = None
@@ -684,9 +685,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
    Arguments:
        heads_to_prune (:obj:`Dict[int, List[int]]`):
            Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
            heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
            0 and 2 on layer 1 and heads 2 and 3 on layer 2.
    """
    # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
    for layer, heads in heads_to_prune.items():
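A minimal sketch of the :obj:`prune_heads` call documented above; the checkpoint name is purely illustrative and not taken from this commit::

    from transformers import AutoModel

    model = AutoModel.from_pretrained("bert-base-uncased")  # any PreTrainedModel subclass works
    # Prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2, exactly as in the docstring example.
    model.prune_heads({1: [0, 2], 2: [2, 3]})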
@@ -743,8 +744,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
    r"""
    Instantiate a pretrained PyTorch model from a pre-trained model configuration.

    The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To
    train the model, you should first set it back in training mode with ``model.train()``.

    The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
    pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
@@ -806,21 +807,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
            Whether or not to delete incompletely received files. Will attempt to resume the download if such a
            file exists.
        proxies (:obj:`Dict[str, str]`, `optional`):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        output_loading_info (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to only look at local files (e.g., not try downloading the model).
        use_cdn (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
            our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
        mirror (:obj:`str`, `optional`, defaults to :obj:`None`):
            Mirror source to accelerate downloads in China. If you are from China and have an accessibility
            problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
            Please refer to the mirror site for more information.
        kwargs (remaining dictionary of keyword arguments, `optional`):
            Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g.,
            :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
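A hedged sketch of the options documented in this hunk; the checkpoint name and proxy addresses are placeholders, not values from the commit::

    from transformers import AutoModel

    model = AutoModel.from_pretrained(
        "bert-base-uncased",
        output_attentions=True,            # extra kwarg forwarded to the configuration
        proxies={"http": "foo.bar:3128"},  # per-protocol proxies, as described above
        local_files_only=False,            # set to True to forbid any download attempt
    )
    # The model comes back in evaluation mode; switch to train mode before fine-tuning.
    model.train()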
@@ -1142,8 +1141,8 @@ class PoolerStartLogits(nn.Module):
        hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
            The final hidden states of the model.
        p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
            Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.

    Returns:
        :obj:`torch.FloatTensor`: The start logits for SQuAD.
@@ -1192,8 +1191,8 @@ class PoolerEndLogits(nn.Module):
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            The position of the first token for the labeled span.
        p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
            Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.

    .. note::
@@ -1296,13 +1295,15 @@ class SquadHeadOutput(ModelOutput):
    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
            (beam-search).
        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
@@ -1361,8 +1362,8 @@ class SQuADHead(nn.Module):
        is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Whether the question has a possible answer in the paragraph or not.
        p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
            Mask for tokens at invalid positions, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
            should be masked.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
@@ -1441,8 +1442,8 @@ class SequenceSummary(nn.Module):
    Args:
        config (:class:`~transformers.PretrainedConfig`):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
@@ -1455,7 +1456,7 @@
            - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
              :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
            - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
              output, another string or :obj:`None` will add no activation.
            - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
              activation.
@@ -1618,8 +1619,8 @@ def prune_layer(
        dim (:obj:`int`, `optional`): The dimension on which to keep the indices.

    Returns:
        :obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with
        :obj:`requires_grad=True`.
    """
    if isinstance(layer, nn.Linear):
        return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
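A small sketch of :obj:`prune_layer` on an :obj:`nn.Linear`, keeping only three output rows; the sizes are illustrative::

    import torch
    from torch import nn
    from transformers.modeling_utils import prune_layer

    layer = nn.Linear(768, 10)
    index = torch.tensor([0, 3, 7], dtype=torch.long)  # indices of the outputs to keep
    pruned = prune_layer(layer, index)                 # dim defaults to 0 for nn.Linear
    print(pruned.weight.shape)                         # torch.Size([3, 768]); requires_grad stays True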
@@ -1647,7 +1648,8 @@ def apply_chunking_to_forward(
        chunk_dim (:obj:`int`):
            The dimension over which the :obj:`input_tensors` should be chunked.
        input_tensors (:obj:`Tuple[torch.Tensor]`):
            The input tensors of ``forward_fn`` which will be chunked.

    Returns:
        :obj:`torch.Tensor`: A tensor with the same shape as the one :obj:`forward_fn` would have given if applied.
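A hedged sketch of chunking a feed-forward call over the sequence dimension, assuming the ``(forward_fn, chunk_size, chunk_dim, *input_tensors)`` argument order used by recent versions of the library (older releases took the chunk size and dimension first, so check your installed version); all sizes are made up::

    import torch
    from transformers.modeling_utils import apply_chunking_to_forward

    ffn = torch.nn.Linear(64, 64)

    def forward_chunk(hidden_states):
        return ffn(hidden_states)

    hidden_states = torch.randn(2, 128, 64)  # (batch, seq_len, hidden)
    # Apply forward_chunk to slices of 32 tokens along dimension 1 and concatenate the results.
    output = apply_chunking_to_forward(forward_chunk, 32, 1, hidden_states)
    assert output.shape == hidden_states.shape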
...
@@ -12,7 +12,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
PyTorch XLM model.
"""
@@ -228,8 +229,9 @@ class TransformerFFN(nn.Module):
class XLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = XLMConfig
@@ -278,7 +280,8 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
            (beam-search).
        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
@@ -289,8 +292,8 @@ class XLMForQuestionAnsweringOutput(ModelOutput):
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
@@ -312,14 +315,15 @@ XLM_START_DOCSTRING = r"""
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""

XLM_INPUTS_DOCSTRING = r"""
@@ -327,45 +331,43 @@ XLM_INPUTS_DOCSTRING = r"""
    input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using :class:`~transformers.XLMTokenizer`. See
        :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
        details.

        `What are input IDs? <../glossary.html#input-ids>`__
    attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
        Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        `What are attention masks? <../glossary.html#attention-mask>`__
    langs (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
        A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
        language ids which can be obtained from the language names by using two conversion mappings provided in
        the configuration of the model (only provided for multilingual models). More precisely, the `language name
        to language id` mapping is in :obj:`model.config.lang2id` (which is a dictionary string to int) and the
        `language id to language name` mapping is in :obj:`model.config.id2lang` (dictionary int to string).

        See usage examples detailed in the :doc:`multilingual documentation <../multilingual>`.
    token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
        Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
        1]``:

        - 0 corresponds to a `sentence A` token,
        - 1 corresponds to a `sentence B` token.

        `What are token type IDs? <../glossary.html#token-type-ids>`__
    position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
        Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
        config.max_position_embeddings - 1]``.

        `What are position IDs? <../glossary.html#position-ids>`__
    lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
        Length of each sentence that can be used to avoid performing attention on padding token indices. You can
        also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
        ``[0, ..., input_ids.size(-1)]``.
    cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
        Dictionary string to ``torch.FloatTensor`` that contains precomputed hidden states (key and values in the
        attention blocks) as computed by the model (see :obj:`cache` output below). Can be used to speed up
@@ -374,8 +376,7 @@ XLM_INPUTS_DOCSTRING = r"""
        The dictionary object will be modified in-place during the forward pass to add newly computed
        hidden-states.
    head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
        Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.
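An illustrative head mask for a hypothetical configuration with 12 layers and 16 heads, masking head 0 in every layer; the sizes are assumptions, not values from this diff::

    import torch

    num_layers, num_heads = 12, 16
    head_mask = torch.ones(num_layers, num_heads)  # 1 = keep the head
    head_mask[:, 0] = 0                            # 0 = mask the head
    # outputs = model(input_ids, head_mask=head_mask)  # passed together with the other inputs above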
@@ -478,9 +479,9 @@ class XLMModel(XLMPreTrainedModel):
        self.embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
        base class PreTrainedModel.
        """
        for layer, heads in heads_to_prune.items():
            self.attentions[layer].prune_heads(heads)
@@ -672,8 +673,10 @@ class XLMPredLayer(nn.Module):
@add_start_docstrings(
    """
    The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    XLM_START_DOCSTRING,
)
class XLMWithLMHeadModel(XLMPreTrainedModel):
@@ -726,11 +729,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
            ``-100`` are ignored (masked); the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
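A sketch of the labels convention just documented: for the LM loss you can pass the input ids back as labels, since the shift happens inside the model. The checkpoint name is an assumption::

    from transformers import XLMTokenizer, XLMWithLMHeadModel

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    outputs = model(**inputs, labels=inputs["input_ids"], return_dict=True)
    print(outputs.loss)  # language modeling loss over the non-ignored labels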
@@ -764,8 +765,10 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
    for GLUE tasks.
    """,
    XLM_START_DOCSTRING,
)
class XLMForSequenceClassification(XLMPreTrainedModel):
@@ -803,9 +806,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square
            loss); if :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
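A sketch of single-label classification with the convention above; the checkpoint name and number of labels are illustrative::

    import torch
    from transformers import XLMTokenizer, XLMForSequenceClassification

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForSequenceClassification.from_pretrained("xlm-mlm-en-2048", num_labels=2)

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    labels = torch.tensor([1])  # one label per example, in [0, config.num_labels - 1]
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # cross-entropy loss, logits of shape (1, 2)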
@@ -851,8 +853,10 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_START_DOCSTRING,
)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
@@ -891,12 +895,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
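A sketch of span labels for the simple QA head; the token positions are made up and, as noted above, get clamped to the sequence length::

    import torch
    from transformers import XLMTokenizer, XLMForQuestionAnsweringSimple

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForQuestionAnsweringSimple.from_pretrained("xlm-mlm-en-2048")

    inputs = tokenizer("Who ate the apple?", "The dog ate the apple.", return_tensors="pt")
    start_positions = torch.tensor([6])  # index of the first answer token in the joint sequence
    end_positions = torch.tensor([7])    # index of the last answer token
    outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions, return_dict=True)
    print(outputs.loss)  # average of the start and end cross-entropy losses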
@@ -953,8 +957,10 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD
    (linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_START_DOCSTRING,
)
class XLMForQuestionAnswering(XLMPreTrainedModel):
@@ -991,19 +997,20 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
            sequence are not taken into account for computing the loss.
        is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
            Labels whether a question has an answer or no answer (SQuAD 2.0).
        cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
            Labels for position (index) of the classification token to use as input for computing plausibility of the
            answer.
        p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means the token should be
            masked; 0.0 means the token is not masked.

        Returns:
@@ -1067,8 +1074,10 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    XLM_START_DOCSTRING,
)
class XLMForTokenClassification(XLMPreTrainedModel):
@@ -1107,8 +1116,8 @@ class XLMForTokenClassification(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
            1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
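A sketch of per-token labels in ``[0, ..., config.num_labels - 1]``; the checkpoint name and label count are illustrative::

    import torch
    from transformers import XLMTokenizer, XLMForTokenClassification

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForTokenClassification.from_pretrained("xlm-mlm-en-2048", num_labels=5)

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    labels = torch.zeros_like(inputs["input_ids"])  # one label per token, here all class 0
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, sequence_length, num_labels)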
@@ -1159,8 +1168,10 @@ class XLMForTokenClassification(XLMPreTrainedModel):
@add_start_docstrings(
    """
    XLM Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLM_START_DOCSTRING,
)
class XLMForMultipleChoice(XLMPreTrainedModel):
@@ -1198,9 +1209,9 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
            :obj:`input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
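A sketch of the :obj:`(batch_size, num_choices, sequence_length)` layout this head expects; the prompt and choices are invented and the checkpoint name is an assumption::

    import torch
    from transformers import XLMTokenizer, XLMForMultipleChoice

    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    model = XLMForMultipleChoice.from_pretrained("xlm-mlm-en-2048")

    prompt = "The sky is"
    choices = ["blue.", "made of cheese."]
    encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
    inputs = {key: tensor.unsqueeze(0) for key, tensor in encoding.items()}  # add the num_choices dimension
    labels = torch.tensor([0])  # index of the correct choice for each example in the batch
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (batch_size, num_choices)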
...
@@ -37,8 +37,8 @@ XLM_PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
class XLMProphetNetEncoder(ProphetNetEncoder):
    r"""
    This class overrides :class:`~transformers.ProphetNetEncoder`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
@@ -59,8 +59,8 @@ class XLMProphetNetEncoder(ProphetNetEncoder):
class XLMProphetNetDecoder(ProphetNetDecoder):
    r"""
    This class overrides :class:`~transformers.ProphetNetDecoder`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
@@ -81,8 +81,8 @@ class XLMProphetNetDecoder(ProphetNetDecoder):
class XLMProphetNetModel(ProphetNetModel):
    r"""
    This class overrides :class:`~transformers.ProphetNetModel`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
@@ -104,8 +104,8 @@ class XLMProphetNetModel(ProphetNetModel):
class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
    r"""
    This class overrides :class:`~transformers.ProphetNetForConditionalGeneration`. Please check the superclass for the
    appropriate documentation alongside usage examples.

    Example::
@@ -127,8 +127,8 @@ class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
    r"""
    This class overrides :class:`~transformers.ProphetNetForCausalLM`. Please check the superclass for the appropriate
    documentation alongside usage examples.

    Example::
...
@@ -48,14 +48,15 @@ XLM_ROBERTA_START_DOCSTRING = r"""
    methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to
    general usage and behavior.

    Parameters:
        config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the
            model weights.
"""
...@@ -65,8 +66,8 @@ XLM_ROBERTA_START_DOCSTRING = r""" ...@@ -65,8 +66,8 @@ XLM_ROBERTA_START_DOCSTRING = r"""
) )
class XLMRobertaModel(RobertaModel): class XLMRobertaModel(RobertaModel):
""" """
This class overrides :class:`~transformers.RobertaModel`. Please check the This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
...@@ -78,8 +79,8 @@ class XLMRobertaModel(RobertaModel): ...@@ -78,8 +79,8 @@ class XLMRobertaModel(RobertaModel):
) )
class XLMRobertaForCausalLM(RobertaForCausalLM): class XLMRobertaForCausalLM(RobertaForCausalLM):
""" """
This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the This class overrides :class:`~transformers.RobertaForCausalLM`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
...@@ -91,64 +92,72 @@ class XLMRobertaForCausalLM(RobertaForCausalLM): ...@@ -91,64 +92,72 @@ class XLMRobertaForCausalLM(RobertaForCausalLM):
) )
class XLMRobertaForMaskedLM(RobertaForMaskedLM): class XLMRobertaForMaskedLM(RobertaForMaskedLM):
""" """
This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate
superclass for the appropriate documentation alongside usage examples. documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer """
on top of the pooled output) e.g. for GLUE tasks. """, XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
""" """
This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of """
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
a softmax) e.g. for RocStories/SWAG tasks.
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
""" """
This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForTokenClassification(RobertaForTokenClassification): class XLMRobertaForTokenClassification(RobertaForTokenClassification):
""" """
This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
@add_start_docstrings( @add_start_docstrings(
"""XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a """
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).""", XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_START_DOCSTRING,
) )
class XLMRobertaForQuestionAnswering(RobertaForQuestionAnswering): class XLMRobertaForQuestionAnswering(RobertaForQuestionAnswering):
""" """
This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
superclass for the appropriate documentation alongside usage examples. appropriate documentation alongside usage examples.
""" """
config_class = XLMRobertaConfig config_class = XLMRobertaConfig
...@@ -13,7 +13,8 @@ ...@@ -13,7 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" PyTorch XLNet model. """
PyTorch XLNet model.
""" """
...@@ -58,9 +59,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [ ...@@ -58,9 +59,9 @@ XLNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
"""A map of modules from TF to PyTorch. """
I use a map to keep the PyTorch model as A map of modules from TF to PyTorch. I use a map to keep the PyTorch model as identical to the original PyTorch
identical to the original PyTorch model as possible. model as possible.
""" """
tf_to_pt_map = {} tf_to_pt_map = {}
...@@ -541,8 +542,9 @@ class XLNetLayer(nn.Module): ...@@ -541,8 +542,9 @@ class XLNetLayer(nn.Module):
class XLNetPreTrainedModel(PreTrainedModel): class XLNetPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and """
a simple interface for downloading and loading pretrained models. An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
""" """
config_class = XLNetConfig config_class = XLNetConfig
...@@ -598,8 +600,8 @@ class XLNetModelOutput(ModelOutput): ...@@ -598,8 +600,8 @@ class XLNetModelOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -634,8 +636,8 @@ class XLNetLMHeadModelOutput(ModelOutput): ...@@ -634,8 +636,8 @@ class XLNetLMHeadModelOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -668,8 +670,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput): ...@@ -668,8 +670,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -702,8 +704,8 @@ class XLNetForTokenClassificationOutput(ModelOutput): ...@@ -702,8 +704,8 @@ class XLNetForTokenClassificationOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -738,8 +740,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput): ...@@ -738,8 +740,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -774,8 +776,8 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): ...@@ -774,8 +776,8 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -796,13 +798,15 @@ class XLNetForQuestionAnsweringOutput(ModelOutput): ...@@ -796,13 +798,15 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
Args: Args:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. Classification loss as the sum of start token, end token (and is_impossible if provided) classification
losses.
start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top config.start_n_top start token possibilities (beam-search). Log probabilities for the top config.start_n_top start token possibilities (beam-search).
start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top config.start_n_top start token possibilities (beam-search). Indices for the top config.start_n_top start token possibilities (beam-search).
end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
(beam-search).
end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided): cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
...@@ -817,8 +821,8 @@ class XLNetForQuestionAnsweringOutput(ModelOutput): ...@@ -817,8 +821,8 @@ class XLNetForQuestionAnsweringOutput(ModelOutput):
Hidden-states of the model at the output of each layer plus the initial embedding outputs. Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`. sequence_length, sequence_length)`.
Attention weights after the attention softmax, used to compute the weighted average in the self-attention Attention weights after the attention softmax, used to compute the weighted average in the self-attention
heads. heads.
...@@ -841,14 +845,15 @@ XLNET_START_DOCSTRING = r""" ...@@ -841,14 +845,15 @@ XLNET_START_DOCSTRING = r"""
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings, methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
pruning heads etc.) pruning heads etc.)
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ subclass. This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
usage and behavior. general usage and behavior.
Parameters: Parameters:
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration. Initializing with a config file does not load the weights associated with the model, only the
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
weights.
""" """
XLNET_INPUTS_DOCSTRING = r""" XLNET_INPUTS_DOCSTRING = r"""
...@@ -856,14 +861,13 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -856,14 +861,13 @@ XLNET_INPUTS_DOCSTRING = r"""
input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.XLNetTokenizer`. Indices can be obtained using :class:`transformers.XLNetTokenizer`. See
See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
:func:`transformers.PreTrainedTokenizer.__call__` for details. details.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**, - 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**. - 0 for tokens that are **masked**.
...@@ -871,8 +875,8 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -871,8 +875,8 @@ XLNET_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`):
Contains pre-computed hidden-states (see :obj:`mems` output below). Can be used to speed up sequential Contains pre-computed hidden-states (see :obj:`mems` output below). Can be used to speed up sequential
decoding. The token ids which have their past given to this model should not be passed as decoding. The token ids which have their past given to this model should not be passed as :obj:`input_ids`
:obj:`input_ids` as they have already been computed. as they have already been computed.
:obj:`use_cache` has to be set to :obj:`True` to make use of :obj:`mems`. :obj:`use_cache` has to be set to :obj:`True` to make use of :obj:`mems`.
perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`): perm_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, sequence_length)`, `optional`):
...@@ -881,24 +885,23 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -881,24 +885,23 @@ XLNET_INPUTS_DOCSTRING = r"""
- if ``perm_mask[k, i, j] = 0``, i attend to j in batch k; - if ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
- if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k. - if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
If not set, each token attends to all the others (full bidirectional attention). If not set, each token attends to all the others (full bidirectional attention). Only used during
Only used during pretraining (to define factorization order) or for sequential decoding (generation). pretraining (to define factorization order) or for sequential decoding (generation).
target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`): target_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_predict, sequence_length)`, `optional`):
Mask to indicate the output tokens to use. Mask to indicate the output tokens to use. If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token. is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding
Only used during pretraining for partial prediction or for sequential decoding (generation). (generation).
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
Indices are selected in ``[0, 1]``: 1]``:
- 0 corresponds to a `sentence A` token, - 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token. - 1 corresponds to a `sentence B` token.
`What are token type IDs? <../glossary.html#token-type-ids>`__ `What are token type IDs? <../glossary.html#token-type-ids>`__
input_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): input_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`):
Mask to avoid performing attention on padding token indices. Mask to avoid performing attention on padding token indices. Negative of :obj:`attention_mask`, i.e. with 0
Negative of :obj:`attention_mask`, i.e. with 0 for real tokens and 1 for padding, which is kept for for real tokens and 1 for padding, which is kept for compatibility with the original code base.
compatibility with the original code base.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
...@@ -907,8 +910,7 @@ XLNET_INPUTS_DOCSTRING = r""" ...@@ -907,8 +910,7 @@ XLNET_INPUTS_DOCSTRING = r"""
You can only use one of :obj:`input_mask` and :obj:`attention_mask`. You can only use one of :obj:`input_mask` and :obj:`attention_mask`.
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**, - 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**. - 0 indicates the head is **masked**.
...@@ -1279,8 +1281,9 @@ class XLNetModel(XLNetPreTrainedModel): ...@@ -1279,8 +1281,9 @@ class XLNetModel(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a language modeling head on top """
(linear layer with weights tied to the input embeddings). """, XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetLMHeadModel(XLNetPreTrainedModel): class XLNetLMHeadModel(XLNetPreTrainedModel):
...@@ -1360,18 +1363,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1360,18 +1363,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`):
Labels for masked language modeling. Labels for masked language modeling. :obj:`num_predict` corresponds to :obj:`target_mapping.shape[1]`. If
:obj:`num_predict` corresponds to :obj:`target_mapping.shape[1]`. If :obj:`target_mapping` is :obj:`None`, :obj:`target_mapping` is :obj:`None`, then :obj:`num_predict` corresponds to :obj:`sequence_length`.
then :obj:`num_predict` corresponds to :obj:`sequence_length`.
The labels should correspond to the masked input words that should be predicted and depend on The labels should correspond to the masked input words that should be predicted and depend on
:obj:`target_mapping`. Note that in order to perform standard auto-regressive language modeling a :obj:`target_mapping`. Note that in order to perform standard auto-regressive language modeling a `<mask>` token
`<mask>` token has to be added to the :obj:`input_ids` (see the :obj:`prepare_inputs_for_generation` has to be added to the :obj:`input_ids` (see the :obj:`prepare_inputs_for_generation` function and examples
function and examples below). below).
Indices are selected in ``[-100, 0, ..., config.vocab_size]``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to ``-100`` are ignored, the
All labels set to ``-100`` are ignored, the loss is only loss is only computed for labels in ``[0, ..., config.vocab_size]``
computed for labels in ``[0, ..., config.vocab_size]``
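As a quick sketch of the setup this describes (standard auto-regressive use with an appended `<mask>` token; the checkpoint name and prompt are just examples)::

    import torch
    from transformers import XLNetLMHeadModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

    # Append a <mask> token whose value the model should predict.
    input_ids = torch.tensor(
        tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
    ).unsqueeze(0)

    # No token may attend to the last (masked) position ...
    perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
    perm_mask[:, :, -1] = 1.0
    # ... and we only request a prediction at that position, so num_predict == 1.
    target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
    target_mapping[0, 0, -1] = 1.0

    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
    next_token_logits = outputs[0]  # shape (1, 1, vocab_size)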
Return: Return:
...@@ -1447,8 +1448,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): ...@@ -1447,8 +1448,10 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a sequence classification/regression head on top (a linear layer on top of """
the pooled output) e.g. for GLUE tasks. """, XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g.
for GLUE tasks.
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForSequenceClassification(XLNetPreTrainedModel): class XLNetForSequenceClassification(XLNetPreTrainedModel):
...@@ -1488,9 +1491,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1488,9 +1491,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., config.num_labels - 1]``. config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -1540,8 +1542,10 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): ...@@ -1540,8 +1542,10 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a token classification head on top (a linear layer on top of """
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForTokenClassification(XLNetPreTrainedModel): class XLNetForTokenClassification(XLNetPreTrainedModel):
...@@ -1580,9 +1584,9 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1580,9 +1584,9 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Labels for computing the token classification loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., config.num_labels - 1]``. config.num_labels - 1]``.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
...@@ -1635,8 +1639,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): ...@@ -1635,8 +1639,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a multiple choice classification head on top (a linear layer on top of """
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """, XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RACE/SWAG tasks.
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForMultipleChoice(XLNetPreTrainedModel): class XLNetForMultipleChoice(XLNetPreTrainedModel):
...@@ -1675,9 +1681,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1675,9 +1681,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
): ):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
of the input tensors. (See :obj:`input_ids` above) :obj:`input_ids` above)
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
...@@ -1734,8 +1740,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1734,8 +1740,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of """
the hidden-states output to compute `span start logits` and `span end logits`). """, XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
...@@ -1776,12 +1784,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1776,12 +1784,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)
...@@ -1841,8 +1849,10 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1841,8 +1849,10 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
@add_start_docstrings( @add_start_docstrings(
"""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of """
the hidden-states output to compute `span start logits` and `span end logits`). """, XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
XLNET_START_DOCSTRING, XLNET_START_DOCSTRING,
) )
class XLNetForQuestionAnswering(XLNetPreTrainedModel): class XLNetForQuestionAnswering(XLNetPreTrainedModel):
...@@ -1884,19 +1894,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1884,19 +1894,20 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
r""" r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss. Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss. Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
Positions outside of the sequence are not taken into account for computing the loss. sequence are not taken into account for computing the loss.
is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): is_impossible (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels whether a question has an answer or no answer (SQuAD 2.0) Labels whether a question has an answer or no answer (SQuAD 2.0)
cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): cls_index (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
Labels for position (index) of the classification token to use as input for computing plausibility of the answer. Labels for position (index) of the classification token to use as input for computing plausibility of the
answer.
p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`): p_mask (``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be
1.0 means token should be masked. 0.0 means token is not masked. masked. 0.0 means token is not masked.
Returns: Returns:
......
...@@ -70,8 +70,8 @@ def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: in ...@@ -70,8 +70,8 @@ def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: in
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
""" """
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
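A minimal usage sketch of this schedule (``model`` is assumed to exist and ``compute_loss`` is a hypothetical helper)::

    from transformers import AdamW, get_linear_schedule_with_warmup

    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000
    )

    for step in range(1000):
        loss = compute_loss()   # hypothetical helper returning a scalar loss
        loss.backward()
        optimizer.step()
        scheduler.step()        # lr ramps up for 100 steps, then decays linearly to 0
        optimizer.zero_grad()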
Args: Args:
optimizer (:class:`~torch.optim.Optimizer`): optimizer (:class:`~torch.optim.Optimizer`):
...@@ -170,9 +170,8 @@ def get_polynomial_decay_schedule_with_warmup( ...@@ -170,9 +170,8 @@ def get_polynomial_decay_schedule_with_warmup(
optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1 optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
): ):
""" """
Create a schedule with a learning rate that decreases as a polynomial decay Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
from the initial lr set in the optimizer to the end lr defined by `lr_end`, optimizer to the end lr defined by `lr_end`, after a warmup period during which it increases linearly from 0 to the
after a warmup period during which it increases linearly from 0 to the
initial lr set in the optimizer. initial lr set in the optimizer.
Args: Args:
...@@ -189,8 +188,8 @@ def get_polynomial_decay_schedule_with_warmup( ...@@ -189,8 +188,8 @@ def get_polynomial_decay_schedule_with_warmup(
last_epoch (:obj:`int`, `optional`, defaults to -1): last_epoch (:obj:`int`, `optional`, defaults to -1):
The index of the last epoch when resuming training. The index of the last epoch when resuming training.
Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
based on the original BERT implementation at implementation at
https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37 https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
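As a sketch, the post-warmup rule this schedule implements and a typical call (``optimizer`` is assumed to exist; the values are illustrative)::

    # Roughly: lr(step) = lr_end + (lr_init - lr_end) * (1 - progress) ** power,
    # where progress goes from 0 to 1 between num_warmup_steps and num_training_steps.
    from transformers import get_polynomial_decay_schedule_with_warmup

    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=1000, lr_end=1e-7, power=2.0
    )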
Return: Return:
...@@ -218,8 +217,8 @@ def get_polynomial_decay_schedule_with_warmup( ...@@ -218,8 +217,8 @@ def get_polynomial_decay_schedule_with_warmup(
class AdamW(Optimizer): class AdamW(Optimizer):
""" """
Implements Adam algorithm with weight decay fix as introduced in Implements Adam algorithm with weight decay fix as introduced in `Decoupled Weight Decay Regularization
`Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`__. <https://arxiv.org/abs/1711.05101>`__.
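A common usage sketch (the parameter grouping below is a convention for skipping weight decay on biases and LayerNorm weights, not something AdamW itself requires; ``model`` is assumed to exist)::

    no_decay = ["bias", "LayerNorm.weight"]
    grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(grouped_parameters, lr=3e-5)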
Parameters: Parameters:
params (:obj:`Iterable[torch.nn.parameter.Parameter]`): params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
...@@ -320,12 +319,13 @@ class AdamW(Optimizer): ...@@ -320,12 +319,13 @@ class AdamW(Optimizer):
class Adafactor(Optimizer): class Adafactor(Optimizer):
""" """
AdaFactor pytorch implementation can be used as a drop-in replacement for Adam. AdaFactor pytorch implementation can be used as a drop-in replacement for Adam. Original fairseq code:
Original fairseq code: https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
Paper: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` https://arxiv.org/abs/1804.04235 Paper: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` https://arxiv.org/abs/1804.04235 Note that
Note that this optimizer internally adjusts the learning rate depending on the *scale_parameter*, *relative_step* and this optimizer internally adjusts the learning rate depending on the *scale_parameter*, *relative_step* and
*warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and `relative_step=False`. *warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
`relative_step=False`.
Arguments: Arguments:
params (:obj:`Iterable[torch.nn.parameter.Parameter]`): params (:obj:`Iterable[torch.nn.parameter.Parameter]`):
...@@ -352,6 +352,7 @@ class Adafactor(Optimizer): ...@@ -352,6 +352,7 @@ class Adafactor(Optimizer):
This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested it. This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested it.
Recommended T5 finetuning settings: Recommended T5 finetuning settings:
- Scheduled LR warm-up to fixed LR - Scheduled LR warm-up to fixed LR
- disable relative updates - disable relative updates
- use clip threshold: https://arxiv.org/abs/2004.14546 - use clip threshold: https://arxiv.org/abs/2004.14546
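For example, a sketch of the manual-schedule setup described above (``model`` is assumed to exist)::

    from transformers import Adafactor

    optimizer = Adafactor(
        model.parameters(),
        lr=1e-3,                 # only used because relative_step=False
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
    )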
...@@ -440,7 +441,9 @@ class Adafactor(Optimizer): ...@@ -440,7 +441,9 @@ class Adafactor(Optimizer):
return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0)) return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0))
def step(self, closure=None): def step(self, closure=None):
"""Performs a single optimization step. """
Performs a single optimization step.
Arguments: Arguments:
closure (callable, optional): A closure that reevaluates the model closure (callable, optional): A closure that reevaluates the model
and returns the loss. and returns the loss.
......
...@@ -153,8 +153,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): ...@@ -153,8 +153,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
""" """
Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
with the m and v parameters in strange ways as shown in with the m and v parameters in strange ways as shown in `Decoupled Weight Decay Regularization
`Decoupled Weight Decay Regularization <https://arxiv.org/abs/1711.05101>`__. <https://arxiv.org/abs/1711.05101>`__.
Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
to adding the square of the weights to the loss with plain (non-momentum) SGD. to adding the square of the weights to the loss with plain (non-momentum) SGD.
...@@ -169,8 +169,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): ...@@ -169,8 +169,8 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
epsilon (:obj:`float`, `optional`, defaults to 1e-7): epsilon (:obj:`float`, `optional`, defaults to 1e-7):
The epsilon parameter in Adam, which is a small constant for numerical stability. The epsilon parameter in Adam, which is a small constant for numerical stability.
amsgrad (:obj:`bool`, `optional`, defaults to `False`): amsgrad (:obj:`bool`, `optional`, defaults to `False`):
Whether to apply the AMSGrad variant of this algorithm or not, see Whether to apply the AMSGrad variant of this algorithm or not, see `On the Convergence of Adam and Beyond
`On the Convergence of Adam and Beyond <https://arxiv.org/abs/1904.09237>`__. <https://arxiv.org/abs/1904.09237>`__.
weight_decay_rate (:obj:`float`, `optional`, defaults to 0): weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
The weight decay to apply. The weight decay to apply.
include_in_weight_decay (:obj:`List[str]`, `optional`): include_in_weight_decay (:obj:`List[str]`, `optional`):
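A rough TensorFlow-side sketch with the documented arguments (``exclude_from_weight_decay`` is assumed to mirror ``include_in_weight_decay``)::

    from transformers import AdamWeightDecay

    optimizer = AdamWeightDecay(
        learning_rate=3e-5,
        weight_decay_rate=0.01,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
    )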
...@@ -280,11 +280,10 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): ...@@ -280,11 +280,10 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
# Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py # Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object): class GradientAccumulator(object):
"""Gradient accumulation utility. """
When used with a distribution strategy, the accumulator should be called in a Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
replica context. Gradients will be accumulated locally on each replica, replica context. Gradients will be accumulated locally on each replica, without synchronization. Users should
without synchronization. Users should then call ``.gradients``, scale the then call ``.gradients``, scale the gradients if required, and pass the result to ``apply_gradients``.
gradients if required, and pass the result to ``apply_gradients``.
""" """
# We use the ON_READ synchronization policy so that no synchronization is # We use the ON_READ synchronization policy so that no synchronization is
......
...@@ -128,7 +128,8 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_option ...@@ -128,7 +128,8 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_option
"pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet. "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.
task_options (:obj:`Any`, None) task_options (:obj:`Any`, None)
Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for a translation task. Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for a
translation task.
Returns Returns
...@@ -239,8 +240,9 @@ class DefaultArgumentHandler(ArgumentHandler): ...@@ -239,8 +240,9 @@ class DefaultArgumentHandler(ArgumentHandler):
class PipelineDataFormat: class PipelineDataFormat:
""" """
Base class for all the pipeline supported data format both for reading and writing. Base class for all the pipeline supported data format both for reading and writing. Supported data formats
Supported data formats currently include: currently include:
- JSON - JSON
- CSV - CSV
- stdin/stdout (pipe) - stdin/stdout (pipe)
...@@ -323,8 +325,8 @@ class PipelineDataFormat: ...@@ -323,8 +325,8 @@ class PipelineDataFormat:
overwrite=False, overwrite=False,
) -> "PipelineDataFormat": ) -> "PipelineDataFormat":
""" """
Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending on
on :obj:`format`. :obj:`format`.
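For example, a sketch of this factory (the file names are hypothetical)::

    from transformers.pipelines import PipelineDataFormat

    data_format = PipelineDataFormat.from_str(
        format="csv",
        output_path="predictions.csv",
        input_path="inputs.csv",
        column="sentence",
        overwrite=False,
    )
    for entry in data_format:   # each entry comes from one CSV row
        ...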
Args: Args:
format: (:obj:`str`): format: (:obj:`str`):
...@@ -440,8 +442,7 @@ class JsonPipelineDataFormat(PipelineDataFormat): ...@@ -440,8 +442,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
class PipedPipelineDataFormat(PipelineDataFormat): class PipedPipelineDataFormat(PipelineDataFormat):
""" """
Read data from piped input to the python process. Read data from piped input to the python process. For multi-column data, columns should be separated by \t
For multi-column data, columns should be separated by \t
If columns are provided, then the output will be a dictionary with {column_x: value_x} If columns are provided, then the output will be a dictionary with {column_x: value_x}
...@@ -517,16 +518,16 @@ PIPELINE_INIT_ARGS = r""" ...@@ -517,16 +518,16 @@ PIPELINE_INIT_ARGS = r"""
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed. must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified If no framework is specified, will default to the one currently installed. If no framework is specified and
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
model is provided. is provided.
task (:obj:`str`, defaults to :obj:`""`): task (:obj:`str`, defaults to :obj:`""`):
A task-identifier for the pipeline. A task-identifier for the pipeline.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`): args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters. Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to -1): device (:obj:`int`, `optional`, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
on the associated CUDA device id. the associated CUDA device id.
binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`): binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
Flag indicating if the output of the pipeline should happen in a binary format (i.e., pickle) or as raw text. Flag indicating if the output of the pipeline should happen in a binary format (i.e., pickle) or as raw text.
""" """
...@@ -538,8 +539,8 @@ class Pipeline(_ScikitCompat): ...@@ -538,8 +539,8 @@ class Pipeline(_ScikitCompat):
The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
different pipelines. different pipelines.
Base class implementing pipelined operations. Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
Pipeline workflow is defined as a sequence of the following operations: operations:
Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output
...@@ -691,10 +692,12 @@ class Pipeline(_ScikitCompat): ...@@ -691,10 +692,12 @@ class Pipeline(_ScikitCompat):
def _forward(self, inputs, return_tensors=False): def _forward(self, inputs, return_tensors=False):
""" """
Internal framework-specific forward dispatching. Internal framework-specific forward dispatching.
Args: Args:
inputs: dict holding all the keyword arguments required by the model forward method. inputs: dict holding all the keyword arguments required by the model forward method.
return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy arrays. return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy arrays.
Returns: Returns:
Numpy array Numpy array
""" """
...@@ -740,16 +743,16 @@ class FeatureExtractionPipeline(Pipeline): ...@@ -740,16 +743,16 @@ class FeatureExtractionPipeline(Pipeline):
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed. must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified If no framework is specified, will default to the one currently installed. If no framework is specified and
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
model is provided. is provided.
task (:obj:`str`, defaults to :obj:`""`): task (:obj:`str`, defaults to :obj:`""`):
A task-identifier for the pipeline. A task-identifier for the pipeline.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`): args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
Reference to the object in charge of parsing supplied pipeline parameters. Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to -1): device (:obj:`int`, `optional`, defaults to -1):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
on the associated CUDA device id. the associated CUDA device id.
""" """
def __init__( def __init__(
...@@ -796,25 +799,23 @@ class TextGenerationPipeline(Pipeline): ...@@ -796,25 +799,23 @@ class TextGenerationPipeline(Pipeline):
task identifier: :obj:`"text-generation"`. task identifier: :obj:`"text-generation"`.
The models that this pipeline can use are models that have been trained with an autoregressive language modeling The models that this pipeline can use are models that have been trained with an autoregressive language modeling
objective, which includes the uni-directional models in the library (e.g. gpt2). objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available
See the list of available community models on community models on `huggingface.co/models <https://huggingface.co/models?filter=causal-lm>`__.
`huggingface.co/models <https://huggingface.co/models?filter=causal-lm>`__.
""" """
# Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology # in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
XL_PREFIX = """In 1991, the remains of Russian Tsar Nicholas II and his family XL_PREFIX = """
(except for Alexei and Maria) are discovered. In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
remainder of the story. 1883 Western Siberia, Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
a young Grigori Rasputin is asked by his father and a group of men to perform magic. and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
Rasputin has a vision and denounces one of the men as a horse thief. Although his accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
father initially slaps him for making such an accusation, Rasputin watches as the the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of begging for his blessing. <eod> </s> <eos>
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, """
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
ALLOWED_MODELS = [ ALLOWED_MODELS = [
"XLNetLMHeadModel", "XLNetLMHeadModel",
...@@ -881,12 +882,11 @@ class TextGenerationPipeline(Pipeline): ...@@ -881,12 +882,11 @@ class TextGenerationPipeline(Pipeline):
prefix (:obj:`str`, `optional`): prefix (:obj:`str`, `optional`):
Prefix added to prompt. Prefix added to prompt.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text. - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
- **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
...@@ -985,19 +985,19 @@ class TextGenerationPipeline(Pipeline): ...@@ -985,19 +985,19 @@ class TextGenerationPipeline(Pipeline):
) )
class TextClassificationPipeline(Pipeline): class TextClassificationPipeline(Pipeline):
""" """
Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the `sequence classification
`sequence classification examples <../task_summary.html#sequence-classification>`__ for more information. examples <../task_summary.html#sequence-classification>`__ for more information.
This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative
sentiments). sentiments).
If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run a
a softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result. softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result.
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
See the up-to-date list of available models on the up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=text-classification>`__. <https://huggingface.co/models?filter=text-classification>`__.
""" """
def __init__(self, return_all_scores: bool = False, **kwargs): def __init__(self, return_all_scores: bool = False, **kwargs):
...@@ -1020,8 +1020,7 @@ class TextClassificationPipeline(Pipeline): ...@@ -1020,8 +1020,7 @@ class TextClassificationPipeline(Pipeline):
One or several texts (or one list of prompts) to classify. One or several texts (or one list of prompts) to classify.
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
following keys:
- **label** (:obj:`str`) -- The label predicted. - **label** (:obj:`str`) -- The label predicted.
- **score** (:obj:`float`) -- The corresponding probability. - **score** (:obj:`float`) -- The corresponding probability.
...@@ -1085,16 +1084,15 @@ class ZeroShotClassificationPipeline(Pipeline): ...@@ -1085,16 +1084,15 @@ class ZeroShotClassificationPipeline(Pipeline):
language inference) tasks. language inference) tasks.
Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the candidate
candidate label being valid. Any NLI model can be used as long as the first output logit corresponds to label being valid. Any NLI model can be used as long as the first output logit corresponds to `contradiction` and
`contradiction` and the last to `entailment`. the last to `entailment`.
This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task identifier:
task identifier: :obj:`"zero-shot-classification"`. :obj:`"zero-shot-classification"`.
The models that this pipeline can use are models that have been fine-tuned on an NLI task. The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list
See the up-to-date list of available models on of available models on `huggingface.co/models <https://huggingface.co/models?search=nli>`__.
`huggingface.co/models <https://huggingface.co/models?search=nli>`__.
""" """
def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs): def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
...@@ -1126,21 +1124,20 @@ class ZeroShotClassificationPipeline(Pipeline): ...@@ -1126,21 +1124,20 @@ class ZeroShotClassificationPipeline(Pipeline):
The set of possible class labels to classify each sequence into. Can be a single label, a string of The set of possible class labels to classify each sequence into. Can be a single label, a string of
comma-separated labels, or a list of labels. comma-separated labels, or a list of labels.
hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`): hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
The template used to turn each label into an NLI-style hypothesis. This template must include a {} The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
or similar syntax for the candidate label to be inserted into the template. For example, the default similar syntax for the candidate label to be inserted into the template. For example, the default
template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed
into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
default template works well in many cases, but it may be worthwhile to experiment with different default template works well in many cases, but it may be worthwhile to experiment with different
templates depending on the task setting. templates depending on the task setting.
multi_class (:obj:`bool`, `optional`, defaults to :obj:`False`): multi_class (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized such
such that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are considered
considered independent and probabilities are normalized for each candidate by doing a softmax of independent and probabilities are normalized for each candidate by doing a softmax of the entailment
the entailment score vs. the contradiction score. score vs. the contradiction score.
Return: Return:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **sequence** (:obj:`str`) -- The sequence for which this is the output. - **sequence** (:obj:`str`) -- The sequence for which this is the output.
- **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood. - **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
...@@ -1188,15 +1185,14 @@ class ZeroShotClassificationPipeline(Pipeline): ...@@ -1188,15 +1185,14 @@ class ZeroShotClassificationPipeline(Pipeline):
) )
class FillMaskPipeline(Pipeline): class FillMaskPipeline(Pipeline):
""" """
Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the `masked language modeling
`masked language modeling examples <../task_summary.html#masked-language-modeling>`__ for more information. examples <../task_summary.html#masked-language-modeling>`__ for more information.
This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"fill-mask"`. identifier: :obj:`"fill-mask"`.
The models that this pipeline can use are models that have been trained with a masked language modeling objective, The models that this pipeline can use are models that have been trained with a masked language modeling objective,
which includes the bi-directional models in the library. which includes the bi-directional models in the library. See the up-to-date list of available models on
See the up-to-date list of available models on
`huggingface.co/models <https://huggingface.co/models?filter=masked-lm>`__. `huggingface.co/models <https://huggingface.co/models?filter=masked-lm>`__.
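A minimal, hedged usage sketch of the mask filling behaviour described above (the sentence is illustrative; the mask token is read from the pipeline's own tokenizer rather than hard-coded)::

    from transformers import pipeline

    unmasker = pipeline("fill-mask")

    # Use the tokenizer's mask token so the example works whatever the underlying model expects.
    masked_sentence = f"Paris is the capital of {unmasker.tokenizer.mask_token}."
    for prediction in unmasker(masked_sentence):
        print(prediction["sequence"], round(prediction["score"], 4))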
.. note:: .. note::
...@@ -1262,14 +1258,13 @@ class FillMaskPipeline(Pipeline): ...@@ -1262,14 +1258,13 @@ class FillMaskPipeline(Pipeline):
One or several texts (or one list of prompts) with masked tokens. One or several texts (or one list of prompts) with masked tokens.
targets (:obj:`str` or :obj:`List[str]`, `optional`): targets (:obj:`str` or :obj:`List[str]`, `optional`):
When passed, the model will return the scores for the passed token or tokens rather than the top k When passed, the model will return the scores for the passed token or tokens rather than the top k
predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will be
be tokenized and the first resulting token will be used (with a warning). tokenized and the first resulting token will be used (with a warning).
top_k (:obj:`int`, `optional`): top_k (:obj:`int`, `optional`):
When passed, overrides the number of predictions to return. When passed, overrides the number of predictions to return.
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
following keys:
- **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction. - **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction.
- **score** (:obj:`float`) -- The corresponding probability. - **score** (:obj:`float`) -- The corresponding probability.
...@@ -1369,16 +1364,16 @@ class FillMaskPipeline(Pipeline): ...@@ -1369,16 +1364,16 @@ class FillMaskPipeline(Pipeline):
) )
class TokenClassificationPipeline(Pipeline): class TokenClassificationPipeline(Pipeline):
""" """
Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the `named entity recognition
`named entity recognition examples <../task_summary.html#named-entity-recognition>`__ for more information. examples <../task_summary.html#named-entity-recognition>`__ for more information.
This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location
or miscellaneous). or miscellaneous).
The models that this pipeline can use are models that have been fine-tuned on a token classification task. The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=token-classification>`__. <https://huggingface.co/models?filter=token-classification>`__.
""" """
default_input_names = "sequences" default_input_names = "sequences"
...@@ -1560,11 +1555,11 @@ NerPipeline = TokenClassificationPipeline ...@@ -1560,11 +1555,11 @@ NerPipeline = TokenClassificationPipeline
class QuestionAnsweringArgumentHandler(ArgumentHandler): class QuestionAnsweringArgumentHandler(ArgumentHandler):
""" """
QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
to internal :class:`~transformers.SquadExample`. internal :class:`~transformers.SquadExample`.
QuestionAnsweringArgumentHandler manages all the possible ways to create a :class:`~transformers.SquadExample` from QuestionAnsweringArgumentHandler manages all the possible ways to create a :class:`~transformers.SquadExample` from the
the command-line supplied arguments. command-line supplied arguments.
""" """
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
...@@ -1623,15 +1618,15 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler): ...@@ -1623,15 +1618,15 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
@add_end_docstrings(PIPELINE_INIT_ARGS) @add_end_docstrings(PIPELINE_INIT_ARGS)
class QuestionAnsweringPipeline(Pipeline): class QuestionAnsweringPipeline(Pipeline):
""" """
Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the `question answering examples
`question answering examples <../task_summary.html#question-answering>`__ for more information. <../task_summary.html#question-answering>`__ for more information.
This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
task identifier: :obj:`"question-answering"`. task identifier: :obj:`"question-answering"`.
The models that this pipeline can use are models that have been fine-tuned on a question answering task. The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=question-answering>`__. <https://huggingface.co/models?filter=question-answering>`__.
""" """
default_input_names = "question,context" default_input_names = "question,context"
...@@ -1666,9 +1661,8 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1666,9 +1661,8 @@ class QuestionAnsweringPipeline(Pipeline):
question: Union[str, List[str]], context: Union[str, List[str]] question: Union[str, List[str]], context: Union[str, List[str]]
) -> Union[SquadExample, List[SquadExample]]: ) -> Union[SquadExample, List[SquadExample]]:
""" """
QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally. QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally. This helper method
This helper method encapsulates all the logic for converting question(s) and context(s) to encapsulates all the logic for converting question(s) and context(s) to :class:`~transformers.SquadExample`.
:class:`~transformers.SquadExample`.
We currently support extractive question answering. We currently support extractive question answering.
...@@ -1677,8 +1671,8 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1677,8 +1671,8 @@ class QuestionAnsweringPipeline(Pipeline):
context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer. context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer.
Returns: Returns:
One or a list of :class:`~transformers.SquadExample`: The corresponding One or a list of :class:`~transformers.SquadExample`: The corresponding :class:`~transformers.SquadExample`
:class:`~transformers.SquadExample` grouping question and context. grouping question and context.
""" """
if isinstance(question, list): if isinstance(question, list):
return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
...@@ -1693,11 +1687,11 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1693,11 +1687,11 @@ class QuestionAnsweringPipeline(Pipeline):
args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`): args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`):
One or several :class:`~transformers.SquadExample` containing the question and context. One or several :class:`~transformers.SquadExample` containing the question and context.
X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`): X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
One or several :class:`~transformers.SquadExample` containing the question and context One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
(will be treated the same way as if passed as the first positional argument). the same way as if passed as the first positional argument).
data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`): data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
One or several :class:`~transformers.SquadExample` containing the question and context One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
(will be treated the same way as if passed as the first positional argument). the same way as if passed as the first positional argument).
question (:obj:`str` or :obj:`List[str]`): question (:obj:`str` or :obj:`List[str]`):
One or several question(s) (must be used in conjunction with the :obj:`context` argument). One or several question(s) (must be used in conjunction with the :obj:`context` argument).
context (:obj:`str` or :obj:`List[str]`): context (:obj:`str` or :obj:`List[str]`):
...@@ -1719,8 +1713,7 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1719,8 +1713,7 @@ class QuestionAnsweringPipeline(Pipeline):
Whether or not we accept impossible as an answer. Whether or not we accept impossible as an answer.
Return: Return:
A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **score** (:obj:`float`) -- The probability associated to the answer. - **score** (:obj:`float`) -- The probability associated to the answer.
- **start** (:obj:`int`) -- The start index of the answer (in the tokenized version of the input). - **start** (:obj:`int`) -- The start index of the answer (in the tokenized version of the input).
...@@ -1825,12 +1818,12 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1825,12 +1818,12 @@ class QuestionAnsweringPipeline(Pipeline):
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
""" """
Take the output of any :obj:`ModelForQuestionAnswering` and generate probabilities for each span to be Take the output of any :obj:`ModelForQuestionAnswering` and generate probabilities for each span to be the
the actual answer. actual answer.
In addition, it filters out some unwanted/impossible cases like answer len being greater than In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
max_answer_len or answer end position being before the starting position. answer end position being before the starting position. The method supports outputting the k-best answers through
The method supports outputting the k-best answers through the topk argument. the topk argument.
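The span scoring idea can be summarized with the following NumPy sketch (a simplified illustration of the approach described above, not the pipeline's exact implementation)::

    import numpy as np

    def decode_spans(start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int):
        # Score every candidate (start, end) pair with the product of its probabilities.
        candidates = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 0))
        # Keep only spans whose end is not before their start and whose length does not exceed max_answer_len.
        candidates = np.tril(np.triu(candidates), max_answer_len - 1)
        # Pick the k best scoring spans.
        best = np.argsort(-candidates.flatten())[:topk]
        starts, ends = np.unravel_index(best, candidates.shape)
        return starts, ends, candidates[starts, ends]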
Args: Args:
start (:obj:`np.ndarray`): Individual start probabilities for each token. start (:obj:`np.ndarray`): Individual start probabilities for each token.
...@@ -1866,8 +1859,7 @@ class QuestionAnsweringPipeline(Pipeline): ...@@ -1866,8 +1859,7 @@ class QuestionAnsweringPipeline(Pipeline):
def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]: def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
""" """
When decoding from token probabilities, this method maps token indexes to actual words in When decoding from token probabilities, this method maps token indexes to actual words in the initial context.
the initial context.
Args: Args:
text (:obj:`str`): The actual context to extract the answer from. text (:obj:`str`): The actual context to extract the answer from.
...@@ -1914,13 +1906,12 @@ class SummarizationPipeline(Pipeline): ...@@ -1914,13 +1906,12 @@ class SummarizationPipeline(Pipeline):
""" """
Summarize news articles and other documents. Summarize news articles and other documents.
This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"summarization"`. identifier: :obj:`"summarization"`.
The models that this pipeline can use are models that have been fine-tuned on a summarization task, The models that this pipeline can use are models that have been fine-tuned on a summarization task, which are
which are currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. currently '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. See the up-to-date
See the up-to-date list of available models on list of available models on `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.
`huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.
Usage:: Usage::
...@@ -1957,17 +1948,16 @@ class SummarizationPipeline(Pipeline): ...@@ -1957,17 +1948,16 @@ class SummarizationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding - **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding
input. input.
- **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) --
-- The token ids of the summary. The token ids of the summary.
""" """
assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True" assert return_tensors or return_text, "You must specify return_tensors=True or return_text=True"
assert len(documents) > 0, "Please provide a document to summarize" assert len(documents) > 0, "Please provide a document to summarize"
...@@ -2043,12 +2033,12 @@ class TranslationPipeline(Pipeline): ...@@ -2043,12 +2033,12 @@ class TranslationPipeline(Pipeline):
""" """
Translates from one language to another. Translates from one language to another.
This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"translation_xx_to_yy"`. identifier: :obj:`"translation_xx_to_yy"`.
The models that this pipeline can use are models that have been fine-tuned on a translation task. The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=translation>`__. <https://huggingface.co/models?filter=translation>`__.
Usage:: Usage::
en_fr_translator = pipeline("translation_en_to_fr") en_fr_translator = pipeline("translation_en_to_fr")
...@@ -2078,12 +2068,11 @@ class TranslationPipeline(Pipeline): ...@@ -2078,12 +2068,11 @@ class TranslationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation. - **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation.
- **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
...@@ -2153,12 +2142,11 @@ class Text2TextGenerationPipeline(Pipeline): ...@@ -2153,12 +2142,11 @@ class Text2TextGenerationPipeline(Pipeline):
""" """
Pipeline for text to text generation using seq2seq models. Pipeline for text to text generation using seq2seq models.
This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the
task identifier: :obj:`"text2text-generation"`. following task identifier: :obj:`"text2text-generation"`.
The models that this pipeline can use are models that have been fine-tuned on a translation task. The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models <https://huggingface.co/models?filter=seq2seq>`__.
`huggingface.co/models <https://huggingface.co/models?filter=seq2seq>`__.
Usage:: Usage::
...@@ -2191,12 +2179,11 @@ class Text2TextGenerationPipeline(Pipeline): ...@@ -2191,12 +2179,11 @@ class Text2TextGenerationPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Return: Return:
A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
following keys:
- **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text. - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
- **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
...@@ -2346,10 +2333,8 @@ class Conversation: ...@@ -2346,10 +2333,8 @@ class Conversation:
Return: Return:
:obj:`str`: :obj:`str`:
Example: Example: Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user >> Going to the movies tonight - any
Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 suggestions? bot >> The Big Lebowski
user >> Going to the movies tonight - any suggestions?
bot >> The Big Lebowski
""" """
output = "Conversation id: {} \n".format(self.uuid) output = "Conversation id: {} \n".format(self.uuid)
for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses): for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):
...@@ -2371,13 +2356,13 @@ class ConversationalPipeline(Pipeline): ...@@ -2371,13 +2356,13 @@ class ConversationalPipeline(Pipeline):
""" """
Multi-turn conversational pipeline. Multi-turn conversational pipeline.
This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
task identifier: :obj:`"conversational"`. identifier: :obj:`"conversational"`.
The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task, The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`. currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`. See the
See the up-to-date list of available models on up-to-date list of available models on `huggingface.co/models
`huggingface.co/models <https://huggingface.co/models?filter=conversational>`__. <https://huggingface.co/models?filter=conversational>`__.
Usage:: Usage::
...@@ -2419,8 +2404,8 @@ class ConversationalPipeline(Pipeline): ...@@ -2419,8 +2404,8 @@ class ConversationalPipeline(Pipeline):
clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`): clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to clean up the potential extra spaces in the text output. Whether or not to clean up the potential extra spaces in the text output.
generate_kwargs: generate_kwargs:
Additional keyword arguments to pass along to the generate method of the model (see the generate Additional keyword arguments to pass along to the generate method of the model (see the generate method
method corresponding to your framework `here <./model.html#generative-models>`__). corresponding to your framework `here <./model.html#generative-models>`__).
Returns: Returns:
:class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s) with :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s) with
...@@ -2506,8 +2491,9 @@ class ConversationalPipeline(Pipeline): ...@@ -2506,8 +2491,9 @@ class ConversationalPipeline(Pipeline):
""" """
Cleans the padding history. Padding may be generated in two places when multiple conversations are provided as Cleans the padding history. Padding may be generated in two places when multiple conversations are provided as
an input: an input:
- at the end of the concatenated history and new user input, so that all inputs to the model have the same - at the end of the concatenated history and new user input, so that all inputs to the model have the same
length length
- at the end of the generated response, as some responses will be longer than others - at the end of the generated response, as some responses will be longer than others
This method cleans up these padding tokens so that the history for each conversation is not impacted by the This method cleans up these padding tokens so that the history for each conversation is not impacted by the
batching process. batching process.
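The cleanup idea can be illustrated with a toy sketch (this is not the pipeline's internal code; it simply strips trailing padding ids from one history)::

    def strip_trailing_padding(token_ids, pad_token_id):
        # Drop padding tokens appended by batching so that only the real history remains.
        while token_ids and token_ids[-1] == pad_token_id:
            token_ids = token_ids[:-1]
        return token_ids

    print(strip_trailing_padding([5, 17, 42, 0, 0, 0], pad_token_id=0))  # -> [5, 17, 42]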
...@@ -2651,8 +2637,8 @@ SUPPORTED_TASKS = { ...@@ -2651,8 +2637,8 @@ SUPPORTED_TASKS = {
def check_task(task: str) -> Tuple[Dict, Any]: def check_task(task: str) -> Tuple[Dict, Any]:
""" """
Checks an incoming task string, to validate that it is correct and return the Checks an incoming task string, to validate that it is correct and return the default Pipeline and Model classes, and
default Pipeline and Model classes, and default models if they exist. default models if they exist.
Args: Args:
task (:obj:`str`): task (:obj:`str`):
...@@ -2670,9 +2656,8 @@ def check_task(task: str) -> Tuple[Dict, Any]: ...@@ -2670,9 +2656,8 @@ def check_task(task: str) -> Tuple[Dict, Any]:
- :obj:`"conversational"` - :obj:`"conversational"`
Returns: Returns:
(task_defaults:obj:`dict`, task_options: (:obj:`tuple`, None)) (task_defaults:obj:`dict`, task_options: (:obj:`tuple`, None)) The actual dictionary required to initialize
The actual dictionary required to initialize the pipeline and some the pipeline and some extra task options for parametrized tasks like "translation_XX_to_YY"
extra task options for parametrized tasks like "translation_XX_to_YY"
""" """
...@@ -2737,17 +2722,16 @@ def pipeline( ...@@ -2737,17 +2722,16 @@ def pipeline(
If not provided, the default for the :obj:`task` will be loaded. If not provided, the default for the :obj:`task` will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`): tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be a model The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
identifier or an actual pretrained tokenizer inheriting from identifier or an actual pretrained tokenizer inheriting from :class:`~transformers.PreTrainedTokenizer`.
:class:`~transformers.PreTrainedTokenizer`.
If not provided, the default for the :obj:`task` will be loaded. If not provided, the default for the :obj:`task` will be loaded.
framework (:obj:`str`, `optional`): framework (:obj:`str`, `optional`):
The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
must be installed. must be installed.
If no framework is specified, will default to the one currently installed. If no framework is specified If no framework is specified, will default to the one currently installed. If no framework is specified and
and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
model is provided. is provided.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`). Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
kwargs: kwargs:
......
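Before moving on to the next file, a hedged usage sketch of the factory arguments described above (the checkpoint identifier is only an example of a sequence classification model)::

    from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

    model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # example checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Explicitly select the PyTorch backend ("pt"), as described for the `framework` argument.
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt")
    print(classifier("A clear and well written documentation page."))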
...@@ -75,7 +75,8 @@ class Index: ...@@ -75,7 +75,8 @@ class Index:
Returns: Returns:
:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`: A tensor of indices of retrieved documents. :obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`: A tensor of indices of retrieved documents.
:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`: A tensor of vector representations of retrieved documents. :obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`: A tensor of vector representations of
retrieved documents.
""" """
raise NotImplementedError raise NotImplementedError
...@@ -87,16 +88,17 @@ class Index: ...@@ -87,16 +88,17 @@ class Index:
def init_index(self): def init_index(self):
""" """
A function responsible for loading the index into memory. Should be called only once per training run of a RAG model. A function responsible for loading the index into memory. Should be called only once per training run of a RAG
E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load the index. model. E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load
the index.
""" """
raise NotImplementedError raise NotImplementedError
class LegacyIndex(Index): class LegacyIndex(Index):
""" """
An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. We use
We use default faiss index parameters as specified in that repository. default faiss index parameters as specified in that repository.
Args: Args:
vector_size (:obj:`int`): vector_size (:obj:`int`):
...@@ -234,17 +236,20 @@ class HFIndexBase(Index): ...@@ -234,17 +236,20 @@ class HFIndexBase(Index):
class CanonicalHFIndex(HFIndexBase): class CanonicalHFIndex(HFIndexBase):
""" """
A wrapper around an instance of :class:`~datasets.Datasets`. If ``index_path`` is set to ``None``, A wrapper around an instance of :class:`~datasets.Datasets`. If ``index_path`` is set to ``None``, we load the
we load the pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from the indicated path on disk. pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from
the indicated path on disk.
Args: Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``): dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids with ``datasets.list_datasets()``). A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
with ``datasets.list_datasets()``).
dataset_split (:obj:`str`, optional, defaults to ``train``) dataset_split (:obj:`str`, optional, defaults to ``train``)
Which split of the ``dataset`` to load. Which split of the ``dataset`` to load.
index_name (:obj:`str`, optional, defaults to ``train``) index_name (:obj:`str`, optional, defaults to ``train``)
The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be saved under this name. The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be
saved under this name.
index_path (:obj:`str`, optional, defaults to ``None``) index_path (:obj:`str`, optional, defaults to ``None``)
The path to the serialized faiss index on disk. The path to the serialized faiss index on disk.
use_dummy_dataset (:obj:`bool`, optional, defaults to ``False``): If True, use the dummy configuration of the dataset for tests. use_dummy_dataset (:obj:`bool`, optional, defaults to ``False``): If True, use the dummy configuration of the dataset for tests.
...@@ -292,14 +297,14 @@ class CanonicalHFIndex(HFIndexBase): ...@@ -292,14 +297,14 @@ class CanonicalHFIndex(HFIndexBase):
class CustomHFIndex(HFIndexBase): class CustomHFIndex(HFIndexBase):
""" """
A wrapper around an instance of :class:`~datasets.Datasets`. A wrapper around an instance of :class:`~datasets.Datasets`. The dataset and the index are both loaded from the
The dataset and the index are both loaded from the indicated paths on disk. indicated paths on disk.
Args: Args:
vector_size (:obj:`int`): the dimension of the passages embeddings used by the index vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
dataset_path (:obj:`str`): dataset_path (:obj:`str`):
The path to the serialized dataset on disk. The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
The dataset should have 3 columns: title (str), text (str) and embeddings (arrays of dimension vector_size) embeddings (arrays of dimension vector_size)
index_path (:obj:`str`) index_path (:obj:`str`)
The path to the serialized faiss index on disk. The path to the serialized faiss index on disk.
""" """
...@@ -328,17 +333,17 @@ class CustomHFIndex(HFIndexBase): ...@@ -328,17 +333,17 @@ class CustomHFIndex(HFIndexBase):
class RagRetriever: class RagRetriever:
""" """
Retriever used to get documents from vector queries. Retriever used to get documents from vector queries. It retrieves the documents' embeddings as well as the documents'
It retrieves the documents' embeddings as well as the documents' contents, and it formats them to be used with a RagModel. contents, and it formats them to be used with a RagModel.
Args: Args:
config (:class:`~transformers.RagConfig`): config (:class:`~transformers.RagConfig`):
The configuration of the RAG model this Retriever is used with. Contains parameters indicating which ``Index`` to build. The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
You can load your own custom dataset with ``config.index_name="custom"`` or use a canonical one (default) from the datasets library ``Index`` to build. You can load your own custom dataset with ``config.index_name="custom"`` or use a
with ``config.index_name="wiki_dpr"`` for example. canonical one (default) from the datasets library with ``config.index_name="wiki_dpr"`` for example.
question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`): question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer that was used to tokenize the question. The tokenizer that was used to tokenize the question. It is used to decode the question and then use the
It is used to decode the question and then use the generator_tokenizer. generator_tokenizer.
generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`): generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
The tokenizer used for the generator part of the RagModel. The tokenizer used for the generator part of the RagModel.
index (:class:`~transformers.retrieval_rag.Index`, optional, defaults to the one defined by the configuration): index (:class:`~transformers.retrieval_rag.Index`, optional, defaults to the one defined by the configuration):
...@@ -470,8 +475,8 @@ class RagRetriever: ...@@ -470,8 +475,8 @@ class RagRetriever:
Prefix added at the beginning of each input, typically used with T5-based models. Prefix added at the beginning of each input, typically used with T5-based models.
Return: Return:
:obj:`tuple(tensors)`: :obj:`tuple(tensors)`: a tuple consisting of two elements: contextualized ``input_ids`` and a compatible
a tuple consisting of two elements: contextualized ``input_ids`` and a compatible ``attention_mask``. ``attention_mask``.
""" """
def cat_input_and_doc(doc_title, doc_text, input_string, prefix): def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
...@@ -542,11 +547,10 @@ class RagRetriever: ...@@ -542,11 +547,10 @@ class RagRetriever:
The number of docs retrieved per query. The number of docs retrieved per query.
Return: Return:
:obj:`Tuple[np.ndarray, np.ndarray, List[dict]]`: :obj:`Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:
A tuple with the following objects:
- **retrieved_doc_embeds** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`) -- The - **retrieved_doc_embeds** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`) -- The retrieval
retrieval embeddings of the retrieved docs per query. embeddings of the retrieved docs per query.
- **doc_ids** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`) -- The ids of the documents in the - **doc_ids** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`) -- The ids of the documents in the
index index
- **doc_dicts** (:obj:`List[dict]`): The :obj:`retrieved_doc_embeds` examples per query. - **doc_dicts** (:obj:`List[dict]`): The :obj:`retrieved_doc_embeds` examples per query.
...@@ -581,16 +585,18 @@ class RagRetriever: ...@@ -581,16 +585,18 @@ class RagRetriever:
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
Output: Returns: :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following
:class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: fields:
- **context_input_ids** -- List of token ids to be fed to a model. - **context_input_ids** -- List of token ids to be fed to a model.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
- **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
:obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`). - **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
(when :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
- **retrieved_doc_embeds** -- List of embeddings of the retrieved documents - **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
- **doc_ids** -- List of ids of the retrieved documents - **doc_ids** -- List of ids of the retrieved documents
""" """
......
...@@ -88,8 +88,8 @@ def is_pipeline_test(test_case): ...@@ -88,8 +88,8 @@ def is_pipeline_test(test_case):
""" """
Decorator marking a test as a pipeline test. Decorator marking a test as a pipeline test.
Pipeline tests are skipped by default and we can run only them by setting RUN_PIPELINE_TEST environment variable Pipeline tests are skipped by default and we can run only them by setting RUN_PIPELINE_TEST environment variable to
to a truthy value and selecting the is_pipeline_test pytest mark. a truthy value and selecting the is_pipeline_test pytest mark.
""" """
if not _run_pipeline_tests: if not _run_pipeline_tests:
...@@ -107,8 +107,7 @@ def slow(test_case): ...@@ -107,8 +107,7 @@ def slow(test_case):
""" """
Decorator marking a test as slow. Decorator marking a test as slow.
Slow tests are skipped by default. Set the RUN_SLOW environment variable Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them.
to a truthy value to run them.
""" """
if not _run_slow_tests: if not _run_slow_tests:
...@@ -121,9 +120,8 @@ def custom_tokenizers(test_case): ...@@ -121,9 +120,8 @@ def custom_tokenizers(test_case):
""" """
Decorator marking a test for a custom tokenizer. Decorator marking a test for a custom tokenizer.
Custom tokenizers require additional dependencies, and are skipped Custom tokenizers require additional dependencies, and are skipped by default. Set the RUN_CUSTOM_TOKENIZERS
by default. Set the RUN_CUSTOM_TOKENIZERS environment variable environment variable to a truthy value to run them.
to a truthy value to run them.
""" """
if not _run_custom_tokenizers: if not _run_custom_tokenizers:
return unittest.skip("test of custom tokenizers")(test_case) return unittest.skip("test of custom tokenizers")(test_case)
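A minimal, hedged usage sketch of the test decorators described above (the test bodies are illustrative)::

    import unittest

    from transformers.testing_utils import custom_tokenizers, slow

    class ExampleTests(unittest.TestCase):
        @slow
        def test_big_model(self):
            # Only runs when the RUN_SLOW environment variable is truthy.
            self.assertTrue(True)

        @custom_tokenizers
        def test_special_tokenizer(self):
            # Only runs when RUN_CUSTOM_TOKENIZERS is truthy.
            self.assertTrue(True)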
...@@ -201,8 +199,7 @@ def require_torch_multigpu(test_case): ...@@ -201,8 +199,7 @@ def require_torch_multigpu(test_case):
These tests are skipped on a machine without multiple GPUs. These tests are skipped on a machine without multiple GPUs.
To run *only* the multigpu tests, assuming all test names contain multigpu: To run *only* the multigpu tests, assuming all test names contain multigpu: $ pytest -sv ./tests -k "multigpu"
$ pytest -sv ./tests -k "multigpu"
""" """
if not _torch_available: if not _torch_available:
return unittest.skip("test requires PyTorch")(test_case) return unittest.skip("test requires PyTorch")(test_case)
...@@ -306,8 +303,8 @@ def get_tests_dir(append_path=None): ...@@ -306,8 +303,8 @@ def get_tests_dir(append_path=None):
append_path: optional path to append to the tests dir path append_path: optional path to append to the tests dir path
Return: Return:
The full path to the `tests` dir, so that the tests can be invoked from anywhere. The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
Optionally `append_path` is joined after the `tests` dir if the former is provided. joined after the `tests` dir if the former is provided.
""" """
# this function caller's __file__ # this function caller's __file__
...@@ -344,30 +341,29 @@ def assert_screenout(out, what): ...@@ -344,30 +341,29 @@ def assert_screenout(out, what):
class CaptureStd: class CaptureStd:
"""Context manager to capture: """
stdout, clean it up and make it available via obj.out Context manager to capture:
stderr, and make it available via obj.err stdout, clean it up and make it available via obj.out stderr, and make it available via obj.err
init arguments: init arguments: - out - capture stdout: True/False, default True - err - capture stderr: True/False, default
- out - capture stdout: True/False, default True True
- err - capture stderr: True/False, default True
Examples:: Examples::
with CaptureStdout() as cs: with CaptureStdout() as cs:
print("Secret message") print("Secret message")
print(f"captured: {cs.out}") print(f"captured: {cs.out}")
import sys import sys
with CaptureStderr() as cs: with CaptureStderr() as cs:
print("Warning: ", file=sys.stderr) print("Warning: ", file=sys.stderr)
print(f"captured: {cs.err}") print(f"captured: {cs.err}")
# to capture just one of the streams, but not the other # to capture just one of the streams, but not the other
with CaptureStd(err=False) as cs: with CaptureStd(err=False) as cs:
print("Secret message") print("Secret message")
print(f"captured: {cs.out}") print(f"captured: {cs.out}")
# but best use the stream-specific subclasses # but best use the stream-specific subclasses
""" """
...@@ -436,7 +432,8 @@ class CaptureStderr(CaptureStd): ...@@ -436,7 +432,8 @@ class CaptureStderr(CaptureStd):
class CaptureLogger: class CaptureLogger:
"""Context manager to capture `logging` streams """
Context manager to capture `logging` streams
Args: Args:
- logger: `logging` logger object - logger: `logging` logger object
...@@ -476,13 +473,12 @@ class CaptureLogger: ...@@ -476,13 +473,12 @@ class CaptureLogger:
class TestCasePlus(unittest.TestCase): class TestCasePlus(unittest.TestCase):
"""This class extends `unittest.TestCase` with additional features. """
This class extends `unittest.TestCase` with additional features.
Feature 1: Flexible auto-removable temp dirs which are guaranteed to get Feature 1: Flexible auto-removable temp dirs which are guaranteed to get removed at the end of test.
removed at the end of test.
In all the following scenarios the temp dir will be auto-removed at the end In all the following scenarios the temp dir will be auto-removed at the end of test, unless `after=False`.
of test, unless `after=False`.
# 1. create a unique temp dir, `tmp_dir` will contain the path to the created temp dir # 1. create a unique temp dir, `tmp_dir` will contain the path to the created temp dir
...@@ -491,38 +487,35 @@ class TestCasePlus(unittest.TestCase): ...@@ -491,38 +487,35 @@ class TestCasePlus(unittest.TestCase):
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir() tmp_dir = self.get_auto_remove_tmp_dir()
# 2. create a temp dir of my choice and delete it at the end - useful for debug when you want to # 2. create a temp dir of my choice and delete it at the end - useful for debug when you want to # monitor a
# monitor a specific directory specific directory
:: ::
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test") tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test")
# 3. create a temp dir of my choice and do not delete it at the end - useful for when you want # 3. create a temp dir of my choice and do not delete it at the end - useful for when you want # to look at the
# to look at the temp results temp results
:: ::
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", after=False) tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", after=False)
# 4. create a temp dir of my choice and ensure to delete it right away - useful for when you # 4. create a temp dir of my choice and ensure to delete it right away - useful for when you # disabled deletion in
# disabled deletion in the previous test run and want to make sure that the tmp dir is empty the previous test run and want to make sure that the tmp dir is empty # before the new test is run
# before the new test is run
:: ::
def test_whatever(self): def test_whatever(self):
tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", before=True) tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", before=True)
Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are
project repository checkout are allowed if an explicit `tmp_dir` is used, so allowed if an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem
that by mistake no `/tmp` or similar important part of the filesystem will will get nuked. i.e. please always pass paths that start with `./`
get nuked. i.e. please always pass paths that start with `./`
Note 2: Each test can register multiple temp dirs and they all will get Note 2: Each test can register multiple temp dirs and they all will get auto-removed, unless requested otherwise.
auto-removed, unless requested otherwise.
""" """
...@@ -540,8 +533,8 @@ class TestCasePlus(unittest.TestCase): ...@@ -540,8 +533,8 @@ class TestCasePlus(unittest.TestCase):
delete the tmp dir at the end of the test delete the tmp dir at the end of the test
Returns: Returns:
tmp_dir(:obj:`string`): tmp_dir(:obj:`string`): either the same value as passed via `tmp_dir` or the path to the auto-created tmp
either the same value as passed via `tmp_dir` or the path to the auto-created tmp dir dir
""" """
if tmp_dir is not None: if tmp_dir is not None:
# using provided path # using provided path
...@@ -577,11 +570,10 @@ class TestCasePlus(unittest.TestCase): ...@@ -577,11 +570,10 @@ class TestCasePlus(unittest.TestCase):
def mockenv(**kwargs): def mockenv(**kwargs):
"""this is a convenience wrapper, that allows this: """
this is a convenience wrapper, that allows this:
@mockenv(RUN_SLOW=True, USE_TF=False) @mockenv(RUN_SLOW=True, USE_TF=False) def test_something(): run_slow = os.getenv("RUN_SLOW", False) use_tf =
def test_something(): os.getenv("USE_TF", False)
run_slow = os.getenv("RUN_SLOW", False)
use_tf = os.getenv("USE_TF", False)
""" """
return unittest.mock.patch.dict(os.environ, kwargs) return unittest.mock.patch.dict(os.environ, kwargs)
...@@ -78,35 +78,33 @@ class AlbertTokenizer(PreTrainedTokenizer): ...@@ -78,35 +78,33 @@ class AlbertTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -224,9 +222,8 @@ class AlbertTokenizer(PreTrainedTokenizer): ...@@ -224,9 +222,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An ALBERT sequence has the following format:
An ALBERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
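Purely to illustrate the layout above (the ``tokenizer`` instance and the raw ids are placeholders, not part of this diff):

::

    # token_ids_0 = [10, 11], token_ids_1 = [20, 21]
    ids = tokenizer.build_inputs_with_special_tokens([10, 11], [20, 21])
    # expected layout: [cls_id, 10, 11, sep_id, 20, 21, sep_id]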
...@@ -281,8 +278,8 @@ class AlbertTokenizer(PreTrainedTokenizer): ...@@ -281,8 +278,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
An ALBERT sequence pair mask has the following format: sequence pair mask has the following format:
:: ::
......
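For the pair mask documented just above, a hedged sketch (the method name ``create_token_type_ids_from_sequences`` and the ids are assumed for illustration):

::

    mask = tokenizer.create_token_type_ids_from_sequences([10, 11], [20, 21])
    # first sequence and its separators -> 0, second sequence -> 1
    # e.g. [0, 0, 0, 0, 1, 1, 1]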
...@@ -71,10 +71,11 @@ SPIECE_UNDERLINE = "▁" ...@@ -71,10 +71,11 @@ SPIECE_UNDERLINE = "▁"
class AlbertTokenizerFast(PreTrainedTokenizerFast): class AlbertTokenizerFast(PreTrainedTokenizerFast):
""" """
Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
`SentencePiece <https://github.com/google/sentencepiece>`__. <https://github.com/google/sentencepiece>`__. This tokenizer inherits from
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this
methods. Users should refer to this superclass for more information regarding those methods. superclass for more information regarding those methods.
Args: Args:
vocab_file (:obj:`str`): vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
...@@ -87,31 +88,26 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): ...@@ -87,31 +88,26 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Whether or not to keep accents when tokenizing. Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
...@@ -162,9 +158,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): ...@@ -162,9 +158,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An ALBERT sequence has the following format:
An ALBERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
...@@ -219,8 +214,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): ...@@ -219,8 +214,8 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
An ALBERT sequence pair mask has the following format: sequence pair mask has the following format:
:: ::
......
...@@ -221,8 +221,8 @@ SLOW_TOKENIZER_MAPPING = { ...@@ -221,8 +221,8 @@ SLOW_TOKENIZER_MAPPING = {
class AutoTokenizer: class AutoTokenizer:
r""" r"""
This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
when created with the :meth:`AutoTokenizer.from_pretrained` class method. created with the :meth:`AutoTokenizer.from_pretrained` class method.
This class cannot be instantiated directly using ``__init__()`` (throws an error). This class cannot be instantiated directly using ``__init__()`` (throws an error).
""" """
...@@ -257,8 +257,8 @@ class AutoTokenizer: ...@@ -257,8 +257,8 @@ class AutoTokenizer:
using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
``./my_model_directory/``. ``./my_model_directory/``.
- A path or url to a single saved vocabulary file if and only if the tokenizer only requires a - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. (Not
(Not applicable to all derived classes) applicable to all derived classes)
inputs (additional positional arguments, `optional`): inputs (additional positional arguments, `optional`):
Will be passed along to the Tokenizer ``__init__()`` method. Will be passed along to the Tokenizer ``__init__()`` method.
config (:class:`~transformers.PreTrainedConfig`, `optional`): config (:class:`~transformers.PreTrainedConfig`, `optional`):
...@@ -273,9 +273,8 @@ class AutoTokenizer: ...@@ -273,9 +273,8 @@ class AutoTokenizer:
Whether or not to delete incompletely received files. Will attempt to resume the download if such a Whether or not to delete incompletely received files. Will attempt to resume the download if such a
file exists. file exists.
proxies (:obj:`Dict[str, str]`, `optional`): proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g., A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
request.
use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`): use_fast (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to try to load the fast version of the tokenizer. Whether or not to try to load the fast version of the tokenizer.
kwargs (additional keyword arguments, `optional`): kwargs (additional keyword arguments, `optional`):
......
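A short usage sketch of the class method documented above (the checkpoint name is just a common example):

::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    encoded = tokenizer("Hello world", return_tensors="pt")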
...@@ -44,8 +44,8 @@ class BartTokenizer(RobertaTokenizer): ...@@ -44,8 +44,8 @@ class BartTokenizer(RobertaTokenizer):
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
the initialization parameters and other methods. initialization parameters and other methods.
""" """
# merges and vocab same as Roberta # merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models} max_model_input_sizes = {m: 1024 for m in _all_bart_models}
...@@ -75,13 +75,13 @@ class BartTokenizer(RobertaTokenizer): ...@@ -75,13 +75,13 @@ class BartTokenizer(RobertaTokenizer):
tgt_texts: (:obj:`List[str]`, `optional`): tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts. List of summaries or target language texts.
max_length (:obj:`int`, `optional`): max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts). Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
length is required by one of the truncation/padding parameters. If the model has no specific maximum is required by one of the truncation/padding parameters. If the model has no specific maximum input
input length (like XLNet) truncation/padding to a maximum length will be deactivated. length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`): max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries). Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
If left unset or set to :obj:`None`, this will use the max_length value. set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
...@@ -122,8 +122,8 @@ class BartTokenizer(RobertaTokenizer): ...@@ -122,8 +122,8 @@ class BartTokenizer(RobertaTokenizer):
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts - **labels** -- List of token ids for tgt_texts
The full set of keys ``[input_ids, attention_mask, labels]`` The full set of keys ``[input_ids, attention_mask, labels]`` will only be returned if tgt_texts is passed.
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. Otherwise, input_ids, attention_mask will be the only keys.
""" """
kwargs.pop("src_lang", None) kwargs.pop("src_lang", None)
kwargs.pop("tgt_lang", None) kwargs.pop("tgt_lang", None)
......
...@@ -70,13 +70,13 @@ class BartTokenizerFast(RobertaTokenizerFast): ...@@ -70,13 +70,13 @@ class BartTokenizerFast(RobertaTokenizerFast):
tgt_texts: (:obj:`List[str]`, `optional`): tgt_texts: (:obj:`List[str]`, `optional`):
List of summaries or target language texts. List of summaries or target language texts.
max_length (:obj:`int`, `optional`): max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts). Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
length is required by one of the truncation/padding parameters. If the model has no specific maximum is required by one of the truncation/padding parameters. If the model has no specific maximum input
input length (like XLNet) truncation/padding to a maximum length will be deactivated. length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`): max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries). Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
If left unset or set to :obj:`None`, this will use the max_length value. set to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
...@@ -116,11 +116,11 @@ class BartTokenizerFast(RobertaTokenizerFast): ...@@ -116,11 +116,11 @@ class BartTokenizerFast(RobertaTokenizerFast):
- **input_ids** -- List of token ids to be fed to the encoder. - **input_ids** -- List of token ids to be fed to the encoder.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **decoder_input_ids** -- List of token ids to be fed to the decoder. - **decoder_input_ids** -- List of token ids to be fed to the decoder.
- **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the decoder. - **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the
This does not include causal mask, which is built by the model. decoder. This does not include causal mask, which is built by the model.
The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]`` The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]`` will only
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys.
""" """
if max_length is None: if max_length is None:
max_length = self.model_max_length max_length = self.model_max_length
......
...@@ -135,15 +135,14 @@ class BertTokenizer(PreTrainedTokenizer): ...@@ -135,15 +135,14 @@ class BertTokenizer(PreTrainedTokenizer):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
...@@ -250,9 +249,8 @@ class BertTokenizer(PreTrainedTokenizer): ...@@ -250,9 +249,8 @@ class BertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. A BERT sequence has the following format:
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
...@@ -307,8 +305,8 @@ class BertTokenizer(PreTrainedTokenizer): ...@@ -307,8 +305,8 @@ class BertTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
A BERT sequence pair mask has the following format: pair mask has the following format:
:: ::
...@@ -383,14 +381,14 @@ class BasicTokenizer(object): ...@@ -383,14 +381,14 @@ class BasicTokenizer(object):
self.strip_accents = strip_accents self.strip_accents = strip_accents
def tokenize(self, text, never_split=None): def tokenize(self, text, never_split=None):
"""Basic Tokenization of a piece of text. """
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
WordPieceTokenizer.
Args: Args:
**never_split**: (`optional`) list of str **never_split**: (`optional`) list of str
Kept for backward compatibility purposes. Kept for backward compatibility purposes. Now implemented directly at the base class level (see
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
List of tokens not to split.
""" """
# union() returns a new set by concatenating the two sets. # union() returns a new set by concatenating the two sets.
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
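A small sketch of the behaviour described above (output assumes the default options with lower-casing enabled; punctuation is split into separate tokens):

::

    basic = BasicTokenizer(do_lower_case=True)
    basic.tokenize("Hello, World!")  # ['hello', ',', 'world', '!']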
...@@ -512,14 +510,11 @@ class WordpieceTokenizer(object): ...@@ -512,14 +510,11 @@ class WordpieceTokenizer(object):
self.max_input_chars_per_word = max_input_chars_per_word self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text): def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces. """
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
This uses a greedy longest-match-first algorithm to perform tokenization tokenization using the given vocabulary.
using the given vocabulary.
For example: For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`.
input = "unaffable"
output = ["un", "##aff", "##able"]
Args: Args:
text: A single token or whitespace separated tokens. This should have text: A single token or whitespace separated tokens. This should have
......
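A hedged sketch of the greedy longest-match-first lookup (the tiny vocabulary is invented purely for illustration; any container supporting ``in`` membership should do):

::

    vocab = {"un", "##aff", "##able", "[UNK]"}
    wordpiece = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
    wordpiece.tokenize("unaffable")  # ['un', '##aff', '##able']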
...@@ -130,25 +130,23 @@ class BertTokenizerFast(PreTrainedTokenizerFast): ...@@ -130,25 +130,23 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to clean the text before tokenization by removing any control characters and Whether or not to clean the text before tokenization by removing any control characters and replacing all
replacing all whitespaces by the classic one. whitespaces by the classic one.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to tokenize Chinese characters. Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
This should likely be deactivated for Japanese (see `this issue issue <https://github.com/huggingface/transformers/issues/328>`__).
<https://github.com/huggingface/transformers/issues/328>`__).
strip_accents: (:obj:`bool`, `optional`): strip_accents: (:obj:`bool`, `optional`):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for :obj:`lowercase` (as in the original BERT). value for :obj:`lowercase` (as in the original BERT).
...@@ -204,9 +202,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast): ...@@ -204,9 +202,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. A BERT sequence has the following format:
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]`` - single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]`` - pair of sequences: ``[CLS] A [SEP] B [SEP]``
...@@ -231,8 +228,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast): ...@@ -231,8 +228,8 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
A BERT sequence pair mask has the following format: pair mask has the following format:
:: ::
......
...@@ -94,13 +94,13 @@ class BertJapaneseTokenizer(BertTokenizer): ...@@ -94,13 +94,13 @@ class BertJapaneseTokenizer(BertTokenizer):
mecab_kwargs=None, mecab_kwargs=None,
**kwargs **kwargs
): ):
"""Constructs a MecabBertTokenizer. """
Constructs a MecabBertTokenizer.
Args: Args:
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file. **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
**do_lower_case**: (`optional`) boolean (default True) **do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input. Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
Only has an effect when do_basic_tokenize=True.
**do_word_tokenize**: (`optional`) boolean (default True) **do_word_tokenize**: (`optional`) boolean (default True)
Whether to do word tokenization. Whether to do word tokenization.
**do_subword_tokenize**: (`optional`) boolean (default True) **do_subword_tokenize**: (`optional`) boolean (default True)
...@@ -205,20 +205,20 @@ class MecabTokenizer: ...@@ -205,20 +205,20 @@ class MecabTokenizer:
mecab_dic: Optional[str] = "ipadic", mecab_dic: Optional[str] = "ipadic",
mecab_option: Optional[str] = None, mecab_option: Optional[str] = None,
): ):
"""Constructs a MecabTokenizer. """
Constructs a MecabTokenizer.
Args: Args:
**do_lower_case**: (`optional`) boolean (default True) **do_lower_case**: (`optional`) boolean (default True)
Whether to lowercase the input. Whether to lowercase the input.
**never_split**: (`optional`) list of str **never_split**: (`optional`) list of str
Kept for backward compatibility purposes. Kept for backward compatibility purposes. Now implemented directly at the base class level (see
Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
List of tokens not to split.
**normalize_text**: (`optional`) boolean (default True) **normalize_text**: (`optional`) boolean (default True)
Whether to apply unicode normalization to text before tokenization. Whether to apply unicode normalization to text before tokenization.
**mecab_dic**: (`optional`) string (default "ipadic") **mecab_dic**: (`optional`) string (default "ipadic")
Name of dictionary to be used for MeCab initialization. Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
If you are using a system-installed dictionary, set this option to `None` and modify `mecab_option`. set this option to `None` and modify `mecab_option`.
**mecab_option**: (`optional`) string **mecab_option**: (`optional`) string
String passed to MeCab constructor. String passed to MeCab constructor.
""" """
...@@ -306,7 +306,8 @@ class CharacterTokenizer: ...@@ -306,7 +306,8 @@ class CharacterTokenizer:
"""Runs Character tokenziation.""" """Runs Character tokenziation."""
def __init__(self, vocab, unk_token, normalize_text=True): def __init__(self, vocab, unk_token, normalize_text=True):
"""Constructs a CharacterTokenizer. """
Constructs a CharacterTokenizer.
Args: Args:
**vocab**: **vocab**:
...@@ -321,14 +322,15 @@ class CharacterTokenizer: ...@@ -321,14 +322,15 @@ class CharacterTokenizer:
self.normalize_text = normalize_text self.normalize_text = normalize_text
def tokenize(self, text): def tokenize(self, text):
"""Tokenizes a piece of text into characters. """
Tokenizes a piece of text into characters.
For example, :obj:`input = "apple"` will return as output :obj:`["a", "p", "p", "l", "e"]`.
For example:
input = "apple"
output = ["a", "p", "p", "l", "e"]
Args: Args:
text: A single token or whitespace separated tokens. text: A single token or whitespace separated tokens.
This should have already been passed through `BasicTokenizer`. This should have already been passed through `BasicTokenizer`.
Returns: Returns:
A list of characters. A list of characters.
""" """
......
...@@ -50,7 +50,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { ...@@ -50,7 +50,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
def get_pairs(word): def get_pairs(word):
"""Return set of symbol pairs in a word. """
Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings). Word is represented as tuple of symbols (symbols being variable-length strings).
""" """
...@@ -83,23 +84,22 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -83,23 +84,22 @@ class BertweetTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
...@@ -178,9 +178,8 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -178,9 +178,8 @@ class BertweetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. A BERTweet sequence has the following format:
A BERTweet sequence has the following format:
- single sequence: ``<s> X </s>`` - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>`` - pair of sequences: ``<s> A </s></s> B </s>``
...@@ -236,8 +235,8 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -236,8 +235,8 @@ class BertweetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
BERTweet does not make use of token type ids, therefore a list of zeros is returned. not make use of token type ids, therefore a list of zeros is returned.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
...@@ -411,8 +410,7 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -411,8 +410,7 @@ class BertweetTokenizer(PreTrainedTokenizer):
def add_from_file(self, f): def add_from_file(self, f):
""" """
Loads a pre-existing dictionary from a text file and adds its symbols Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
to this instance.
""" """
if isinstance(f, str): if isinstance(f, str):
try: try:
...@@ -446,23 +444,17 @@ class BertweetTokenizer(PreTrainedTokenizer): ...@@ -446,23 +444,17 @@ class BertweetTokenizer(PreTrainedTokenizer):
""" """
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:
domains and tasks. The basic logic is this:
1. The tuple regex_strings defines a list of regular expression 1. The tuple regex_strings defines a list of regular expression strings.
strings.
2. The regex_strings strings are put, in order, into a compiled 2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re.
regular expression object called word_re.
3. The tokenization is done by word_re.findall(s), where s is the 3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of
user-supplied string, inside the tokenize() method of the class the class Tokenizer.
Tokenizer.
4. When instantiating Tokenizer objects, there is a single option: 4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
preserve_case. By default, it is set to True. If it is set to is set to False, then the tokenizer will downcase everything except for emoticons.
False, then the tokenizer will downcase everything except for
emoticons.
""" """
...@@ -582,6 +574,7 @@ REGEXPS = ( ...@@ -582,6 +574,7 @@ REGEXPS = (
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""", r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
# email addresses # email addresses
r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""", r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
# docstyle-ignore
# Remaining word types: # Remaining word types:
r""" r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes. (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
...@@ -627,28 +620,24 @@ def _str_to_unicode(text, encoding=None, errors="strict"): ...@@ -627,28 +620,24 @@ def _str_to_unicode(text, encoding=None, errors="strict"):
def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"): def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
""" """
Remove entities from text by converting them to their Remove entities from text by converting them to their corresponding unicode character.
corresponding unicode character.
Args: Args:
text: text:
A unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8'). A unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8').
keep (list): keep (list):
List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``) List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and
and named entities (such as ``&nbsp;`` or ``&gt;``). ``&#hhhh;``) and named entities (such as ``&nbsp;`` or ``&gt;``).
remove_illegal (bool): remove_illegal (bool):
If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are kept "as is". If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
kept "as is".
Returns: A unicode string with the entities removed. Returns: A unicode string with the entities removed.
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
>>> from nltk.tokenize.casual import _replace_html_entities >>> from nltk.tokenize.casual import _replace_html_entities
>>> _replace_html_entities(b'Price: &pound;100') >>> _replace_html_entities(b'Price: &pound;100')
'Price: \\xa3100' 'Price: \\xa3100'
>>> print(_replace_html_entities(b'Price: &pound;100')) >>> print(_replace_html_entities(b'Price: &pound;100'))
Price: £100 Price: £100
>>> >>>
""" """
def _convert_entity(match): def _convert_entity(match):
...@@ -714,8 +703,8 @@ class TweetTokenizer: ...@@ -714,8 +703,8 @@ class TweetTokenizer:
Args: Args:
text: str text: str
Returns: list(str) Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
A tokenized list of strings; concatenating this list returns the original string if `preserve_case=False` `preserve_case=False`
""" """
# Fix HTML character entities: # Fix HTML character entities:
text = _replace_html_entities(text) text = _replace_html_entities(text)
...@@ -742,8 +731,7 @@ class TweetTokenizer: ...@@ -742,8 +731,7 @@ class TweetTokenizer:
def reduce_lengthening(text): def reduce_lengthening(text):
""" """
Replace repeated character sequences of length 3 or greater with sequences Replace repeated character sequences of length 3 or greater with sequences of length 3.
of length 3.
""" """
pattern = regex.compile(r"(.)\1{2,}") pattern = regex.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text) return pattern.sub(r"\1\1\1", text)
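Sketch of the substitution's effect (assuming the function is module-level and called directly, as in the NLTK original it is adapted from):

::

    reduce_lengthening("heyyyyyy")   # 'heyyy'  -- runs of 3+ identical characters collapse to exactly 3
    reduce_lengthening("waaaaayyy")  # 'waaayyy'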
......