Unverified commit 08f534d2, authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Styling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -70,8 +70,9 @@ class PegasusTokenizerFast(ReformerTokenizerFast):
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """
        Build model inputs from a sequence by adding eos to the end. no bos token is added to the front.

        - single sequence: ``X </s>``
        - pair of sequences: ``A B </s>`` (not intended use)

        Args:
            token_ids_0 (:obj:`List[int]`):
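
For illustration, a minimal sketch of the single-sequence format documented above (the checkpoint name is an
assumption, not part of this diff; only the eos token is appended):

    >>> from transformers import PegasusTokenizerFast
    >>> tok = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")  # assumed checkpoint
    >>> ids = tok.build_inputs_with_special_tokens(tok.convert_tokens_to_ids(tok.tokenize("Hello world")))
    >>> tok.convert_ids_to_tokens(ids)[-1]  # per the docstring, the sequence ends with </s>
    '</s>'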
...
@@ -50,7 +50,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
@@ -68,8 +69,8 @@ class PhobertTokenizer(PreTrainedTokenizer):
    """
    Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
@@ -81,23 +82,22 @@ class PhobertTokenizer(PreTrainedTokenizer):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning of
                sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
@@ -159,9 +159,8 @@ class PhobertTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A PhoBERT sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``
@@ -217,8 +216,8 @@ class PhobertTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PhoBERT does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
@@ -338,8 +337,7 @@ class PhobertTokenizer(PreTrainedTokenizer):
    def add_from_file(self, f):
        """
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        """
        if isinstance(f, str):
            try:
...
@@ -73,18 +73,18 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        x_sep_token (:obj:`str`, `optional`, defaults to :obj:`"[X_SEP]"`):
            Special second separator token, which can be generated by
            :class:`~transformers.ProphetNetForConditionalGeneration`. It is used to separate bullet-point like
            sentences in summarization, *e.g.*.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
@@ -218,8 +218,8 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet
        sequence pair mask has the following format:

        ::
@@ -267,9 +267,8 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
...
@@ -70,8 +70,8 @@ class ReformerTokenizer(PreTrainedTokenizer):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
...
@@ -79,8 +79,8 @@ class ReformerTokenizerFast(PreTrainedTokenizerFast):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
...
@@ -79,7 +79,8 @@ class RobertaTokenizer(GPT2Tokenizer):
    .. note::

        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
        one).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.
@@ -97,23 +98,22 @@ class RobertaTokenizer(GPT2Tokenizer):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning of
                sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
@@ -176,9 +176,8 @@ class RobertaTokenizer(GPT2Tokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A RoBERTa sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``
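
For illustration, a minimal sketch of the two formats above (checkpoint name assumed; the ids 10, 11 and 12 are
arbitrary placeholders):

    >>> from transformers import RobertaTokenizer
    >>> tok = RobertaTokenizer.from_pretrained("roberta-base")  # assumed checkpoint
    >>> tok.build_inputs_with_special_tokens([10, 11]) == [tok.cls_token_id, 10, 11, tok.sep_token_id]
    True
    >>> # pair of sequences: <s> A </s></s> B </s>
    >>> pair = tok.build_inputs_with_special_tokens([10, 11], [12])
    >>> pair == [tok.cls_token_id, 10, 11, tok.sep_token_id, tok.sep_token_id, 12, tok.sep_token_id]
    True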
@@ -232,8 +231,8 @@ class RobertaTokenizer(GPT2Tokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
@@ -242,7 +241,7 @@ class RobertaTokenizer(GPT2Tokenizer):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
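
Continuing the sketch above with the same (assumed) `tok`, the returned mask should be all zeros, one per position of
the built pair:

    >>> tok.create_token_type_ids_from_sequences([10, 11], [12])
    [0, 0, 0, 0, 0, 0, 0]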
...
@@ -103,23 +103,22 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning of
                sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
@@ -184,8 +183,8 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
@@ -194,7 +193,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
...
@@ -46,7 +46,7 @@ PRETRAINED_INIT_CONFIGURATION = {
class SqueezeBertTokenizer(BertTokenizer):
    r"""
    Constructs a SqueezeBert tokenizer.

    :class:`~transformers.SqueezeBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
    end-to-end tokenization: punctuation splitting + wordpiece.
...
@@ -52,10 +52,10 @@ PRETRAINED_INIT_CONFIGURATION = {
class SqueezeBertTokenizerFast(BertTokenizerFast):
    r"""
    Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
    end-to-end tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
    parameters.
...
@@ -79,18 +79,18 @@ class T5Tokenizer(PreTrainedTokenizer):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (:obj:`int`, `optional`, defaults to 100):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the
            vocabulary, like in T5 preprocessing; see `here
            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer.
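
A hedged sketch of the sentinel tokens described under ``extra_ids`` above (checkpoint name assumed; with the default
``extra_ids=100``, ``<extra_id_0>`` should be the last entry of the vocabulary):

    >>> from transformers import T5Tokenizer
    >>> tok = T5Tokenizer.from_pretrained("t5-small")  # assumed checkpoint
    >>> tok.convert_tokens_to_ids("<extra_id_0>") == tok.vocab_size - 1
    True
    >>> "<extra_id_99>" in tok.additional_special_tokens
    True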
@@ -191,9 +191,8 @@ class T5Tokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: ``X </s>``
        - pair of sequences: ``A </s> B </s>``
...
@@ -90,18 +90,18 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (:obj:`int`, `optional`, defaults to 100):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the
            vocabulary, like in T5 preprocessing; see `here
            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer.
@@ -169,9 +169,8 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: ``X </s>``
        - pair of sequences: ``A </s> B </s>``
...
@@ -13,8 +13,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl.
"""
@@ -67,12 +67,15 @@ DETOKENIZE_NUMBERS = [(r" @\,@ ", r","), (r" @\.@ ", r".")]
def tokenize_numbers(text_array: List[str]) -> List[str]:
    """
    Splits large comma-separated numbers and floating point values. This is done by replacing commas with ' @,@ ' and
    dots with ' @.@ '.

    Args:
        text_array: An already tokenized text as list.

    Returns:
        A list of strings with tokenized numbers.

    Example::

        >>> tokenize_numbers(["$", "5,000", "1.73", "m"])
        ["$", "5", "@,@", "000", "1", "@.@", "73", "m"]
@@ -88,12 +91,14 @@ def tokenize_numbers(text_array: List[str]) -> List[str]:
def detokenize_numbers(text: str) -> str:
    """
    Inverts the operation of `tokenize_numbers`. This is replacing ' @,@ ' and ' @.@' by ',' and '.'.

    Args:
        text: A string where the numbers should be detokenized.

    Returns:
        A detokenized string.

    Example::

        >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
        "$ 5,000 1.73 m"
@@ -106,7 +111,8 @@ def detokenize_numbers(text: str) -> str:
class TransfoXLTokenizer(PreTrainedTokenizer):
    """
    Construct a Transformer-XL tokenizer adapted from Vocab class in `the original code
    <https://github.com/kimiyoung/transformer-xl>`__. The Transformer-XL tokenizer is a word-level tokenizer (no
    sub-word tokenization).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.
@@ -129,8 +135,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
        pretrained_vocab_file (:obj:`str`, `optional`):
            File containing the vocabulary as saved with the :obj:`save_pretrained()` method.
        never_split (:obj:`List[str]`, `optional`):
            List of tokens that should never be split. If no list is specified, will simply use the existing special
            tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
@@ -366,9 +372,9 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
    def move_added_token(self, token: str, target_idx: int):
        """
        Moves an added token to a specific position in the vocab. This method should be used when resizing an embedding
        layer other than the last one in the `AdaptiveEmbedding` in order to move the token in the tokenizer from the
        default position (at the very end) to the desired one.

        Args:
            token: The token to move to a specific position in the vocab.
@@ -402,13 +408,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
    def moses_pipeline(self, text: str) -> List[str]:
        """
        Does basic tokenization using :class:`sacremoses.MosesPunctNormalizer` and :class:`sacremoses.MosesTokenizer`
        with `aggressive_dash_splits=True` (see :func:`sacremoses.tokenize.MosesTokenizer.tokenize`). Additionally,
        large comma-separated numbers and floating point values are split. E.g. "23,000 people are 1.80m tall" ->
        "23 @,@ 000 people are 1 @.@ 80m tall".

        Args:
            text: Text to be tokenized.

        Returns:
            A list of tokenized strings.

        Example::

            >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
            >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall")
@@ -443,8 +452,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (string) in a single string. Additionally, the split numbers are converted back
        into their original form.
        """
        out_string = self.moses_detokenizer.detokenize(tokens)
        return detokenize_numbers(out_string).strip()
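
As a quick check that the two helpers above invert each other, composing the documented examples (the module path is
an assumption):

    >>> from transformers.tokenization_transfo_xl import tokenize_numbers, detokenize_numbers
    >>> detokenize_numbers(" ".join(tokenize_numbers(["$", "5,000", "1.73", "m"])))
    '$ 5,000 1.73 m'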
...
@@ -12,8 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
tokenization_utils_fast.py
"""
import itertools
import re
@@ -108,12 +109,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.

    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
    pretrained tokenizers as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    def __init__(self, **kwargs):
@@ -153,8 +153,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary.

        Args:
            new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
@@ -231,11 +231,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        """
        Converts a string in a sequence of tokens, using the tokenizer.

        Note that, unlike Fast tokenizers (instances of PreTrainedTokenizerFast), this method won't replace the unknown
        tokens with the `unk_token` yet (this is done in the `encode()` method).

        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
        (BPE/SentencePieces/WordPieces). Takes care of added tokens.

        Args:
            text (:obj:`str`):
@@ -354,9 +354,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
    def _tokenize(self, text, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        """
@@ -589,8 +588,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens.

        Args:
@@ -641,8 +640,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the
        :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            test (:obj:`str`):
@@ -689,8 +688,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (:obj:`int` or :obj:`List[int]`):
@@ -782,13 +781,13 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
            tgt_texts: (:obj:`List[str]`, `optional`):
                List of summaries or target language texts.
            max_length (:obj:`int`, `optional`):
                Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
                left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
                is required by one of the truncation/padding parameters. If the model has no specific maximum input
                length (like XLNet) truncation/padding to a maximum length will be deactivated.
            max_target_length (:obj:`int`, `optional`):
                Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
                set to :obj:`None`, this will use the max_length value.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls padding. Accepts the following values:
@@ -829,8 +828,8 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **labels** -- List of token ids for tgt_texts

            The full set of keys ``[input_ids, attention_mask, labels]`` will only be returned if tgt_texts is passed.
            Otherwise, input_ids, attention_mask will be the only keys.
        """
        raise NotImplementedError(
            "If your model requires more than input_ids for a typical forward pass, you should implement this method. "
...
@@ -12,10 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user
fronting encoding methods), Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary
of output with special method for the Fast tokenizers)
"""
import copy
@@ -58,8 +58,9 @@ else:
@dataclass(frozen=True, eq=True)
class AddedToken:
    """
    AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the way
    it should behave.
    """

    content: str = field(default_factory=str)
@@ -116,8 +117,8 @@ class ExplicitEnum(Enum):
class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    ONLY_FIRST = "only_first"
@@ -128,8 +129,8 @@ class TruncationStrategy(ExplicitEnum):
class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    LONGEST = "longest"
@@ -139,8 +140,8 @@ class PaddingStrategy(ExplicitEnum):
class TensorType(ExplicitEnum):
    """
    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    PYTORCH = "pt"
@@ -177,8 +178,7 @@ class TokenSpan(NamedTuple):
def to_py_obj(obj):
    """
    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
    """
    if isinstance(obj, (list, tuple)):
        return [to_py_obj(o) for o in obj]
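
A tiny sketch of the conversion above (`to_py_obj` is an internal helper; the import path is an assumption):

    >>> import numpy as np
    >>> from transformers.tokenization_utils_base import to_py_obj
    >>> to_py_obj([np.array([1, 2]), (3, 4)])  # nested containers are converted recursively
    [[1, 2], [3, 4]]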
@@ -194,8 +194,8 @@ def to_py_obj(obj):
class BatchEncoding(UserDict):
    """
    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
    :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
    attention_masks, etc).

    This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
@@ -242,8 +242,8 @@ class BatchEncoding(UserDict):
    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask',
        etc.).

        If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
        """
@@ -289,15 +289,15 @@ class BatchEncoding(UserDict):
    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        :obj:`Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process. Returns
        :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
        """
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion
        to integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
@@ -327,25 +327,24 @@ class BatchEncoding(UserDict):
    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.

        Can be called as:

        - ``self.token_to_word(token_index)`` if batch size is 1
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
                sequence.

        Returns:
            :obj:`int`: Index of the word in the input sequence.
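
A hedged sketch of the pre-tokenized use case described above (checkpoint name assumed; these mappings require a fast
tokenizer):

    >>> from transformers import BertTokenizerFast
    >>> tok = BertTokenizerFast.from_pretrained("bert-base-uncased")  # assumed checkpoint
    >>> enc = tok(["hello", "world"], is_split_into_words=True)
    >>> enc.token_to_word(1)  # token 0 is [CLS]; token 1 covers the first provided word
    0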
...@@ -378,22 +377,21 @@ class BatchEncoding(UserDict): ...@@ -378,22 +377,21 @@ class BatchEncoding(UserDict):
- ``self.word_to_tokens(word_index)`` if batch size is 1 - ``self.word_to_tokens(word_index)`` if batch size is 1
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1 - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1
This method is particularly suited when the input sequences are provided as This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
to easily associate encoded tokens with provided tokenized words. words.
Args: Args:
batch_or_word_index (:obj:`int`): batch_or_word_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
this can be the index of the word in the sequence. the word in the sequence.
word_index (:obj:`int`, `optional`): word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_word_index`, this can be the index If a batch index is provided in `batch_or_word_index`, this can be the index of the word in the
of the word in the sequence. sequence.
Returns: Returns:
Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
Span of tokens in the encoded sequence. Returns :obj:`None` if no tokens correspond Returns :obj:`None` if no tokens correspond to the word.
to the word.
""" """
if not self._encodings: if not self._encodings:
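A minimal sketch of :meth:`word_to_tokens`; the checkpoint name is an assumption and any fast tokenizer behaves the same way::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    encoding = tokenizer("Transformers tokenizers")
    # Word 1 ("tokenizers") may be split into several sub-word tokens; the returned
    # TokenSpan holds half-open [start, end) indices into encoding.tokens()
    span = encoding.word_to_tokens(1)
    print(encoding.tokens()[span.start : span.end])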
...@@ -427,15 +425,14 @@ class BatchEncoding(UserDict): ...@@ -427,15 +425,14 @@ class BatchEncoding(UserDict):
Args: Args:
batch_or_token_index (:obj:`int`): batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
this can be the index of the token in the sequence. the token in the sequence.
token_index (:obj:`int`, `optional`): token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in
of the token or tokens in the sequence. the sequence.
Returns: Returns:
:class:`~transformers.tokenization_utils_base.CharSpan`: :class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string.
Span of characters in the original string.
""" """
if not self._encodings: if not self._encodings:
...@@ -449,25 +446,25 @@ class BatchEncoding(UserDict): ...@@ -449,25 +446,25 @@ class BatchEncoding(UserDict):
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
""" """
Get the index of the token in the encoded output comprising a character Get the index of the token in the encoded output comprising a character in the original string for a sequence
in the original string for a sequence of the batch. of the batch.
Can be called as: Can be called as:
- ``self.char_to_token(char_index)`` if batch size is 1 - ``self.char_to_token(char_index)`` if batch size is 1
- ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1 - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1
This method is particularly suited when the input sequences are provided as This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows you are defined by the user). In this case it allows you to easily associate encoded tokens with provided tokenized
to easily associate encoded tokens with provided tokenized words. words.
Args: Args:
batch_or_char_index (:obj:`int`): batch_or_char_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
this can be the index of the character in the sequence the character in the sequence
char_index (:obj:`int`, `optional`): char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_char_index`, this can be the index If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
of the character in the sequence. sequence.
Returns: Returns:
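A minimal sketch of the two character/token mappings, :meth:`token_to_chars` and :meth:`char_to_token`, with ``bert-base-uncased`` as an assumed example checkpoint::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    text = "Hello world"
    encoding = tokenizer(text)
    span = encoding.token_to_chars(1)        # characters covered by token 1 ('hello')
    print(text[span.start : span.end])       # 'Hello'
    print(encoding.char_to_token(6))         # 2, the token covering character 6 ('w')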
...@@ -485,8 +482,7 @@ class BatchEncoding(UserDict): ...@@ -485,8 +482,7 @@ class BatchEncoding(UserDict):
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
""" """
Get the character span in the original string corresponding to a given word in a sequence Get the character span in the original string corresponding to a given word in a sequence of the batch.
of the batch.
Character spans are returned as a CharSpan NamedTuple with: Character spans are returned as a CharSpan NamedTuple with:
...@@ -500,19 +496,19 @@ class BatchEncoding(UserDict): ...@@ -500,19 +496,19 @@ class BatchEncoding(UserDict):
Args: Args:
batch_or_word_index (:obj:`int`): batch_or_word_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
this can be the index of the word in the sequence the word in the sequence
word_index (:obj:`int`, `optional`): word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_word_index`, this can be the index If a batch index is provided in `batch_or_word_index`, this can be the index of the word in the
of the word in the sequence. sequence.
Returns: Returns:
:obj:`CharSpan` or :obj:`List[CharSpan]`: :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
Span(s) of the associated character or characters in the string. CharSpan are NamedTuple with:
CharSpan are NamedTuple with:
- start: index of the first character associated to the token in the original string - start: index of the first character associated to the token in the original string
- end: index of the character following the last character associated to the token in the original string - end: index of the character following the last character associated to the token in the original
string
""" """
if not self._encodings: if not self._encodings:
...@@ -526,30 +522,29 @@ class BatchEncoding(UserDict): ...@@ -526,30 +522,29 @@ class BatchEncoding(UserDict):
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
""" """
Get the word in the original string corresponding to a character in the original string of Get the word in the original string corresponding to a character in the original string of a sequence of the
a sequence of the batch. batch.
Can be called as: Can be called as:
- ``self.char_to_word(char_index)`` if batch size is 1 - ``self.char_to_word(char_index)`` if batch size is 1
- ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1 - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
pre-tokenized sequences (i.e. words are defined by the user). In this case it allows you are defined by the user). In this case it allows you to easily associate encoded tokens with provided tokenized
to easily associate encoded tokens with provided tokenized words. words.
Args: Args:
batch_or_char_index (:obj:`int`): batch_or_char_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence, Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
this can be the index of the character in the original string. the character in the original string.
char_index (:obj:`int`, `optional`): char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_char_index`, this can be the index If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
of the character in the original string. original string.
Returns: Returns:
:obj:`int` or :obj:`List[int]`: :obj:`int` or :obj:`List[int]`: Index or indices of the associated word(s).
Index or indices of the associated word(s).
""" """
if not self._encodings: if not self._encodings:
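A minimal sketch of :meth:`word_to_chars` and :meth:`char_to_word`, again with ``bert-base-uncased`` as an assumed checkpoint::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    text = "Hello world"
    encoding = tokenizer(text)
    span = encoding.word_to_chars(1)         # characters covered by word 1
    print(text[span.start : span.end])       # 'world'
    print(encoding.char_to_word(6))          # 1, character 6 is inside word 1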
...@@ -642,8 +637,8 @@ class BatchEncoding(UserDict): ...@@ -642,8 +637,8 @@ class BatchEncoding(UserDict):
device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on. device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
Returns: Returns:
:class:`~transformers.BatchEncoding`: :class:`~transformers.BatchEncoding`: The same instance of :class:`~transformers.BatchEncoding` after
The same instance of :class:`~transformers.BatchEncoding` after modification. modification.
""" """
self.data = {k: v.to(device) for k, v in self.data.items()} self.data = {k: v.to(device) for k, v in self.data.items()}
return self return self
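A minimal sketch of moving a batch to a device with :meth:`to`, assuming PyTorch is installed and ``bert-base-uncased`` is used as an example checkpoint::

    import torch
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch = tokenizer(["first sentence", "a second, longer sentence"], padding=True, return_tensors="pt")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch = batch.to(device)                 # moves every tensor, returns the same instance
    print(batch["input_ids"].device)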
...@@ -651,8 +646,8 @@ class BatchEncoding(UserDict): ...@@ -651,8 +646,8 @@ class BatchEncoding(UserDict):
class SpecialTokensMixin: class SpecialTokensMixin:
""" """
A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
to handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be handle specific behaviors related to special tokens. In particular, this class holds the attributes which can be
used to directly access these special tokens in a model-independent manner and allows setting and updating the special used to directly access these special tokens in a model-independent manner and allows setting and updating the special
tokens. tokens.
...@@ -874,8 +869,8 @@ class SpecialTokensMixin: ...@@ -874,8 +869,8 @@ class SpecialTokensMixin:
@property @property
def sep_token(self) -> str: def sep_token(self) -> str:
""" """
:obj:`str`: Separation token, to separate context and query in an input sequence. :obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while
Log an error if used while not having been set. not having been set.
""" """
if self._sep_token is None and self.verbose: if self._sep_token is None and self.verbose:
logger.error("Using sep_token, but it is not set yet.") logger.error("Using sep_token, but it is not set yet.")
...@@ -895,8 +890,8 @@ class SpecialTokensMixin: ...@@ -895,8 +890,8 @@ class SpecialTokensMixin:
@property @property
def cls_token(self) -> str: def cls_token(self) -> str:
""" """
:obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
the full depth of the model. Log an error if used while not having been set. full depth of the model. Log an error if used while not having been set.
""" """
if self._cls_token is None and self.verbose: if self._cls_token is None and self.verbose:
logger.error("Using cls_token, but it is not set yet.") logger.error("Using cls_token, but it is not set yet.")
...@@ -1039,8 +1034,8 @@ class SpecialTokensMixin: ...@@ -1039,8 +1034,8 @@ class SpecialTokensMixin:
@property @property
def additional_special_tokens_ids(self) -> List[int]: def additional_special_tokens_ids(self) -> List[int]:
""" """
:obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
Log an error if used while not having been set. having been set.
""" """
return self.convert_tokens_to_ids(self.additional_special_tokens) return self.convert_tokens_to_ids(self.additional_special_tokens)
...@@ -1079,8 +1074,8 @@ class SpecialTokensMixin: ...@@ -1079,8 +1074,8 @@ class SpecialTokensMixin:
@property @property
def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
""" """
:obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`,
(:obj:`cls_token`, :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.). :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
Convert potential tokens of :obj:`tokenizers.AddedToken` type to string. Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
""" """
...@@ -1199,8 +1194,8 @@ ENCODE_KWARGS_DOCSTRING = r""" ...@@ -1199,8 +1194,8 @@ ENCODE_KWARGS_DOCSTRING = r"""
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
return_token_type_ids (:obj:`bool`, `optional`): return_token_type_ids (:obj:`bool`, `optional`):
Whether to return token type IDs. If left to the default, will return the token type IDs according Whether to return token type IDs. If left to the default, will return the token type IDs according to
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
`What are token type IDs? <../glossary.html#token-type-ids>`__ `What are token type IDs? <../glossary.html#token-type-ids>`__
return_attention_mask (:obj:`bool`, `optional`): return_attention_mask (:obj:`bool`, `optional`):
...@@ -1230,14 +1225,17 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" ...@@ -1230,14 +1225,17 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
- **input_ids** -- List of token ids to be fed to a model. - **input_ids** -- List of token ids to be fed to a model.
`What are input IDs? <../glossary.html#input-ids>`__ `What are input IDs? <../glossary.html#input-ids>`__
- **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True` - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
or if `"token_type_ids"` is in :obj:`self.model_input_names`). or if `"token_type_ids"` is in :obj:`self.model_input_names`).
`What are token type IDs? <../glossary.html#token-type-ids>`__ `What are token type IDs? <../glossary.html#token-type-ids>`__
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
:obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`). :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
- **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
:obj:`return_overflowing_tokens=True`). :obj:`return_overflowing_tokens=True`).
- **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
...@@ -1249,6 +1247,7 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" ...@@ -1249,6 +1247,7 @@ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
INIT_TOKENIZER_DOCSTRING = r""" INIT_TOKENIZER_DOCSTRING = r"""
Class attributes (overridden by derived classes) Class attributes (overridden by derived classes)
- **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
each vocabulary file required by the model, and as associated values, the filename for saving the associated each vocabulary file required by the model, and as associated values, the filename for saving the associated
file (string). file (string).
...@@ -1260,8 +1259,8 @@ INIT_TOKENIZER_DOCSTRING = r""" ...@@ -1260,8 +1259,8 @@ INIT_TOKENIZER_DOCSTRING = r"""
:obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
inputs of this model, or :obj:`None` if the model has no maximum input size. inputs of this model, or :obj:`None` if the model has no maximum input size.
- **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
:obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
method. method.
- **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model. - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
...@@ -1270,11 +1269,10 @@ INIT_TOKENIZER_DOCSTRING = r""" ...@@ -1270,11 +1269,10 @@ INIT_TOKENIZER_DOCSTRING = r"""
Args: Args:
model_max_length (:obj:`int`, `optional`): model_max_length (:obj:`int`, `optional`):
The maximum length (in number of tokens) for the inputs to the transformer model. The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
When the tokenizer is loaded with loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this will be set to will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no
the value stored for the associated model in ``max_model_input_sizes`` (see above). If no value is value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
padding_side: (:obj:`str`, `optional`): padding_side: (:obj:`str`, `optional`):
The side on which the model should have padding applied. Should be selected between ['right', 'left']. The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name. Default value is picked from the class attribute of the same name.
...@@ -1319,13 +1317,13 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """ ...@@ -1319,13 +1317,13 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
tgt_texts (:obj:`list`, `optional`): tgt_texts (:obj:`list`, `optional`):
List of summaries or target language texts. List of summaries or target language texts.
max_length (:obj:`int`, `optional`): max_length (:obj:`int`, `optional`):
Controls the maximum length for encoder inputs (documents to summarize or source language texts). Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
length is required by one of the truncation/padding parameters. If the model has no specific maximum is required by one of the truncation/padding parameters. If the model has no specific maximum input
input length (like XLNet) truncation/padding to a maximum length will be deactivated. length (like XLNet) truncation/padding to a maximum length will be deactivated.
max_target_length (:obj:`int`, `optional`): max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries). Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or set
If left unset or set to :obj:`None`, this will use the max_length value. to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
...@@ -1366,8 +1364,8 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """ ...@@ -1366,8 +1364,8 @@ PREPARE_SEQ2SEQ_BATCH_DOCSTRING = """
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
- **labels** -- List of token ids for tgt_texts. - **labels** -- List of token ids for tgt_texts.
The full set of keys ``[input_ids, attention_mask, labels]``, The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed.
will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. Otherwise, input_ids, attention_mask will be the only keys.
""" """
...@@ -1515,9 +1513,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1515,9 +1513,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Whether or not to delete incompletely received files. Attempt to resume the download if such a file Whether or not to delete incompletely received files. Attempt to resume the download if such a file
exists. exists.
proxies (:obj:`Dict[str, str]`, `optional`): proxies (:obj:`Dict[str, str]`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g., A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
request.
inputs (additional positional arguments, `optional`): inputs (additional positional arguments, `optional`):
Will be passed along to the Tokenizer ``__init__`` method. Will be passed along to the Tokenizer ``__init__`` method.
kwargs (additional keyword arguments, `optional`): kwargs (additional keyword arguments, `optional`):
...@@ -1792,10 +1789,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1792,10 +1789,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method. :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.
.. Note:: .. Note::
A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with A "fast" tokenizer (instance of :class:`transformers.PreTrainedTokenizerFast`) saved with this method will
this method will not be possible to load back not be possible to load back in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer`
in a "slow" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizer` instance. It can only be loaded instance. It can only be loaded in a "fast" tokenizer, i.e. in a
in a "fast" tokenizer, i.e. in a :class:`transformers.PreTrainedTokenizerFast` instance. :class:`transformers.PreTrainedTokenizerFast` instance.
.. Warning:: .. Warning::
This won't save modifications you may have applied to the tokenizer after the instantiation (for instance, This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
...@@ -1804,10 +1801,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1804,10 +1801,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args: Args:
save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved. save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.
legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`): legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and a
a separate added_tokens file or in the unified JSON file format for the `tokenizers` library. separate added_tokens file or in the unified JSON file format for the `tokenizers` library. It's only
It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible possible to save a Fast tokenizer in the unified JSON format and this format is incompatible with
with "slow" tokenizers (not powered by the `tokenizers` library). "slow" tokenizers (not powered by the `tokenizers` library).
filename_prefix: (:obj:`str`, `optional`): filename_prefix: (:obj:`str`, `optional`):
A prefix to add to the names of the files saved by the tokenizer. A prefix to add to the names of the files saved by the tokenizer.
...@@ -1871,10 +1868,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1871,10 +1868,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
legacy_format: bool = True, legacy_format: bool = True,
filename_prefix: Optional[str] = None, filename_prefix: Optional[str] = None,
) -> Tuple[str]: ) -> Tuple[str]:
"""Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. """
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained` specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
""" """
if not legacy_format: if not legacy_format:
raise ValueError( raise ValueError(
...@@ -1898,9 +1896,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1898,9 +1896,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
""" """
Save only the vocabulary of the tokenizer (vocabulary + added tokens). Save only the vocabulary of the tokenizer (vocabulary + added tokens).
This method won't save the configuration and special token mappings of the tokenizer. This method won't save the configuration and special token mappings of the tokenizer. Use
Use :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.
the whole state of the tokenizer.
Args: Args:
save_directory (:obj:`str`): save_directory (:obj:`str`):
...@@ -1918,10 +1915,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1918,10 +1915,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Converts a string into a sequence of tokens, using the backend Rust tokenizer. Converts a string into a sequence of tokens, using the backend Rust tokenizer.
Note that this method behaves differently between fast and slow tokenizers: Note that this method behaves differently between fast and slow tokenizers:
- in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method
will replace the unknown tokens with the :obj:`unk_token`, - in fast tokenizers (instances of :class:`~transformers.PreTrainedTokenizerFast`), this method will
- in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method - in slow tokenizers (instances of :class:`~transformers.PreTrainedTokenizer`), this method keeps unknown
keeps unknown tokens unchanged. tokens unchanged.
tokens unchanged.
Args: Args:
text (:obj:`str`): text (:obj:`str`):
...@@ -1931,8 +1929,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1931,8 +1929,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add the special tokens associated with the corresponding model. Whether or not to add the special tokens associated with the corresponding model.
kwargs (additional keyword arguments, `optional`): kwargs (additional keyword arguments, `optional`):
Will be passed to the underlying model specific encode method. Will be passed to the underlying model specific encode method. See details in
See details in :meth:`~transformers.PreTrainedTokenizer.__call__` :meth:`~transformers.PreTrainedTokenizer.__call__`
Returns: Returns:
:obj:`List[str]`: The list of tokens. :obj:`List[str]`: The list of tokens.
...@@ -1946,8 +1944,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1946,8 +1944,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
""", """,
""" """
Returns: Returns:
:obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the
The tokenized ids of the text. text.
""", """,
) )
def encode( def encode(
...@@ -1969,12 +1967,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1969,12 +1967,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args: Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
the ``tokenize`` method) or a list of integers (tokenized string ids using the ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
``convert_tokens_to_ids`` method). method).
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`): text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
string using the ``tokenize`` method) or a list of integers (tokenized string ids using the the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method). ``convert_tokens_to_ids`` method).
""" """
encoded_inputs = self.encode_plus( encoded_inputs = self.encode_plus(
...@@ -1998,8 +1996,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -1998,8 +1996,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
): ):
""" """
Find the correct padding/truncation strategy with backward compatibility Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
for old arguments (truncation_strategy and pad_to_max_length) and behaviors. and pad_to_max_length) and behaviors.
""" """
old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)
...@@ -2150,14 +2148,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2150,14 +2148,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args: Args:
text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded. The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
Each sequence can be a string or a list of strings (pretokenized string). (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
If the sequences are provided as list of strings (pretokenized), you must set
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
The sequence or batch of sequences to be encoded. The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
Each sequence can be a string or a list of strings (pretokenized string). (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
If the sequences are provided as list of strings (pretokenized), you must set
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
""" """
# Input type checking for clearer error # Input type checking for clearer error
...@@ -2276,12 +2272,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2276,12 +2272,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args: Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)): text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
the ``tokenize`` method) or a list of integers (tokenized string ids using the ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
``convert_tokens_to_ids`` method). method).
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`): text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
string using the ``tokenize`` method) or a list of integers (tokenized string ids using the the ``tokenize`` method) or a list of integers (tokenized string ids using the
``convert_tokens_to_ids`` method). ``convert_tokens_to_ids`` method).
""" """
...@@ -2375,9 +2371,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2375,9 +2371,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args: Args:
batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
Batch of sequences or pair of sequences to be encoded. Batch of sequences or pair of sequences to be encoded. This can be a list of
This can be a list of string/string-sequences/int-sequences or a list of pairs of string/string-sequences/int-sequences or a list of pairs of string/string-sequences/int-sequences (see
string/string-sequences/int-sequences (see details in ``encode_plus``). details in ``encode_plus``).
""" """
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length' # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
...@@ -2459,8 +2455,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2459,8 +2455,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
in the batch. in the batch.
Padding side (left/right) and padding token ids are defined at the tokenizer level Padding side (left/right) and padding token ids are defined at the tokenizer level (with ``self.padding_side``,
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``). ``self.pad_token_id`` and ``self.pad_token_type_id``).
.. note:: .. note::
...@@ -2470,10 +2466,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2470,10 +2466,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args: Args:
encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`): encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
:obj:`Dict[str, List[int]]`) or a batch of tokenized inputs (list of List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
:class:`~transformers.BatchEncoding`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
you can use this method during preprocessing as well as in a PyTorch Dataloader collate function. well as in a PyTorch Dataloader collate function.
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type. see the note above for the return type.
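A minimal sketch of :meth:`pad` used as a collate function, with ``bert-base-uncased`` as an assumed checkpoint::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    features = [
        {"input_ids": tokenizer.encode("a short sentence")},
        {"input_ids": tokenizer.encode("a noticeably longer sentence that forces padding")},
    ]
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")
    print(batch["input_ids"].shape, batch["attention_mask"].shape)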
...@@ -2592,8 +2588,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2592,8 +2588,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create the token type IDs corresponding to the sequences passed. Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
`What are token type IDs? <../glossary.html#token-type-ids>`__ <../glossary.html#token-type-ids>`__
Should be overridden in a subclass if the model has a special way of building those. Should be overridden in a subclass if the model has a special way of building those.
...@@ -2612,8 +2608,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2612,8 +2608,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens.
This implementation does not add special tokens and this method should be overridden in a subclass. This implementation does not add special tokens and this method should be overridden in a subclass.
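A minimal sketch of these two methods as overridden by a BERT-style tokenizer; ``bert-base-uncased`` is only an assumed example checkpoint::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How are you?"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Fine, thanks."))
    input_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)           # [CLS] A [SEP] B [SEP]
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)  # 0s then 1s
    print(len(input_ids) == len(token_type_ids))  # True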
...@@ -2651,17 +2647,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2651,17 +2647,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
**kwargs **kwargs
) -> BatchEncoding: ) -> BatchEncoding:
""" """
Prepares a sequence of input ids, or a pair of sequences of input ids so that it can be used by the model. Prepares a sequence of input ids, or a pair of sequences of input ids so that it can be used by the model. It
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
manages a moving window (with user defined stride) for overflowing tokens. manages a moving window (with user defined stride) for overflowing tokens.
Args: Args:
ids (:obj:`List[int]`): ids (:obj:`List[int]`):
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
``tokenize`` and ``convert_tokens_to_ids`` methods. and ``convert_tokens_to_ids`` methods.
pair_ids (:obj:`List[int]`, `optional`): pair_ids (:obj:`List[int]`, `optional`):
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
``tokenize`` and ``convert_tokens_to_ids`` methods. and ``convert_tokens_to_ids`` methods.
""" """
if "return_lengths" in kwargs: if "return_lengths" in kwargs:
...@@ -2780,28 +2776,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2780,28 +2776,28 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Args: Args:
ids (:obj:`List[int]`): ids (:obj:`List[int]`):
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
``tokenize`` and ``convert_tokens_to_ids`` methods. and ``convert_tokens_to_ids`` methods.
pair_ids (:obj:`List[int]`, `optional`): pair_ids (:obj:`List[int]`, `optional`):
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
``tokenize`` and ``convert_tokens_to_ids`` methods. and ``convert_tokens_to_ids`` methods.
num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0): num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
Number of tokens to remove using the truncation strategy. Number of tokens to remove using the truncation strategy.
truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`): truncation (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
The strategy to follow for truncation. Can be: The strategy to follow for truncation. Can be:
* :obj:`'longest_first'`: Truncate to a maximum length specified with the argument * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
:obj:`max_length` or to the maximum acceptable input length for the model if that argument is not to the maximum acceptable input length for the model if that argument is not provided. This will
provided. This will truncate token by token, removing a token from the longest sequence in the pair truncate token by token, removing a token from the longest sequence in the pair if a pair of
if a pair of sequences (or a batch of pairs) is provided. sequences (or a batch of pairs) is provided.
* :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
the maximum acceptable input length for the model if that argument is not provided. This will only the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
to the maximum acceptable input length for the model if that argument is not provided. This will only to the maximum acceptable input length for the model if that argument is not provided. This will only
truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
* :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
sequence lengths greater than the model maximum admissible input size). greater than the model maximum admissible input size).
max_length (:obj:`int`, `optional`): max_length (:obj:`int`, `optional`):
Controls the maximum length to use by one of the truncation/padding parameters. Controls the maximum length to use by one of the truncation/padding parameters.
...@@ -2809,12 +2805,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2809,12 +2805,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
length is required by one of the truncation/padding parameters. If the model has no specific maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated. input length (like XLNet) truncation/padding to a maximum length will be deactivated.
stride (:obj:`int`, `optional`, defaults to 0): stride (:obj:`int`, `optional`, defaults to 0):
If set to a positive number, the overflowing tokens returned will contain some tokens If set to a positive number, the overflowing tokens returned will contain some tokens from the main
from the main sequence returned. The value of this argument defines the number of additional tokens. sequence returned. The value of this argument defines the number of additional tokens.
Returns: Returns:
:obj:`Tuple[List[int], List[int], List[int]]`: :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
The truncated ``ids``, the truncated ``pair_ids`` and the list of overflowing tokens. list of overflowing tokens.
""" """
if num_tokens_to_remove <= 0: if num_tokens_to_remove <= 0:
return ids, pair_ids, [] return ids, pair_ids, []
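A minimal sketch of :meth:`truncate_sequences`, with ``bert-base-uncased`` as an assumed checkpoint::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    ids = tokenizer.encode("a fairly long first sequence of words", add_special_tokens=False)
    pair_ids = tokenizer.encode("a short second one", add_special_tokens=False)
    ids, pair_ids, overflowing = tokenizer.truncate_sequences(
        ids, pair_ids=pair_ids, num_tokens_to_remove=3, truncation_strategy="longest_first"
    )
    print(len(ids), len(pair_ids), len(overflowing))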
...@@ -2882,10 +2878,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2882,10 +2878,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_length: maximum length of the returned list and optionally padding length (see below). max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens. Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding. padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad - PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side: The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences - 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences - 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
...@@ -2939,9 +2937,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2939,9 +2937,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def convert_tokens_to_string(self, tokens: List[str]) -> str: def convert_tokens_to_string(self, tokens: List[str]) -> str:
""" """
Converts a sequence of tokens (strings) into a single string. Converts a sequence of tokens (strings) into a single string. The simplest way to do it is ``" ".join(tokens)`` but
The simplest way to do it is ``" ".join(tokens)`` but we often want to remove we often want to remove sub-word tokenization artifacts at the same time.
sub-word tokenization artifacts at the same time.
Args: Args:
tokens (:obj:`List[str]`): The tokens to join into a string. tokens (:obj:`List[str]`): The tokens to join into a string.
Return: The joined tokens. Return: The joined tokens.
...@@ -2989,8 +2987,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): ...@@ -2989,8 +2987,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
**kwargs **kwargs
) -> str: ) -> str:
""" """
Converts a sequence of ids into a string, using the tokenizer and vocabulary Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
with options to remove special tokens and clean up tokenization spaces. tokens and clean up tokenization spaces.
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
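A minimal sketch of :meth:`decode`, with ``bert-base-uncased`` as an assumed checkpoint::

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    ids = tokenizer.encode("Hello, world!")
    print(tokenizer.decode(ids))                              # keeps [CLS] ... [SEP]
    print(tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True))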
......
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). """
For slow (python) tokenizers see tokenization_utils.py Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
""" """
import json import json
...@@ -71,9 +72,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -71,9 +72,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
Handles all the shared methods for tokenization and special tokens, as well as methods for Handles all the shared methods for tokenization and special tokens, as well as methods for
downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary. downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.
This class also contains the added tokens in a unified way on top of all tokenizers so we don't This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
have to handle the specific vocabulary augmentation methods of the various underlying specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
dictionary structures (BPE, sentencepiece...).
""" """
slow_tokenizer_class: PreTrainedTokenizer = None slow_tokenizer_class: PreTrainedTokenizer = None
...@@ -170,10 +170,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -170,10 +170,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
return_length: bool = False, return_length: bool = False,
verbose: bool = True, verbose: bool = True,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict. """
Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict.
Overflowing tokens are converted to additional examples (like batches) so the output values of Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
the dict are lists (overflows) of lists (tokens). lists (overflows) of lists (tokens).
Output shape: (overflows, sequence length) Output shape: (overflows, sequence length)
""" """
...@@ -263,8 +264,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -263,8 +264,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
self, ids: Union[int, List[int]], skip_special_tokens: bool = False self, ids: Union[int, List[int]], skip_special_tokens: bool = False
) -> Union[str, List[str]]: ) -> Union[str, List[str]]:
""" """
Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
and added tokens. added tokens.
Args: Args:
ids (:obj:`int` or :obj:`List[int]`): ids (:obj:`int` or :obj:`List[int]`):
...@@ -511,10 +512,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): ...@@ -511,10 +512,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
legacy_format: bool = True, legacy_format: bool = True,
filename_prefix: Optional[str] = None, filename_prefix: Optional[str] = None,
) -> Tuple[str]: ) -> Tuple[str]:
"""Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. """
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
using the specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained`
""" """
if legacy_format: if legacy_format:
added_tokens_file = os.path.join( added_tokens_file = os.path.join(
......
...@@ -429,8 +429,8 @@ PRETRAINED_INIT_CONFIGURATION = { ...@@ -429,8 +429,8 @@ PRETRAINED_INIT_CONFIGURATION = {
def get_pairs(word): def get_pairs(word):
""" """
Return set of symbol pairs in a word. Return set of symbol pairs in a word. Word is represented as a tuple of symbols (symbols being variable-length
Word is represented as a tuple of symbols (symbols being variable-length strings). strings).
""" """
pairs = set() pairs = set()
prev_char = word[0] prev_char = word[0]
...@@ -556,18 +556,17 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -556,18 +556,17 @@ class XLMTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<special1>"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"<special1>"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
...@@ -750,35 +749,44 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -750,35 +749,44 @@ class XLMTokenizer(PreTrainedTokenizer):
def _tokenize(self, text, lang="en", bypass_tokenizer=False): def _tokenize(self, text, lang="en", bypass_tokenizer=False):
""" """
Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language-specific tokenizer. Otherwise, we use Moses. Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language-specific
tokenizer. Otherwise, we use Moses.
Details of tokenization: Details of tokenization:
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
- [sacremoses](https://github.com/alvations/sacremoses): port of Moses
- Install with `pip install sacremoses` - Install with `pip install sacremoses`
- [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer
- Install with `pip install pythainlp` - Install with `pip install pythainlp`
- [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea) - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of
[KyTea](https://github.com/neubig/kytea)
- Install with the following steps: - Install with the following steps:
```
git clone git@github.com:neubig/kytea.git && cd kytea ::
autoreconf -i
./configure --prefix=$HOME/local git clone git@github.com:neubig/kytea.git && cd kytea
make && make install autoreconf -i
pip install kytea ./configure --prefix=$HOME/local
``` make && make install
- [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*) pip install kytea
- [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*)
- Install with `pip install jieba` - Install with `pip install jieba`
(*) The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). (*) The original XLM used [Stanford
However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper
Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. Jieba is a lot
if you fine-tune the model with Chinese supervision. If you want the same exact behaviour, use the original XLM faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine if you
[preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally, fine-tune the model with Chinese supervision. If you want the same exact behaviour, use the original XLM
and set `bypass_tokenizer=True` to bypass the tokenizer. [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence
externally, and set `bypass_tokenizer=True` to bypass the tokenizer.
Args: Args:
- lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported languages. However, we don't enforce it.
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE. - lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported
languages. However, we don't enforce it.
- bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
(bool). If True, we only apply BPE.
Returns: Returns:
List of tokens. List of tokens.
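A hedged usage sketch of the tokenization path documented above; the checkpoint name is an assumption, and `sacremoses` must be installed for the default Moses path.

```
from transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")  # assumed checkpoint

# Default path: Moses pre-tokenization (sacremoses) followed by BPE.
print(tokenizer.tokenize("Hello, how are you?"))

# Externally pre-tokenized input: bypass Moses and only apply BPE.
pretokenized = "Hello , how are you ?"
print(tokenizer._tokenize(pretokenized, lang="en", bypass_tokenizer=True))
```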
...@@ -855,9 +863,8 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -855,9 +863,8 @@ class XLMTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An XLM sequence has the following format:
An XLM sequence has the following format:
- single sequence: ``<s> X </s>`` - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>`` - pair of sequences: ``<s> A </s> B </s>``
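A minimal sketch of the format listed above, assuming placeholder ids for ``<s>`` and ``</s>``; the real method lives on :class:`XLMTokenizer` and uses its own special-token ids.

```
from typing import List, Optional

def build_inputs(bos_id: int, sep_id: int,
                 token_ids_0: List[int],
                 token_ids_1: Optional[List[int]] = None) -> List[int]:
    # <s> X </s>  or  <s> A </s> B </s>
    if token_ids_1 is None:
        return [bos_id] + token_ids_0 + [sep_id]
    return [bos_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]

print(build_inputs(0, 1, [10, 11]))        # [0, 10, 11, 1]
print(build_inputs(0, 1, [10, 11], [12]))  # [0, 10, 11, 1, 12, 1]
```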
...@@ -919,8 +926,8 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -919,8 +926,8 @@ class XLMTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
An XLM sequence pair mask has the following format: pair mask has the following format:
:: ::
......
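A hedged sketch of the pair mask described above: token type ids are 0 over the first sequence (its special tokens included) and 1 over the second.

```
def token_type_ids(len_ids_0: int, len_ids_1: int) -> list:
    # +2 covers <s> and </s> around the first sequence, +1 the trailing </s> of the second.
    return [0] * (len_ids_0 + 2) + [1] * (len_ids_1 + 1)

print(token_type_ids(3, 2))  # [0, 0, 0, 0, 0, 1, 1, 1]
```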
...@@ -70,23 +70,22 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): ...@@ -70,23 +70,22 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
...@@ -98,9 +97,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): ...@@ -98,9 +97,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
sp_model (:obj:`SentencePieceProcessor`): conversion (string, tokens and IDs).
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -216,8 +214,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): ...@@ -216,8 +214,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLMProphetNet
XLMProphetNet does not make use of token type ids, therefore a list of zeros is returned. does not make use of token type ids, therefore a list of zeros is returned.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
...@@ -285,9 +283,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): ...@@ -285,9 +283,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An XLMProphetNet sequence has the following format:
An XLMProphetNet sequence has the following format:
- single sequence: ``X [SEP]`` - single sequence: ``X [SEP]``
- pair of sequences: ``A [SEP] B [SEP]`` - pair of sequences: ``A [SEP] B [SEP]``
......
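A hedged sketch combining the two XLMProphetNet helpers documented in this file: ``[SEP]`` is appended after each sequence, and the token type ids are always zeros. The ``sep_id`` value below is a placeholder.

```
def build_inputs(sep_id, ids_0, ids_1=None):
    # X [SEP]  or  A [SEP] B [SEP]
    return ids_0 + [sep_id] if ids_1 is None else ids_0 + [sep_id] + ids_1 + [sep_id]

def token_type_ids(sep_id, ids_0, ids_1=None):
    # XLMProphetNet does not use token type ids, so the mask is all zeros.
    return [0] * len(build_inputs(sep_id, ids_0, ids_1))

print(build_inputs(102, [10, 11], [12]))    # [10, 11, 102, 12, 102]
print(token_type_ids(102, [10, 11], [12]))  # [0, 0, 0, 0, 0]
```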
...@@ -68,23 +68,22 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -68,23 +68,22 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
...@@ -96,9 +95,8 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -96,9 +95,8 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
sp_model (:obj:`SentencePieceProcessor`): conversion (string, tokens and IDs).
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -162,9 +160,8 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -162,9 +160,8 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An XLM-RoBERTa sequence has the following format:
An XLM-RoBERTa sequence has the following format:
- single sequence: ``<s> X </s>`` - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>`` - pair of sequences: ``<s> A </s></s> B </s>``
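A minimal sketch of the XLM-RoBERTa format listed above, with placeholder ids; note the doubled separator between the two sequences of a pair.

```
from typing import List, Optional

def build_inputs(cls_id: int, sep_id: int,
                 token_ids_0: List[int],
                 token_ids_1: Optional[List[int]] = None) -> List[int]:
    # <s> X </s>  or  <s> A </s></s> B </s>
    if token_ids_1 is None:
        return [cls_id] + token_ids_0 + [sep_id]
    return [cls_id] + token_ids_0 + [sep_id, sep_id] + token_ids_1 + [sep_id]

print(build_inputs(0, 2, [10, 11]))        # [0, 10, 11, 2]
print(build_inputs(0, 2, [10, 11], [12]))  # [0, 10, 11, 2, 2, 12, 2]
```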
...@@ -220,8 +217,8 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -220,8 +217,8 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned. not make use of token type ids, therefore a list of zeros is returned.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
......
...@@ -80,23 +80,22 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): ...@@ -80,23 +80,22 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
...@@ -108,9 +107,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): ...@@ -108,9 +107,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
sp_model (:obj:`SentencePieceProcessor`): conversion (string, tokens and IDs).
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -151,9 +149,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): ...@@ -151,9 +149,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An XLM-RoBERTa sequence has the following format:
An XLM-RoBERTa sequence has the following format:
- single sequence: ``<s> X </s>`` - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>`` - pair of sequences: ``<s> A </s></s> B </s>``
...@@ -209,8 +206,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): ...@@ -209,8 +206,8 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned. not make use of token type ids, therefore a list of zeros is returned.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
......
...@@ -73,28 +73,27 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -73,28 +73,27 @@ class XLNetTokenizer(PreTrainedTokenizer):
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning When building a sequence using special tokens, this is not the token that is used for the beginning of
of sequence. The token used is the :obj:`cls_token`. sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sequence token. The end of sequence token.
.. note:: .. note::
When building a sequence using special tokens, this is not the token that is used for the end When building a sequence using special tokens, this is not the token that is used for the end of
of sequence. The token used is the :obj:`sep_token`. sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`): sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
for sequence classification or for a text and a question for question answering. sequence classification or for a text and a question for question answering. It is also used as the last
It is also used as the last token of a sequence built with special tokens. token of a sequence built with special tokens.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`): cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
The classifier token which is used when doing sequence classification (classification of the whole The classifier token which is used when doing sequence classification (classification of the whole sequence
sequence instead of per-token classification). It is the first token of the sequence when built with instead of per-token classification). It is the first token of the sequence when built with special tokens.
special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
...@@ -227,9 +226,8 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -227,9 +226,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An XLNet sequence has the following format:
An XLNet sequence has the following format:
- single sequence: ``X <sep> <cls>`` - single sequence: ``X <sep> <cls>``
- pair of sequences: ``A <sep> B <sep> <cls>`` - pair of sequences: ``A <sep> B <sep> <cls>``
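A minimal sketch of the XLNet format listed above, with placeholder ids; unlike BERT-style models, the special tokens are appended at the end.

```
from typing import List, Optional

def build_inputs(sep_id: int, cls_id: int,
                 token_ids_0: List[int],
                 token_ids_1: Optional[List[int]] = None) -> List[int]:
    # X <sep> <cls>  or  A <sep> B <sep> <cls>
    if token_ids_1 is None:
        return token_ids_0 + [sep_id, cls_id]
    return token_ids_0 + [sep_id] + token_ids_1 + [sep_id, cls_id]

print(build_inputs(4, 3, [10, 11]))        # [10, 11, 4, 3]
print(build_inputs(4, 3, [10, 11], [12]))  # [10, 11, 4, 12, 4, 3]
```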
...@@ -284,8 +282,8 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -284,8 +282,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
An XLNet sequence pair mask has the following format: sequence pair mask has the following format:
:: ::
......
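A hedged sketch of the XLNet pair mask described above: 0s cover the first sequence and its ``<sep>``, 1s the second sequence and its ``<sep>``, and a final 2 marks ``<cls>``.

```
def token_type_ids(len_ids_0: int, len_ids_1: int) -> list:
    return [0] * (len_ids_0 + 1) + [1] * (len_ids_1 + 1) + [2]

print(token_type_ids(3, 2))  # [0, 0, 0, 0, 1, 1, 1, 2]
```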