Unverified Commit 08f534d2 authored by Sylvain Gugger, committed by GitHub

Doc styling (#8067)

* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Styling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
parent 04a17f85
@@ -42,8 +42,8 @@ class BlenderbotTokenizer(RobertaTokenizer):
    Construct a Blenderbot tokenizer.

    :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
    end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add the BOS
    token to the beginning of sequences.

    Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
    parameters.
@@ -62,9 +62,8 @@ class BlenderbotTokenizer(RobertaTokenizer):
    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A Blenderbot sequence has the following format:

        - single sequence: `` X </s>``
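To make the convention concrete: since Blenderbot appends only the ``</s>`` EOS token and never prepends a BOS token, the method reduces to appending one id. A hypothetical standalone sketch (``eos_token_id=2`` is illustrative, not the real vocabulary id)::

    def add_special_tokens(token_ids, eos_token_id=2):
        # Blenderbot convention: append EOS, never prepend BOS
        return token_ids + [eos_token_id]

    assert add_special_tokens([10, 11, 12]) == [10, 11, 12, 2]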
@@ -81,7 +80,8 @@ class BlenderbotTokenizer(RobertaTokenizer):
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
@@ -99,8 +99,8 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
    """
    Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`str`):
@@ -112,11 +112,12 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`):
            The end of sentence token.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`):
            The token used for padding, for example when batching sequences of different lengths.
        **kwargs
            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`.
    """

    vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
......
@@ -66,23 +66,22 @@ class CamembertTokenizer(PreTrainedTokenizer):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning of
                sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
@@ -94,9 +93,8 @@ class CamembertTokenizer(PreTrainedTokenizer):
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.

    Attributes:
        sp_model (:obj:`SentencePieceProcessor`):
            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
@@ -142,9 +140,8 @@ class CamembertTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A CamemBERT sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``
@@ -199,8 +196,8 @@ class CamembertTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like
        RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
......
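Several tokenizers in this commit (CamemBERT, Longformer, Funnel, FSMT) document this same method, so one illustration: because RoBERTa-style models ignore token type ids, the method just counts tokens, including specials, and returns zeros. A hypothetical standalone sketch of the RoBERTa-style logic (the ``cls``/``sep`` ids are placeholders)::

    def create_token_type_ids_from_sequences(token_ids_0, token_ids_1=None):
        cls, sep = [0], [2]  # placeholder ids for <s> and </s>
        if token_ids_1 is None:
            return [0] * len(cls + token_ids_0 + sep)
        # pair: <s> A </s></s> B </s> - still all zeros
        return [0] * len(cls + token_ids_0 + sep + sep + token_ids_1 + sep)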
@@ -66,11 +66,10 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            .. note::
@@ -147,9 +146,8 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A CamemBERT sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``
@@ -204,8 +202,8 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like
        RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
......
@@ -101,7 +101,8 @@ CONTROL_CODES = {
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
......
@@ -63,13 +63,11 @@ __all__ = ["DebertaTokenizer"]
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 bytes and a corresponding list of unicode strings. The reversible bpe codes work on unicode
    strings. This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. When you're
    at something like a 10B token dataset you end up needing around 5K for decent coverage. This is a significant
    percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and
    unicode strings. It also avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
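The rest of the function is collapsed here. For reference, the widely used GPT-2-style implementation completes the table like this (a sketch under the assumption that this file follows the standard version)::

    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            # map the remaining "problem" bytes to unused code points above 255
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [chr(c) for c in cs]
    return dict(zip(bs, cs))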
@@ -86,8 +84,9 @@ def bytes_to_unicode():
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
@@ -292,27 +291,29 @@ def load_vocab(name=None, tag=None, no_cache=False, cache_dir=None):
class GPT2Tokenizer(object):
    """
    A wrapper of GPT2 tokenizer with similar interface as BERT tokenizer.

    Args:
        vocab_file (:obj:`str`, optional):
            The local path of vocabulary package or the release name of vocabulary in `DeBERTa GitHub releases
            <https://github.com/microsoft/DeBERTa/releases>`_, e.g. "bpe_encoder", default: `None`.

            If it's `None`, then it will download the vocabulary in the latest release from GitHub. The vocabulary file
            is a state dictionary with three items, "dict_map", "vocab", "encoder" which correspond to three files used
            in `RoBERTa`, i.e. `dict.txt`, `vocab.txt` and `encoder.json`. The differences between our wrapped GPT2
            tokenizer and the RoBERTa wrapped tokenizer are:

            - Special tokens: unlike `RoBERTa`, which uses `<s>` and `</s>` as the `start` and `end` tokens of a
              sentence, we use `[CLS]` and `[SEP]` as the `start` and `end` tokens of the input sentence, which is the
              same as `BERT`.

            - We remapped the token ids in our dictionary with regard to the new special tokens: `[PAD]` => 0,
              `[CLS]` => 1, `[SEP]` => 2, `[UNK]` => 3, `[MASK]` => 50264.

        special_tokens (:obj:`list`, optional):
            List of special tokens to be added to the end of the vocabulary.
    """

    def __init__(self, vocab_file=None, special_tokens=None):
        self.pad_token = "[PAD]"
@@ -344,7 +345,8 @@ class GPT2Tokenizer(object):
        self.ids_to_tokens = self.symbols

    def tokenize(self, text):
        """
        Convert an input text to tokens.

        Args:
            text (:obj:`str`): input text to be tokenized.
return [t for t in bpe.split(" ") if t] return [t for t in bpe.split(" ") if t]
def convert_tokens_to_ids(self, tokens): def convert_tokens_to_ids(self, tokens):
"""Convert list of tokens to ids. """
Convert list of tokens to ids
Args: Args:
tokens (:obj:`list<str>`): list of tokens tokens (:obj:`list<str>`): list of tokens
...@@ -375,7 +379,9 @@ class GPT2Tokenizer(object): ...@@ -375,7 +379,9 @@ class GPT2Tokenizer(object):
        return [self.vocab[t] for t in tokens]

    def convert_ids_to_tokens(self, ids):
        """
        Convert list of ids to tokens.

        Args:
            ids (:obj:`list<int>`): list of ids
@@ -392,7 +398,9 @@ class GPT2Tokenizer(object):
        return self.bpe.split_to_words(text)

    def decode(self, tokens):
        """
        Decode list of tokens to text strings.

        Args:
            tokens (:obj:`list<str>`): list of tokens.
@@ -411,7 +419,9 @@ class GPT2Tokenizer(object):
        return self.bpe.decode([int(t) for t in tokens if t not in self.special_tokens])

    def add_special_token(self, token):
        """
        Adds a special token to the dictionary.

        Args:
            token (:obj:`str`): The new token/word to be added to the vocabulary.
@@ -444,7 +454,9 @@ class GPT2Tokenizer(object):
        return self.bpe.decode(map(int, x.split()))

    def add_symbol(self, word, n=1):
        """
        Adds a word to the dictionary.

        Args:
            word (:obj:`str`): The new token/word to be added to the vocabulary.
            n (int, optional): The frequency of the word.
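Taken together, these methods give the wrapper a BERT-like round trip. A hypothetical usage sketch (the vocabulary is downloaded on first use when ``vocab_file`` is ``None``; exact token spellings depend on the release)::

    tokenizer = GPT2Tokenizer()
    tokens = tokenizer.tokenize("Hello world!")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    assert tokenizer.convert_ids_to_tokens(ids) == tokens
    text = tokenizer.decode(tokens)  # back to a plain string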
@@ -477,8 +489,7 @@ class GPT2Tokenizer(object):
class DebertaTokenizer(PreTrainedTokenizer):
    r"""
    Constructs a DeBERTa tokenizer, which runs end-to-end tokenization: punctuation splitting + wordpiece.

    Args:
        vocab_file (:obj:`str`):
@@ -489,15 +500,14 @@ class DebertaTokenizer(PreTrainedTokenizer):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. A BERT sequence has the following format:
A BERT sequence has the following format:
- single sequence: [CLS] X [SEP] - single sequence: [CLS] X [SEP]
- pair of sequences: [CLS] A [SEP] B [SEP] - pair of sequences: [CLS] A [SEP] B [SEP]
...@@ -628,8 +637,8 @@ class DebertaTokenizer(PreTrainedTokenizer): ...@@ -628,8 +637,8 @@ class DebertaTokenizer(PreTrainedTokenizer):
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
""" """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
A DeBERTa sequence pair mask has the following format: sequence pair mask has the following format:
:: ::
......
@@ -129,20 +129,20 @@ DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "en
CUSTOM_DPR_READER_DOCSTRING = r"""
    Return a dictionary with the token ids of the input strings and other information to give to
    :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) into a
    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of
    size :obj:`(n_passages, sequence_length)` with the format:

    ::

        [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>

    Args:
        questions (:obj:`str` or :obj:`List[str]`):
            The questions to be encoded. You can specify one question for many passages. In this case, the question
            will be duplicated like :obj:`[questions] * n_passages`. Otherwise you have to specify as many questions as
            in :obj:`titles` or :obj:`texts`.
        titles (:obj:`str` or :obj:`List[str]`):
            The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
        texts (:obj:`str` or :obj:`List[str]`):
@@ -150,8 +150,8 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
            Activates and controls padding. Accepts the following values:

            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
@@ -161,16 +161,16 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
            * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
              :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
              provided. This will truncate token by token, removing a token from the longest sequence in the pair if a
              pair of sequences (or a batch of pairs) is provided.
            * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided. This will only truncate
              the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
            * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
              the maximum acceptable input length for the model if that argument is not provided. This will only
              truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
            * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
              lengths greater than the model maximum admissible input size).
        max_length (:obj:`int`, `optional`):
            Controls the maximum length to use by one of the truncation/padding parameters.
@@ -265,15 +265,17 @@ class CustomDPRReaderTokenizerMixin:
    ) -> List[DPRSpanPrediction]:
        """
        Get the span predictions for the extractive Q&A model.

        Returns: `List` of `DPRReaderOutput` sorted by descending `(relevance_score, span_score)`. Each
        `DPRReaderOutput` is a `Tuple` with:

        - **span_score**: ``float`` that corresponds to the score given by the reader for this span compared to
          other spans in the same passage. It corresponds to the sum of the start and end logits of the span.
        - **relevance_score**: ``float`` that corresponds to the score of each passage to answer the question,
          compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
        - **doc_id**: ``int`` the id of the passage.
        - **start_index**: ``int`` the start index of the span (inclusive).
        - **end_index**: ``int`` the end index of the span (inclusive).

        Examples::
@@ -336,9 +338,8 @@ class CustomDPRReaderTokenizerMixin:
        top_spans: int,
    ) -> List[DPRSpanPrediction]:
        """
        Finds the best answer span for the extractive Q&A model for one passage. It returns the best spans by
        descending `span_score` order, keeping at most `top_spans` spans. Spans longer than `max_answer_length` are
        ignored.
        """
        scores = []
        for (start_index, start_score) in enumerate(start_logits):
......
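The scoring loop is collapsed in the hunk above; its shape can be sketched as follows (a simplified stand-in for the library code, using the argument names from the signature; the real method also filters degenerate spans)::

    scores = []
    for (start_index, start_score) in enumerate(start_logits):
        for (answer_length, end_score) in enumerate(end_logits[start_index : start_index + max_answer_length]):
            # span score = start logit + end logit
            scores.append(((start_index, start_index + answer_length), start_score + end_score))
    scores = sorted(scores, key=lambda pair: pair[1], reverse=True)
    best_spans = [span for span, _ in scores][:top_spans]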
@@ -132,20 +132,18 @@ DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "en
CUSTOM_DPR_READER_DOCSTRING = r"""
    Return a dictionary with the token ids of the input strings and other information to give to
    :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) into a
    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of
    size :obj:`(n_passages, sequence_length)` with the format:

        [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>

    Args:
        questions (:obj:`str` or :obj:`List[str]`):
            The questions to be encoded. You can specify one question for many passages. In this case, the question
            will be duplicated like :obj:`[questions] * n_passages`. Otherwise you have to specify as many questions as
            in :obj:`titles` or :obj:`texts`.
        titles (:obj:`str` or :obj:`List[str]`):
            The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
        texts (:obj:`str` or :obj:`List[str]`):
@@ -153,8 +151,8 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
            Activates and controls padding. Accepts the following values:

            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
@@ -164,16 +162,16 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
            * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
              :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
              provided. This will truncate token by token, removing a token from the longest sequence in the pair if a
              pair of sequences (or a batch of pairs) is provided.
            * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided. This will only truncate
              the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
            * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
              the maximum acceptable input length for the model if that argument is not provided. This will only
              truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
            * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
              lengths greater than the model maximum admissible input size).
        max_length (:obj:`int`, `optional`):
            Controls the maximum length to use by one of the truncation/padding parameters.
@@ -268,15 +266,17 @@ class CustomDPRReaderTokenizerMixin:
    ) -> List[DPRSpanPrediction]:
        """
        Get the span predictions for the extractive Q&A model.

        Returns: `List` of `DPRReaderOutput` sorted by descending `(relevance_score, span_score)`. Each
        `DPRReaderOutput` is a `Tuple` with:

        - **span_score**: ``float`` that corresponds to the score given by the reader for this span compared to
          other spans in the same passage. It corresponds to the sum of the start and end logits of the span.
        - **relevance_score**: ``float`` that corresponds to the score of each passage to answer the question,
          compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
        - **doc_id**: ``int`` the id of the passage.
        - **start_index**: ``int`` the start index of the span (inclusive).
        - **end_index**: ``int`` the end index of the span (inclusive).

        Examples::
@@ -339,9 +339,8 @@ class CustomDPRReaderTokenizerMixin:
        top_spans: int,
    ) -> List[DPRSpanPrediction]:
        """
        Finds the best answer span for the extractive Q&A model for one passage. It returns the best spans by
        descending `span_score` order, keeping at most `top_spans` spans. Spans longer than `max_answer_length` are
        ignored.
        """
        scores = []
        for (start_index, start_score) in enumerate(start_logits):
......
@@ -115,11 +115,14 @@ class FlaubertTokenizer(XLMTokenizer):
        Tokenize a string given language code using Moses.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
              (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
......
@@ -56,8 +56,8 @@ PRETRAINED_INIT_CONFIGURATION = {
def get_pairs(word):
    """
    Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
@@ -164,12 +164,12 @@ class FSMTTokenizer(PreTrainedTokenizer):
            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning of
                sequence. The token used is the :obj:`cls_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
@@ -332,12 +332,16 @@ class FSMTTokenizer(PreTrainedTokenizer):
        Tokenize a string given language code using Moses.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported
              languages. However, we don't enforce it.
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
              (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
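To illustrate the Moses step named above (a sketch using the sacremoses package from the docstring; the exact split is up to the Moses rules)::

    from sacremoses import MosesTokenizer  # pip install sacremoses

    moses = MosesTokenizer(lang="en")
    moses.tokenize("Hello, world!")
    # ['Hello', ',', 'world', '!']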
@@ -382,9 +386,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A FAIRSEQ Transformer sequence has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s> B </s>``
@@ -445,8 +448,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ::
@@ -465,8 +468,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).

        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A
        FAIRSEQ_TRANSFORMER sequence pair mask has the following format:
        """
        sep = [self.sep_token_id]
......
@@ -110,8 +110,8 @@ class FunnelTokenizer(BertTokenizer):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
        Transformer sequence pair mask has the following format:

        ::
......
@@ -126,8 +126,8 @@ class FunnelTokenizerFast(BertTokenizerFast):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
        Transformer sequence pair mask has the following format:

        ::
......
@@ -63,14 +63,13 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your
    vocab if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K
    for decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want
    lookup tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
@@ -87,7 +86,8 @@ def bytes_to_unicode():
def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
@@ -120,7 +120,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
    .. note::

        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
        one).

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.
......
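The note above matters because GPT-2's BPE is whitespace-sensitive: the same word maps to a different token depending on whether a space precedes it. A quick illustration with the stock ``gpt2`` vocabulary (outputs shown as expected; worth verifying locally)::

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.tokenize("Hello world")   # ['Hello', 'Ġworld']   ('Ġ' marks a leading space)
    tokenizer.tokenize(" Hello world")  # ['ĠHello', 'Ġworld']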
@@ -40,13 +40,13 @@ class HerbertTokenizer(XLMTokenizer):
    Peculiarities:

    - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
      punctuation character will be treated separately.
    - Such pretokenized input is BPE subtokenized

    This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should
    refer to the superclass for more information regarding methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES
......
@@ -39,8 +39,8 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
    Peculiarities:

    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
      a punctuation character will be treated separately.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.
@@ -77,9 +77,8 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A HerBERT sequence, like a BERT sequence, has the following format:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s> B </s>``
@@ -135,8 +134,8 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A HerBERT, like
        BERT, sequence pair mask has the following format:

        ::
......
@@ -50,10 +50,10 @@ PRETRAINED_INIT_CONFIGURATION = {
class LayoutLMTokenizerFast(BertTokenizerFast):
    r"""
    Constructs a "Fast" LayoutLMTokenizer.

    :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
    end-to-end tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
    parameters.
......
@@ -45,8 +45,8 @@ class LongformerTokenizer(RobertaTokenizer):
    r"""
    Construct a Longformer tokenizer.

    :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the
    superclass for usage examples and documentation concerning parameters.
    """

    # merges and vocab same as Roberta
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
......
...@@ -63,16 +63,16 @@ class MBartTokenizer(XLMRobertaTokenizer): ...@@ -63,16 +63,16 @@ class MBartTokenizer(XLMRobertaTokenizer):
:class:`~transformers.MBartTokenizer` is a subclass of :class:`~transformers.XLMRobertaTokenizer` and adds a new :class:`~transformers.MBartTokenizer` is a subclass of :class:`~transformers.XLMRobertaTokenizer` and adds a new
:meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch`. :meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch`.
Refer to superclass :class:`~transformers.XLMRobertaTokenizer` for usage examples and documentation concerning Refer to superclass :class:`~transformers.XLMRobertaTokenizer` for usage examples and documentation concerning the
the initialization parameters and other methods. initialization parameters and other methods.
.. warning:: .. warning::
``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work
properly. properly.
The tokenization method is ``<tokens> <eos> <language code>`` for source language documents, and The tokenization method is ``<tokens> <eos> <language code>`` for source language documents, and ``<language code>
``<language code> <tokens> <eos>`` for target language documents. <tokens> <eos>`` for target language documents.
Examples:: Examples::
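The example block itself is elided by the diff; what follows is a plausible usage sketch rather than the original example, with the checkpoint name and batch keys as assumptions::

    from transformers import MBartTokenizer

    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=["UN Chief Says There Is No Military Solution in Syria"],
        src_lang="en_XX",
        tgt_texts=["Şeful ONU declară că nu există o soluţie militară în Siria"],
        tgt_lang="ro_RO",
    )
    # Source ids end with <eos> followed by the en_XX code; encoded targets
    # start with the ro_RO code (the exact batch key has varied by release).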
...@@ -149,15 +149,14 @@ class MBartTokenizer(XLMRobertaTokenizer): ...@@ -149,15 +149,14 @@ class MBartTokenizer(XLMRobertaTokenizer):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
An MBART sequence has the following format, where ``X`` represents the sequence:
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]`` - ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]``
BOS is never used. BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
Pairs of sequences are not the expected use case, but they will be handled without a separator. separator.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
......
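A compact sketch of the encoder-side format described above; ``eos_id`` and ``src_lang_code_id`` are illustrative placeholders, not real vocabulary ids::

    def build_inputs_with_special_tokens(token_ids_0, token_ids_1=None,
                                         eos_id=2, src_lang_code_id=250004):
        # Encoder input: X [eos, src_lang_code]; BOS is never added.
        suffix = [eos_id, src_lang_code_id]
        if token_ids_1 is None:
            return token_ids_0 + suffix
        # Pairs are not the expected use case: joined without a separator.
        return token_ids_0 + token_ids_1 + suffix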
...@@ -79,8 +79,8 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast): ...@@ -79,8 +79,8 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work
properly. properly.
The tokenization method is ``<tokens> <eos> <language code>`` for source language documents, and The tokenization method is ``<tokens> <eos> <language code>`` for source language documents, and ``<language code>
``<language code> <tokens> <eos>`` for target language documents. <tokens> <eos>`` for target language documents.
Examples:: Examples::
...@@ -145,16 +145,16 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast): ...@@ -145,16 +145,16 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]: ) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
by concatenating and adding special tokens. The special tokens depend on calling set_lang. adding special tokens. The special tokens depend on calling set_lang.
An MBART sequence has the following format, where ``X`` represents the sequence: An MBART sequence has the following format, where ``X`` represents the sequence:
- ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
- ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]`` - ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]``
BOS is never used. BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
Pairs of sequences are not the expected use case, but they will be handled without a separator. separator.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
......
...@@ -44,8 +44,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { ...@@ -44,8 +44,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
def get_pairs(word): def get_pairs(word):
""" """
Return set of symbol pairs in a word. Return set of symbol pairs in a word. A word is represented as a tuple of symbols (symbols being variable-length
A word is represented as a tuple of symbols (symbols being variable-length strings). strings).
""" """
pairs = set() pairs = set()
prev_char = word[0] prev_char = word[0]
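The loop body is truncated by the diff; a complete version consistent with the visible lines::

    def get_pairs(word):
        """Return the set of adjacent symbol pairs in a word (a tuple of symbols)."""
        pairs = set()
        prev_char = word[0]
        for char in word[1:]:
            pairs.add((prev_char, char))
            prev_char = char
        return pairs

    # get_pairs(("l", "o", "w", "</w>")) == {("l", "o"), ("o", "w"), ("w", "</w>")}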
...@@ -57,8 +57,7 @@ def get_pairs(word): ...@@ -57,8 +57,7 @@ def get_pairs(word):
def text_standardize(text): def text_standardize(text):
""" """
Fixes some issues the spaCy tokenizer had on books corpus; Fixes some issues the spaCy tokenizer had on books corpus; also does some whitespace standardization.
also does some whitespace standardization.
""" """
text = text.replace("—", "-") text = text.replace("—", "-")
text = text.replace("–", "-") text = text.replace("–", "-")
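Only the first dash replacements are visible in the hunk; the sketch below is one plausible completion of the whitespace standardization, not necessarily the shipped code::

    import re

    def text_standardize(text):
        # Normalize dash variants and ellipses.
        text = text.replace("—", "-")
        text = text.replace("–", "-")
        text = text.replace("―", "-")
        text = text.replace("…", "...")
        # Pad punctuation runs with spaces, then collapse whitespace.
        text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
        text = re.sub(r"\s*\n\s*", " \n ", text)
        text = re.sub(r"[^\S\n]+", " ", text)
        return text.strip()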
...@@ -79,8 +78,8 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): ...@@ -79,8 +78,8 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
- uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, falling back to BERT's - uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, falling back to BERT's
:obj:`BasicTokenizer` if not. :obj:`BasicTokenizer` if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
methods. Users should refer to this superclass for more information regarding those methods. Users should refer to this superclass for more information regarding those methods.
Args: Args:
vocab_file (:obj:`str`): vocab_file (:obj:`str`):
......
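The install-dependent fallback reads naturally as a try/except around the optional imports; a hedged sketch (the ``BasicTokenizer`` import path is an assumption, as it has moved between releases)::

    try:
        import ftfy
        from spacy.lang.en import English

        nlp = English().tokenizer  # spaCy tokenizer for pre-BPE splitting
        fix_text = ftfy.fix_text   # unicode repair applied before tokenizing
    except ImportError:
        # Assumed import path; older releases used transformers.tokenization_bert.
        from transformers.models.bert.tokenization_bert import BasicTokenizer

        nlp = BasicTokenizer(do_lower_case=True)
        fix_text = None  # signals that the BasicTokenizer fallback is active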
...@@ -39,8 +39,8 @@ class PegasusTokenizer(ReformerTokenizer): ...@@ -39,8 +39,8 @@ class PegasusTokenizer(ReformerTokenizer):
:class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.ReformerTokenizer` and adds a new :class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.ReformerTokenizer` and adds a new
:meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch`. :meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch`.
Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning the
the initialization parameters and other methods. initialization parameters and other methods.
""" """
offset = 103 # entries 2-104 are only used for pretraining offset = 103 # entries 2-104 are only used for pretraining
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
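A hedged sketch of how such an offset would shift the underlying sentencepiece ids so the reserved range stays free; this illustrates the idea, and is not the class's actual method::

    def convert_token_to_id(sp_model, token, offset=103):
        # Shift every sentencepiece id upward so ids 2-104 remain reserved
        # for the pretraining-only entries mentioned in the comment above.
        return sp_model.piece_to_id(token) + offset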
...@@ -104,15 +104,14 @@ class PegasusTokenizer(ReformerTokenizer): ...@@ -104,15 +104,14 @@ class PegasusTokenizer(ReformerTokenizer):
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
by concatenating and adding special tokens. and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence:
A Pegasus sequence has the following format, where ``X`` represents the sequence:
- single sequence: ``X </s>`` - single sequence: ``X </s>``
- pair of sequences: ``A B </s>`` (not intended use) - pair of sequences: ``A B </s>`` (not intended use)
BOS is never used. BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
Pairs of sequences are not the expected use case, but they will be handled without a separator. separator.
Args: Args:
token_ids_0 (:obj:`List[int]`): token_ids_0 (:obj:`List[int]`):
......
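A minimal sketch of the Pegasus format above; ``eos_id`` is an illustrative placeholder::

    def build_inputs_with_special_tokens(token_ids_0, token_ids_1=None, eos_id=1):
        # Single sequence: X </s>
        if token_ids_1 is None:
            return token_ids_0 + [eos_id]
        # Pairs (not the intended use) are joined without a separator: A B </s>
        return token_ids_0 + token_ids_1 + [eos_id]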