Update RoBERTa and GPT-2 Tokenizer documentation (fix #1343)

ecfddc60 · LysandreJik · 93f0c5fc · ecfddc60 · ecfddc60
Commit ecfddc60 authored Sep 26, 2019 by LysandreJik
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 6 deletions

transformers/tokenization_gpt2.py transformers/tokenization_gpt2.py +4 -3

transformers/tokenization_roberta.py transformers/tokenization_roberta.py +4 -3

No files found.
--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -101,9 +101,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => will add a space is there isn't.
-          As a consequence, this tokenizer `encode` and `decode` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+        - Requires a space to start the input string => the encoding methods should be called with the
+          ``add_prefix_space`` flag set to ``True``.
+          Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
+          the absence of a space at the beginning of a string: ``tokenizer.decode(tokenizer.encode("Hello")) = " Hello"``
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

--- a/transformers/tokenization_roberta.py
+++ b/transformers/tokenization_roberta.py
@@ -66,9 +66,10 @@ class RobertaTokenizer(GPT2Tokenizer):
    """
    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => will add a space is there isn't.
-          As a consequence, this tokenizer `encode` and `decode` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
+        - Requires a space to start the input string => the encoding methods should be called with the
+          ``add_prefix_space`` flag set to ``True``.
+          Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
+          the absence of a space at the beginning of a string: ``tokenizer.decode(tokenizer.encode("Hello")) = " Hello"``
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP