update GPT2 docstring

fd10d79b · thomwolf · abe734ca · fd10d79b
Commit fd10d79b authored Aug 30, 2019 by thomwolf
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 1 deletion

pytorch_transformers/tokenization_gpt2.py pytorch_transformers/tokenization_gpt2.py +4 -1

No files found.
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -99,7 +99,10 @@ def get_pairs(word):
 class GPT2Tokenizer(PreTrainedTokenizer):
    """
    GPT-2 BPE tokenizer. Peculiarities:
-        - Byte-level BPE
+        - Byte-level Byte-Pair-Encoding
+        - Requires a space to start the input string => will add a space if there isn't one.
+          As a consequence, this tokenizer's `encode` and `decode` methods will not conserve
+          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP