fix #1196 and fix #1285

7a99e4b1 · thomwolf · 7c9f8f93 · 7a99e4b1
Commit 7a99e4b1 authored Sep 26, 2019 by thomwolf
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 3 deletions

pytorch_transformers/tokenization_gpt2.py pytorch_transformers/tokenization_gpt2.py +9 -3

No files found.
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -173,9 +173,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
        self.cache[token] = word
        return word

-    def _tokenize(self, text):
-        """ Tokenize a string. """
-        text = ' ' + text  # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
+    def _tokenize(self, text, add_prefix_space=False):
+        """ Tokenize a string.
+            Args:
+                - add_prefix_space (boolean, default False):
+                    Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
+        """
+        if add_prefix_space:
+            text = ' ' + text
+
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2: