Commit 7a99e4b1 authored by thomwolf's avatar thomwolf
Browse files

fix #1196 and fix #1285

parent 7c9f8f93
...@@ -173,9 +173,15 @@ class GPT2Tokenizer(PreTrainedTokenizer): ...@@ -173,9 +173,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.cache[token] = word self.cache[token] = word
return word return word
def _tokenize(self, text): def _tokenize(self, text, add_prefix_space=False):
""" Tokenize a string. """ """ Tokenize a string.
text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with. Args:
- add_prefix_space (boolean, default False):
Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
"""
if add_prefix_space:
text = ' ' + text
bpe_tokens = [] bpe_tokens = []
for token in re.findall(self.pat, text): for token in re.findall(self.pat, text):
if sys.version_info[0] == 2: if sys.version_info[0] == 2:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment