Commit 8b388827 authored by thomwolf
Browse files

fix #1920

parent d425a4d6
@@ -192,9 +192,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
         """
         split_tokens = []
-        text = text.split(' ')
+        words = re.findall(r'\S+\n?', text)
-        for token in text:
+        for token in words:
             split_tokens.extend([t for t in self.bpe(token).split(' ')])
         return split_tokens
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment