Commit 8b388827 authored by thomwolf
Browse files

fix #1920

parent d425a4d6
......@@ -192,9 +192,9 @@ class CTRLTokenizer(PreTrainedTokenizer):
"""
split_tokens = []
- text = text.split(' ')
+ words = re.findall(r'\S+\n?', text)
- for token in text:
+ for token in words:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
return split_tokens
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment