Unverified Commit 74f7906d authored by Ben Mann's avatar Ben Mann Committed by GitHub
Browse files

Fix #537

parent 2dee8631
......@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
""" Tokenize a string. """
bpe_tokens = []
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[ord(b)] for b in token)
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment