"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "15143fbad64015ed6778a61695828136f42b5a98"
Unverified commit 74f7906d, authored by Ben Mann, committed by GitHub
Browse files

Fix #537

parent 2dee8631
...@@ -221,7 +221,10 @@ class GPT2Tokenizer(object): ...@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
""" Tokenize a string. """ """ Tokenize a string. """
bpe_tokens = [] bpe_tokens = []
for token in re.findall(self.pat, text): for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token) token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens return bpe_tokens
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment