"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "83eda6435e7c842e55b42a529e9bf367bf2a126b"
Unverified commit e8952017, authored by Thomas Wolf and committed by GitHub
Browse files

Merge pull request #564 from 8enmann/patch-2

Fix #537
parents 2dee8631 74f7906d
...@@ -221,7 +221,10 @@ class GPT2Tokenizer(object): ...@@ -221,7 +221,10 @@ class GPT2Tokenizer(object):
""" Tokenize a string. """ """ Tokenize a string. """
bpe_tokens = [] bpe_tokens = []
for token in re.findall(self.pat, text): for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[ord(b)] for b in token) if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
return bpe_tokens return bpe_tokens
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment