"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "3b8b0e01bb89152a0c3102f21e5764cafddbe0b0"
Commit 982339d8 authored by thomwolf

fixing unicode error

parent 60e01ac4
@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
 }
 
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if isinstance(text, str):
-        return text
-    elif isinstance(text, bytes):
-        return text.decode("utf-8", "ignore")
-    else:
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-
 def printable_text(text):
     """Returns text encoded in a way suitable for print or `tf.logging`."""
@@ -65,9 +55,9 @@ def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
     index = 0
-    with open(vocab_file, "r", encoding="utf8") as reader:
+    with open(vocab_file, "r", encoding="utf-8") as reader:
         while True:
-            token = convert_to_unicode(reader.readline())
+            token = reader.readline()
             if not token:
                 break
             token = token.strip()
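
Two things make this hunk safe: files opened in text mode with an explicit encoding already return `str` from `readline()`, so the conversion call was redundant, and `"utf8"` is just an alias for the `"utf-8"` codec, so that rename is purely cosmetic. A minimal sketch, assuming a throwaway `tiny_vocab.txt` file:

    import codecs
    import collections

    # "utf8" and "utf-8" resolve to the same codec; the rename is cosmetic.
    assert codecs.lookup("utf8").name == codecs.lookup("utf-8").name

    with open("tiny_vocab.txt", "w", encoding="utf-8") as writer:
        writer.write("[PAD]\n[UNK]\n")

    vocab = collections.OrderedDict()
    index = 0
    with open("tiny_vocab.txt", "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()  # text mode: already str, no decoding needed
            if not token:
                break
            vocab[token.strip()] = index
            index += 1

    assert list(vocab) == ["[PAD]", "[UNK]"]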
@@ -164,7 +154,6 @@ class BasicTokenizer(object):
     def tokenize(self, text):
         """Tokenizes a piece of text."""
-        text = convert_to_unicode(text)
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
@@ -290,8 +279,6 @@ class WordpieceTokenizer(object):
         A list of wordpiece tokens.
         """
-        text = convert_to_unicode(text)
-
         output_tokens = []
         for token in whitespace_tokenize(text):
             chars = list(token)
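
One behavioral consequence of dropping the conversion inside `BasicTokenizer.tokenize` and `WordpieceTokenizer.tokenize`: callers are now expected to pass `str`, and `bytes` input is no longer silently decoded. A hedged sketch of a caller-side guard (the wrapper below is hypothetical, not part of the repo):

    def tokenize_str_only(tokenizer, text):
        # Decode at the boundary instead of relying on the removed
        # convert_to_unicode fallback inside the tokenizers.
        if isinstance(text, bytes):
            text = text.decode("utf-8", "ignore")
        return tokenizer.tokenize(text)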