Commit 982339d8 authored by thomwolf's avatar thomwolf
Browse files

Fix Unicode handling error

parent 60e01ac4
...@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = { ...@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
} }
def convert_to_unicode(text):
    """Return *text* as a ``str``, decoding UTF-8 ``bytes`` when needed.

    A ``str`` is returned unchanged. ``bytes`` are decoded as UTF-8 with
    undecodable byte sequences silently dropped (the ``"ignore"`` error
    handler). Any other type raises ``ValueError``.
    """
    if isinstance(text, bytes):
        # Assume UTF-8 input; drop malformed sequences rather than fail.
        return text.decode("utf-8", "ignore")
    if isinstance(text, str):
        return text
    raise ValueError("Unsupported string type: %s" % (type(text)))
def printable_text(text): def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`.""" """Returns text encoded in a way suitable for print or `tf.logging`."""
...@@ -65,9 +55,9 @@ def load_vocab(vocab_file): ...@@ -65,9 +55,9 @@ def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary.""" """Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict() vocab = collections.OrderedDict()
index = 0 index = 0
with open(vocab_file, "r", encoding="utf8") as reader: with open(vocab_file, "r", encoding="utf-8") as reader:
while True: while True:
token = convert_to_unicode(reader.readline()) token = reader.readline()
if not token: if not token:
break break
token = token.strip() token = token.strip()
...@@ -164,7 +154,6 @@ class BasicTokenizer(object): ...@@ -164,7 +154,6 @@ class BasicTokenizer(object):
def tokenize(self, text): def tokenize(self, text):
"""Tokenizes a piece of text.""" """Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text) text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese # This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't # models. This is also applied to the English models now, but it doesn't
...@@ -290,8 +279,6 @@ class WordpieceTokenizer(object): ...@@ -290,8 +279,6 @@ class WordpieceTokenizer(object):
A list of wordpiece tokens. A list of wordpiece tokens.
""" """
text = convert_to_unicode(text)
output_tokens = [] output_tokens = []
for token in whitespace_tokenize(text): for token in whitespace_tokenize(text):
chars = list(token) chars = list(token)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment