chenpangpang / transformers

Commit 982339d8, authored Nov 23, 2018 by thomwolf

fixing unicode error

parent 60e01ac4
Showing 1 changed file with 2 additions and 15 deletions.
pytorch_pretrained_bert/tokenization.py @ 982339d8
@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
 }
 
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if isinstance(text, str):
-        return text
-    elif isinstance(text, bytes):
-        return text.decode("utf-8", "ignore")
-    else:
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-
-
 def printable_text(text):
     """Returns text encoded in a way suitable for print or `tf.logging`."""
@@ -65,9 +55,9 @@ def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
     index = 0
-    with open(vocab_file, "r", encoding="utf8") as reader:
+    with open(vocab_file, "r", encoding="utf-8") as reader:
         while True:
-            token = convert_to_unicode(reader.readline())
+            token = reader.readline()
             if not token:
                 break
             token = token.strip()
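
For reference, a sketch of load_vocab as it reads after this hunk; the diff context cuts off before the end of the loop, so the tail (mapping each token to its line index) is assumed rather than shown in the commit:

    import collections

    def load_vocab(vocab_file):
        """Loads a vocabulary file into a dictionary."""
        vocab = collections.OrderedDict()
        index = 0
        with open(vocab_file, "r", encoding="utf-8") as reader:
            while True:
                token = reader.readline()
                if not token:
                    break
                token = token.strip()
                vocab[token] = index  # assumed continuation: token -> line index
                index += 1
        return vocab

Note that Python treats "utf8" and "utf-8" as aliases for the same codec, so the first changed line is a spelling normalization; the functional change is the second one, since readline() on a text-mode file already returns str and convert_to_unicode added nothing.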
@@ -164,7 +154,6 @@ class BasicTokenizer(object):
     def tokenize(self, text):
         """Tokenizes a piece of text."""
-        text = convert_to_unicode(text)
         text = self._clean_text(text)
         # This was added on November 1st, 2018 for the multilingual and Chinese
         # models. This is also applied to the English models now, but it doesn't
@@ -290,8 +279,6 @@ class WordpieceTokenizer(object):
           A list of wordpiece tokens.
         """
-        text = convert_to_unicode(text)
-
         output_tokens = []
         for token in whitespace_tokenize(text):
             chars = list(token)
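
These last two hunks are the caller-side half of the same cleanup: BasicTokenizer.tokenize and WordpieceTokenizer.tokenize now assume str input instead of silently decoding bytes on entry. A usage sketch under that assumption (the constructor signature matches this version of the file; the sample sentence and expected tokens are illustrative):

    from pytorch_pretrained_bert.tokenization import BasicTokenizer

    tokenizer = BasicTokenizer(do_lower_case=True)

    # str input works as before:
    tokens = tokenizer.tokenize("Héllo, world!")  # e.g. ['hello', ',', 'world', '!']

    # bytes were previously decoded by convert_to_unicode; after this commit
    # the caller must decode explicitly before tokenizing:
    raw = "Héllo, world!".encode("utf-8")
    tokens = tokenizer.tokenize(raw.decode("utf-8"))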