Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
982339d8
Commit
982339d8
authored
Nov 23, 2018
by
thomwolf
Browse files
fixing unicode error
parent
60e01ac4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
2 additions
and
15 deletions
+2
-15
pytorch_pretrained_bert/tokenization.py
pytorch_pretrained_bert/tokenization.py
+2
-15
No files found.
pytorch_pretrained_bert/tokenization.py
View file @
982339d8
...
@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
...
@@ -38,16 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-chinese'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt"
,
'bert-base-chinese'
:
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt"
,
}
}
def
convert_to_unicode
(
text
):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if
isinstance
(
text
,
str
):
return
text
elif
isinstance
(
text
,
bytes
):
return
text
.
decode
(
"utf-8"
,
"ignore"
)
else
:
raise
ValueError
(
"Unsupported string type: %s"
%
(
type
(
text
)))
def
printable_text
(
text
):
def
printable_text
(
text
):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
"""Returns text encoded in a way suitable for print or `tf.logging`."""
...
@@ -65,9 +55,9 @@ def load_vocab(vocab_file):
...
@@ -65,9 +55,9 @@ def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
"""Loads a vocabulary file into a dictionary."""
vocab
=
collections
.
OrderedDict
()
vocab
=
collections
.
OrderedDict
()
index
=
0
index
=
0
with
open
(
vocab_file
,
"r"
,
encoding
=
"utf8"
)
as
reader
:
with
open
(
vocab_file
,
"r"
,
encoding
=
"utf
-
8"
)
as
reader
:
while
True
:
while
True
:
token
=
convert_to_unicode
(
reader
.
readline
()
)
token
=
reader
.
readline
()
if
not
token
:
if
not
token
:
break
break
token
=
token
.
strip
()
token
=
token
.
strip
()
...
@@ -164,7 +154,6 @@ class BasicTokenizer(object):
...
@@ -164,7 +154,6 @@ class BasicTokenizer(object):
def
tokenize
(
self
,
text
):
def
tokenize
(
self
,
text
):
"""Tokenizes a piece of text."""
"""Tokenizes a piece of text."""
text
=
convert_to_unicode
(
text
)
text
=
self
.
_clean_text
(
text
)
text
=
self
.
_clean_text
(
text
)
# This was added on November 1st, 2018 for the multilingual and Chinese
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# models. This is also applied to the English models now, but it doesn't
...
@@ -290,8 +279,6 @@ class WordpieceTokenizer(object):
...
@@ -290,8 +279,6 @@ class WordpieceTokenizer(object):
A list of wordpiece tokens.
A list of wordpiece tokens.
"""
"""
text
=
convert_to_unicode
(
text
)
output_tokens
=
[]
output_tokens
=
[]
for
token
in
whitespace_tokenize
(
text
):
for
token
in
whitespace_tokenize
(
text
):
chars
=
list
(
token
)
chars
=
list
(
token
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment