chenpangpang / transformers / Commits

Commit a9ab1517, authored Jul 15, 2019 by thomwolf

fix #328

Parent: f7cd7392

Showing 1 changed file with 7 additions and 10 deletions:

pytorch_transformers/tokenization_bert.py (+7 -10)
@@ -66,15 +66,11 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    index = 0
     with open(vocab_file, "r", encoding="utf-8") as reader:
-        while True:
-            token = reader.readline()
-            if not token:
-                break
-            token = token.strip()
-            vocab[token] = index
-            index += 1
+        tokens = reader.read().splitlines()
+    for index, token in enumerate(tokens):
+        vocab[token] = index
+        index += 1
     return vocab
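Not part of the commit: a standalone sketch of what this hunk changes about vocab loading. The old readline()/strip() loop trims whitespace from every vocab line and collapses whitespace-only lines to the empty string, while the new read().splitlines() path keeps each line verbatim (minus the newline). The toy vocab file below is made up for illustration, and the stray index += 1 in the new code is dropped here since enumerate() already supplies the index.

# Standalone sketch (not from the commit): compare the two loaders on a toy vocab file.
import collections
import tempfile


def load_vocab_old(vocab_file):
    """Pre-commit loader: readline() loop plus token.strip()."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()  # also removes leading/trailing whitespace from the token
            vocab[token] = index
            index += 1
    return vocab


def load_vocab_new(vocab_file):
    """Post-commit loader: read().splitlines(), lines kept verbatim."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.read().splitlines()
    for index, token in enumerate(tokens):
        vocab[token] = index
    return vocab


if __name__ == "__main__":
    # Hypothetical vocab file: the last entry is a whitespace-only token.
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
        f.write("[PAD]\n[UNK]\nhello\n  \n")
        path = f.name

    print(load_vocab_old(path))  # whitespace-only line collapses to '' after strip()
    print(load_vocab_new(path))  # '  ' is kept as its own key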
@@ -213,7 +209,7 @@ class BasicTokenizer(object):
         self.do_lower_case = do_lower_case
         self.never_split = never_split
 
-    def tokenize(self, text, never_split=None):
+    def tokenize(self, text, never_split=None, tokenize_chinese_chars=True):
         """Tokenizes a piece of text."""
         never_split = self.never_split + (never_split if never_split is not None else [])
         text = self._clean_text(text)
@@ -223,7 +219,8 @@ class BasicTokenizer(object):
         # and generally don't have any Chinese data in them (there are Chinese
         # characters in the vocabulary because Wikipedia does have some Chinese
         # words in the English Wikipedia.).
-        text = self._tokenize_chinese_chars(text)
+        if tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
         orig_tokens = whitespace_tokenize(text)
         split_tokens = []
         for token in orig_tokens:
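Also not part of the commit: a usage sketch of the new tokenize_chinese_chars switch. It assumes BasicTokenizer can be constructed with only do_lower_case (its other constructor defaults are not visible on this page), and the example outputs are only rough expectations. Passing tokenize_chinese_chars=False skips the _tokenize_chinese_chars() pass, so CJK characters are no longer padded with spaces and split into single-character tokens.

# Usage sketch (not from the commit). Assumes a pytorch_transformers install
# that includes this change and that BasicTokenizer(do_lower_case=...) is a
# valid constructor call, as the diff context above suggests.
from pytorch_transformers.tokenization_bert import BasicTokenizer

tokenizer = BasicTokenizer(do_lower_case=False)

text = u"BERT 支持中文"

# Default behaviour: each CJK character becomes its own token.
print(tokenizer.tokenize(text))
# e.g. ['BERT', '支', '持', '中', '文']

# New switch added by this commit: leave CJK characters untouched, so the text
# is only split on whitespace and punctuation.
print(tokenizer.tokenize(text, tokenize_chinese_chars=False))
# e.g. ['BERT', '支持中文']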