Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
f24a228a
Commit
f24a228a
authored
Dec 13, 2019
by
Lysandre
Browse files
Speed up tokenization process
parent
c8ed1c82
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
7 additions
and
5 deletions
+7
-5
transformers/data/processors/squad.py
transformers/data/processors/squad.py
+1
-1
transformers/tokenization_utils.py
transformers/tokenization_utils.py
+6
-4
No files found.
transformers/data/processors/squad.py
View file @
f24a228a
...
@@ -116,7 +116,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
...
@@ -116,7 +116,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
unique_id
=
1000000000
unique_id
=
1000000000
features
=
[]
features
=
[]
for
(
example_index
,
example
)
in
enumerate
(
tqdm
(
examples
)):
for
(
example_index
,
example
)
in
enumerate
(
tqdm
(
examples
,
desc
=
"Converting examples to features"
)):
if
is_training
and
not
example
.
is_impossible
:
if
is_training
and
not
example
.
is_impossible
:
# Get start and end position
# Get start and end position
start_position
=
example
.
start_position
start_position
=
example
.
start_position
...
...
transformers/tokenization_utils.py
View file @
f24a228a
...
@@ -637,9 +637,11 @@ class PreTrainedTokenizer(object):
...
@@ -637,9 +637,11 @@ class PreTrainedTokenizer(object):
text: The sequence to be encoded.
text: The sequence to be encoded.
**kwargs: passed to the child `self.tokenize()` method
**kwargs: passed to the child `self.tokenize()` method
"""
"""
all_special_tokens
=
self
.
all_special_tokens
def
lowercase_text
(
t
):
def
lowercase_text
(
t
):
# convert non-special tokens to lowercase
# convert non-special tokens to lowercase
escaped_special_toks
=
[
re
.
escape
(
s_tok
)
for
s_tok
in
self
.
all_special_tokens
]
escaped_special_toks
=
[
re
.
escape
(
s_tok
)
for
s_tok
in
all_special_tokens
]
pattern
=
r
'(^'
+
r
'|'
.
join
(
escaped_special_toks
)
+
r
')|'
+
\
pattern
=
r
'(^'
+
r
'|'
.
join
(
escaped_special_toks
)
+
r
')|'
+
\
r
'(.+?)'
r
'(.+?)'
return
re
.
sub
(
return
re
.
sub
(
...
@@ -680,17 +682,17 @@ class PreTrainedTokenizer(object):
...
@@ -680,17 +682,17 @@ class PreTrainedTokenizer(object):
tokenized_text
=
[]
tokenized_text
=
[]
for
sub_text
in
text_list
:
for
sub_text
in
text_list
:
if
sub_text
not
in
self
.
added_tokens_encoder
\
if
sub_text
not
in
self
.
added_tokens_encoder
\
and
sub_text
not
in
self
.
all_special_tokens
:
and
sub_text
not
in
all_special_tokens
:
tokenized_text
+=
split_on_token
(
tok
,
sub_text
)
tokenized_text
+=
split_on_token
(
tok
,
sub_text
)
else
:
else
:
tokenized_text
+=
[
sub_text
]
tokenized_text
+=
[
sub_text
]
text_list
=
tokenized_text
text_list
=
tokenized_text
return
list
(
itertools
.
chain
.
from_iterable
((
self
.
_tokenize
(
token
,
**
kwargs
)
if
token
not
\
return
list
(
itertools
.
chain
.
from_iterable
((
self
.
_tokenize
(
token
,
**
kwargs
)
if
token
not
\
in
self
.
added_tokens_encoder
and
token
not
in
self
.
all_special_tokens
\
in
self
.
added_tokens_encoder
and
token
not
in
all_special_tokens
\
else
[
token
]
for
token
in
tokenized_text
)))
else
[
token
]
for
token
in
tokenized_text
)))
added_tokens
=
list
(
self
.
added_tokens_encoder
.
keys
())
+
self
.
all_special_tokens
added_tokens
=
list
(
self
.
added_tokens_encoder
.
keys
())
+
all_special_tokens
tokenized_text
=
split_on_tokens
(
added_tokens
,
text
)
tokenized_text
=
split_on_tokens
(
added_tokens
,
text
)
return
tokenized_text
return
tokenized_text
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment