chenpangpang / transformers · Commits

Commit b262577d, authored Dec 25, 2019 by vitaliyradchenko
parent 83a23479

    add special tokens to unique_added_tokens_encoder

Showing 1 changed file with 4 additions and 1 deletion:
src/transformers/tokenization_utils.py (+4, -1)
@@ -469,6 +469,9 @@ class PreTrainedTokenizer(object):
         tokenizer.init_inputs = init_inputs
         tokenizer.init_kwargs = init_kwargs

+        # update unique_added_tokens_encoder with special tokens for correct tokenization
+        tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens))
+
         # Add supplementary tokens.
         if added_tokens_file is not None:
             with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
@@ -476,7 +479,7 @@ class PreTrainedTokenizer(object):
             added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
             tokenizer.added_tokens_encoder.update(added_tok_encoder)
             tokenizer.added_tokens_decoder.update(added_tok_decoder)
-            tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys()).union(set(tokenizer.all_special_tokens)))
+            tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys()))

         return tokenizer
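For context: before this commit, special tokens only entered unique_added_tokens_encoder inside the `if added_tokens_file is not None:` branch, so for checkpoints shipping no added_tokens.json the special tokens were never registered and tokenize() could split them like ordinary text. The change registers them unconditionally and drops the now-redundant union in the later update. A minimal sketch of the resulting behavior, assuming a transformers 2.x-era tokenizer (where unique_added_tokens_encoder is still a public set attribute); the model name is just an illustration, not taken from the commit:

# Hedged sketch, not part of the commit: assumes transformers ~2.3.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# After this commit, every special token is registered even when no
# added_tokens.json accompanied the checkpoint ...
assert set(tokenizer.all_special_tokens) <= tokenizer.unique_added_tokens_encoder

# ... so tokenize() treats special tokens atomically instead of running
# them through the WordPiece model.
print(tokenizer.tokenize("[CLS] hello world [SEP]"))
# expected: ['[CLS]', 'hello', 'world', '[SEP]']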