Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
45fc8c79
Unverified
Commit
45fc8c79
authored
Apr 09, 2021
by
Sylvain Gugger
Committed by
GitHub
Apr 09, 2021
Browse files
Make `get_special_tokens_mask` consider all tokens (#11163)
parent
60607465
Changes
36
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
48 additions
and
140 deletions
+48
-140
src/transformers/models/mpnet/tokenization_mpnet.py
src/transformers/models/mpnet/tokenization_mpnet.py
+3
-6
src/transformers/models/phobert/tokenization_phobert.py
src/transformers/models/phobert/tokenization_phobert.py
+3
-6
src/transformers/models/prophetnet/tokenization_prophetnet.py
...transformers/models/prophetnet/tokenization_prophetnet.py
+3
-6
src/transformers/models/roberta/tokenization_roberta.py
src/transformers/models/roberta/tokenization_roberta.py
+3
-6
src/transformers/models/speech_to_text/tokenization_speech_to_text.py
...mers/models/speech_to_text/tokenization_speech_to_text.py
+4
-6
src/transformers/models/t5/tokenization_t5.py
src/transformers/models/t5/tokenization_t5.py
+4
-6
src/transformers/models/tapas/tokenization_tapas.py
src/transformers/models/tapas/tokenization_tapas.py
+3
-6
src/transformers/models/xlm/tokenization_xlm.py
src/transformers/models/xlm/tokenization_xlm.py
+2
-10
src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
...mers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
+3
-6
src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
...ansformers/models/xlm_roberta/tokenization_xlm_roberta.py
+3
-6
src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
...rmers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+0
-31
src/transformers/models/xlnet/tokenization_xlnet.py
src/transformers/models/xlnet/tokenization_xlnet.py
+3
-6
src/transformers/models/xlnet/tokenization_xlnet_fast.py
src/transformers/models/xlnet/tokenization_xlnet_fast.py
+0
-31
src/transformers/tokenization_utils.py
src/transformers/tokenization_utils.py
+10
-0
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
...me}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+3
-6
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst
...tter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst
+1
-2
No files found.
src/transformers/models/mpnet/tokenization_mpnet.py
View file @
45fc8c79
...
...
@@ -266,12 +266,9 @@ class MPNetTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/phobert/tokenization_phobert.py
View file @
45fc8c79
...
...
@@ -201,12 +201,9 @@ class PhobertTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/prophetnet/tokenization_prophetnet.py
View file @
45fc8c79
...
...
@@ -203,12 +203,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
None
:
return
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/roberta/tokenization_roberta.py
View file @
45fc8c79
...
...
@@ -215,12 +215,9 @@ class RobertaTokenizer(GPT2Tokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/speech_to_text/tokenization_speech_to_text.py
View file @
45fc8c79
...
...
@@ -199,12 +199,10 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
bos_token_id
,
self
.
eos_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
if
token_ids_1
is
None
:
...
...
src/transformers/models/t5/tokenization_t5.py
View file @
45fc8c79
...
...
@@ -157,12 +157,10 @@ class T5Tokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
# normal case: some special tokens
if
token_ids_1
is
None
:
return
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/tapas/tokenization_tapas.py
View file @
45fc8c79
...
...
@@ -510,12 +510,9 @@ class TapasTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
not
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
...
...
src/transformers/models/xlm/tokenization_xlm.py
View file @
45fc8c79
...
...
@@ -906,16 +906,8 @@ class XLMTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
,
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
not
None
:
...
...
src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
View file @
45fc8c79
...
...
@@ -200,12 +200,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
None
:
return
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
View file @
45fc8c79
...
...
@@ -206,12 +206,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
View file @
45fc8c79
...
...
@@ -172,37 +172,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
sep
=
[
self
.
sep_token_id
]
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer ``prepare_for_model`` method.

    Args:
        token_ids_0 (:obj:`List[int]`):
            List of IDs.
        token_ids_1 (:obj:`List[int]`, `optional`):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        if token_ids_1 is not None:
            raise ValueError(
                "You should not supply a second sequence if the provided sequence of "
                "ids is already formatted with special tokens for the model."
            )
        # Already-formatted input: flag only the positions holding cls/sep ids.
        special_ids = (self.sep_token_id, self.cls_token_id)
        return [1 if token in special_ids else 0 for token in token_ids_0]

    # Layout: <s> seq0 </s> for one sequence, <s> seq0 </s></s> seq1 </s> for a pair.
    mask = [1] + [0] * len(token_ids_0) + [1]
    if token_ids_1 is None:
        return mask
    return mask + [1] + [0] * len(token_ids_1) + [1]
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
...
...
src/transformers/models/xlnet/tokenization_xlnet.py
View file @
45fc8c79
...
...
@@ -270,12 +270,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
not
None
:
return
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
,
1
]
...
...
src/transformers/models/xlnet/tokenization_xlnet_fast.py
View file @
45fc8c79
...
...
@@ -190,37 +190,6 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
return
token_ids_0
+
sep
+
cls
return
token_ids_0
+
sep
+
token_ids_1
+
sep
+
cls
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer ``prepare_for_model`` method.

    Args:
        token_ids_0 (:obj:`List[int]`):
            List of IDs.
        token_ids_1 (:obj:`List[int]`, `optional`):
            Optional second list of IDs for sequence pairs.
        already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the token list is already formatted with special tokens for the model.

    Returns:
        :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    if already_has_special_tokens:
        if token_ids_1 is not None:
            raise ValueError(
                "You should not supply a second sequence if the provided sequence of "
                "ids is already formatted with special tokens for the model."
            )
        # Already-formatted input: flag only the positions holding cls/sep ids.
        special_ids = (self.sep_token_id, self.cls_token_id)
        return [1 if token in special_ids else 0 for token in token_ids_0]

    # Layout (specials at the END, per this tokenizer's build_inputs_with_special_tokens):
    # seq0 sep cls for one sequence, seq0 sep seq1 sep cls for a pair.
    if token_ids_1 is None:
        return [0] * len(token_ids_0) + [1, 1]
    return [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1, 1]
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
...
...
src/transformers/tokenization_utils.py
View file @
45fc8c79
...
...
@@ -670,6 +670,16 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
[
0
]
*
((
len
(
token_ids_1
)
if
token_ids_1
else
0
)
+
len
(
token_ids_0
))
@
overload
...
...
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
View file @
45fc8c79
...
...
@@ -225,12 +225,9 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst
View file @
45fc8c79
...
...
@@ -46,8 +46,7 @@ Tips:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
:members:
{% if "PyTorch" in cookiecutter.generate_tensorflow_and_pytorch -%}
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment