chenpangpang / transformers · Commits

Commit 45fc8c79 (unverified)
Make `get_special_tokens_mask` consider all tokens (#11163)

Authored Apr 09, 2021 by Sylvain Gugger; committed by GitHub on Apr 09, 2021
Parent: 60607465
Changes: 36; showing 20 changed files on this page, with 42 additions and 245 deletions (+42 / -245)
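Every diff below makes the same change. Before this commit, each tokenizer's `get_special_tokens_mask` override handled `already_has_special_tokens=True` by checking tokens only against `sep_token_id` and `cls_token_id`, so other special tokens (mask, padding, language codes) were reported as ordinary tokens. The commit either routes that branch to `super().get_special_tokens_mask(...)` or deletes the override outright, so the shared base-class logic, which checks every registered special token id, takes over. A minimal before/after sketch; `FakeTokenizer` and all ids below are invented for illustration and are not part of the transformers API:

from dataclasses import dataclass, field
from typing import List

@dataclass
class FakeTokenizer:
    cls_token_id: int = 2
    sep_token_id: int = 3
    # pad, unk, cls, sep, mask -- ids made up for the sketch
    all_special_ids: List[int] = field(default_factory=lambda: [0, 1, 2, 3, 4])

def mask_before(tok: FakeTokenizer, ids: List[int]) -> List[int]:
    # Old per-tokenizer override: only [CLS]/[SEP] counted as special.
    return [1 if t in [tok.sep_token_id, tok.cls_token_id] else 0 for t in ids]

def mask_after(tok: FakeTokenizer, ids: List[int]) -> List[int]:
    # New shared logic: any registered special token counts.
    return [1 if t in tok.all_special_ids else 0 for t in ids]

tok = FakeTokenizer()
ids = [2, 17, 4, 25, 3]                           # "[CLS] x [MASK] y [SEP]"
assert mask_before(tok, ids) == [1, 0, 0, 0, 1]   # [MASK] (id 4) missed
assert mask_after(tok, ids) == [1, 0, 1, 0, 1]    # [MASK] caught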
Files changed on this page:

docs/source/model_doc/convbert.rst                                 +1   -2
docs/source/model_doc/led.rst                                      +1   -2
src/transformers/models/albert/tokenization_albert.py              +3   -6
src/transformers/models/albert/tokenization_albert_fast.py         +0   -31
src/transformers/models/barthez/tokenization_barthez.py            +3   -6
src/transformers/models/barthez/tokenization_barthez_fast.py       +0   -30
src/transformers/models/bert/tokenization_bert.py                  +3   -6
src/transformers/models/bertweet/tokenization_bertweet.py          +3   -6
src/transformers/models/big_bird/tokenization_big_bird.py          +3   -6
src/transformers/models/camembert/tokenization_camembert.py        +3   -6
src/transformers/models/camembert/tokenization_camembert_fast.py   +0   -30
src/transformers/models/deberta/tokenization_deberta.py            +3   -6
src/transformers/models/deberta_v2/tokenization_deberta_v2.py      +2   -10
src/transformers/models/fsmt/tokenization_fsmt.py                  +2   -10
src/transformers/models/herbert/tokenization_herbert_fast.py       +3   -6
src/transformers/models/m2m_100/tokenization_m2m_100.py            +4   -6
src/transformers/models/mbart/tokenization_mbart.py                +4   -6
src/transformers/models/mbart/tokenization_mbart50.py              +4   -6
src/transformers/models/mbart/tokenization_mbart50_fast.py         +0   -32
src/transformers/models/mbart/tokenization_mbart_fast.py           +0   -32
docs/source/model_doc/convbert.rst

@@ -56,8 +56,7 @@ ConvBertTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.ConvBertTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:

 ConvBertModel
docs/source/model_doc/led.rst

@@ -73,8 +73,7 @@ LEDTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LEDTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:

 LED specific outputs
src/transformers/models/albert/tokenization_albert.py

@@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is not None:
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
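In use, only the `already_has_special_tokens=True` branch changes; the hand-built layouts in the unchanged tail stay as they were. A hedged usage sketch of the updated slow tokenizer (`albert-base-v2` is the public checkpoint name; running this downloads the vocabulary):

from transformers import AlbertTokenizer

tok = AlbertTokenizer.from_pretrained("albert-base-v2")
ids = tok("hello world")["input_ids"]  # already formatted with [CLS]/[SEP]
# Now delegates to the base class, so every special token is flagged:
print(tok.get_special_tokens_mask(ids, already_has_special_tokens=True))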
src/transformers/models/albert/tokenization_albert_fast.py

@@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Set to True if the token list is already formatted with special tokens for the model
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
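With the override deleted, `AlbertTokenizerFast` (and the other fast tokenizers below) falls back to the shared implementation in `tokenization_utils_base.py`, which this commit also updates but which is not among the files shown on this page. From memory, that fallback looks roughly like the sketch below (hedged, not verbatim library code):

def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    # Fast tokenizers only support the already-formatted path here.
    assert already_has_special_tokens and token_ids_1 is None
    all_special_ids = self.all_special_ids  # every registered special token id
    return [1 if token in all_special_ids else 0 for token in token_ids_0]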
src/transformers/models/barthez/tokenization_barthez.py

@@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
src/transformers/models/barthez/tokenization_barthez_fast.py

@@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
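The removed override also documents the sequence-pair layout BARThez (RoBERTa-style) models use: `<s> A </s></s> B </s>`, with two consecutive separators between segments, hence the `[1, 1]` in the middle of the mask. A toy check with invented lengths:

a, b = [11, 12, 13], [21, 22]
mask = [1] + [0] * len(a) + [1, 1] + [0] * len(b) + [1]
assert mask == [1, 0, 0, 0, 1, 1, 0, 0, 1]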
src/transformers/models/bert/tokenization_bert.py

@@ -290,12 +290,9 @@ class BertTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is not None:
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
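BERT, by contrast, uses a single separator between segments, `[CLS] A [SEP] B [SEP]`, which is what the unchanged tail of this hunk encodes. Toy check with invented lengths:

a, b = [7, 8], [9, 10, 11]
mask = [1] + [0] * len(a) + [1] + [0] * len(b) + [1]
assert mask == [1, 0, 0, 1, 0, 0, 0, 1]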
src/transformers/models/bertweet/tokenization_bertweet.py

@@ -220,12 +220,9 @@ class BertweetTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
src/transformers/models/big_bird/tokenization_big_bird.py

@@ -219,12 +219,9 @@ class BigBirdTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
src/transformers/models/camembert/tokenization_camembert.py

@@ -178,12 +178,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
src/transformers/models/camembert/tokenization_camembert_fast.py

@@ -162,36 +162,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
src/transformers/models/deberta/tokenization_deberta.py

@@ -174,12 +174,9 @@ class DebertaTokenizer(GPT2Tokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
src/transformers/models/deberta_v2/tokenization_deberta_v2.py

@@ -187,16 +187,8 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(
-                map(
-                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
-                    token_ids_0,
-                )
-            )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is not None:
src/transformers/models/fsmt/tokenization_fsmt.py

@@ -437,16 +437,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(
-                map(
-                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
-                    token_ids_0,
-                )
-            )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         # no bos used in fairseq
         if token_ids_1 is not None:
src/transformers/models/herbert/tokenization_herbert_fast.py

@@ -126,12 +126,9 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
src/transformers/models/m2m_100/tokenization_m2m_100.py

@@ -207,12 +207,10 @@ class M2M100Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
         prefix_ones = [1] * len(self.prefix_tokens)
         suffix_ones = [1] * len(self.suffix_tokens)
         if token_ids_1 is None:
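The multilingual tokenizers build their masks from `prefix_tokens`/`suffix_tokens` because they wrap sequences in language-code tokens rather than [CLS]/[SEP]. A toy illustration for an M2M100-style layout `lang_code X </s>` (both ids below are hypothetical):

prefix_tokens = [128022]  # hypothetical language-code id
suffix_tokens = [2]       # hypothetical </s> id
token_ids_0 = [17, 25, 99]

mask = [1] * len(prefix_tokens) + [0] * len(token_ids_0) + [1] * len(suffix_tokens)
assert mask == [1, 0, 0, 0, 1]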
src/transformers/models/mbart/tokenization_mbart.py

@@ -149,12 +149,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
         prefix_ones = [1] * len(self.prefix_tokens)
         suffix_ones = [1] * len(self.suffix_tokens)
         if token_ids_1 is None:
src/transformers/models/mbart/tokenization_mbart50.py

@@ -241,12 +241,10 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
         prefix_ones = [1] * len(self.prefix_tokens)
         suffix_ones = [1] * len(self.suffix_tokens)
         if token_ids_1 is None:
src/transformers/models/mbart/tokenization_mbart50_fast.py

@@ -160,38 +160,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
src/transformers/models/mbart/tokenization_mbart_fast.py

@@ -131,38 +131,6 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
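One practical consequence of the fix: code that requests the mask at encoding time (for example, masked-language-modeling collators that must not mask special tokens) now sees every special token flagged. A hedged end-to-end sketch, runnable on a transformers release containing this commit (downloads the vocabulary):

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
enc = tok("hello [MASK] world", return_special_tokens_mask=True)
print(enc["special_tokens_mask"])  # e.g. [1, 0, 1, 0, 1] -- [MASK] included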