Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
45fc8c79
Unverified
Commit
45fc8c79
authored
Apr 09, 2021
by
Sylvain Gugger
Committed by
GitHub
Apr 09, 2021
Browse files
Make `get_special_tokens_mask` consider all tokens (#11163)
parent
60607465
Changes
36
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
42 additions
and
245 deletions
+42
-245
docs/source/model_doc/convbert.rst
docs/source/model_doc/convbert.rst
+1
-2
docs/source/model_doc/led.rst
docs/source/model_doc/led.rst
+1
-2
src/transformers/models/albert/tokenization_albert.py
src/transformers/models/albert/tokenization_albert.py
+3
-6
src/transformers/models/albert/tokenization_albert_fast.py
src/transformers/models/albert/tokenization_albert_fast.py
+0
-31
src/transformers/models/barthez/tokenization_barthez.py
src/transformers/models/barthez/tokenization_barthez.py
+3
-6
src/transformers/models/barthez/tokenization_barthez_fast.py
src/transformers/models/barthez/tokenization_barthez_fast.py
+0
-30
src/transformers/models/bert/tokenization_bert.py
src/transformers/models/bert/tokenization_bert.py
+3
-6
src/transformers/models/bertweet/tokenization_bertweet.py
src/transformers/models/bertweet/tokenization_bertweet.py
+3
-6
src/transformers/models/big_bird/tokenization_big_bird.py
src/transformers/models/big_bird/tokenization_big_bird.py
+3
-6
src/transformers/models/camembert/tokenization_camembert.py
src/transformers/models/camembert/tokenization_camembert.py
+3
-6
src/transformers/models/camembert/tokenization_camembert_fast.py
...nsformers/models/camembert/tokenization_camembert_fast.py
+0
-30
src/transformers/models/deberta/tokenization_deberta.py
src/transformers/models/deberta/tokenization_deberta.py
+3
-6
src/transformers/models/deberta_v2/tokenization_deberta_v2.py
...transformers/models/deberta_v2/tokenization_deberta_v2.py
+2
-10
src/transformers/models/fsmt/tokenization_fsmt.py
src/transformers/models/fsmt/tokenization_fsmt.py
+2
-10
src/transformers/models/herbert/tokenization_herbert_fast.py
src/transformers/models/herbert/tokenization_herbert_fast.py
+3
-6
src/transformers/models/m2m_100/tokenization_m2m_100.py
src/transformers/models/m2m_100/tokenization_m2m_100.py
+4
-6
src/transformers/models/mbart/tokenization_mbart.py
src/transformers/models/mbart/tokenization_mbart.py
+4
-6
src/transformers/models/mbart/tokenization_mbart50.py
src/transformers/models/mbart/tokenization_mbart50.py
+4
-6
src/transformers/models/mbart/tokenization_mbart50_fast.py
src/transformers/models/mbart/tokenization_mbart50_fast.py
+0
-32
src/transformers/models/mbart/tokenization_mbart_fast.py
src/transformers/models/mbart/tokenization_mbart_fast.py
+0
-32
No files found.
docs/source/model_doc/convbert.rst
View file @
45fc8c79
...
@@ -56,8 +56,7 @@ ConvBertTokenizerFast
...
@@ -56,8 +56,7 @@ ConvBertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertTokenizerFast
.. autoclass:: transformers.ConvBertTokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
:members:
create_token_type_ids_from_sequences, save_vocabulary
ConvBertModel
ConvBertModel
...
...
docs/source/model_doc/led.rst
View file @
45fc8c79
...
@@ -73,8 +73,7 @@ LEDTokenizerFast
...
@@ -73,8 +73,7 @@ LEDTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDTokenizerFast
.. autoclass:: transformers.LEDTokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
:members:
create_token_type_ids_from_sequences, save_vocabulary
LED specific outputs
LED specific outputs
...
...
src/transformers/models/albert/tokenization_albert.py
View file @
45fc8c79
...
@@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
...
@@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
not
None
:
if
token_ids_1
is
not
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
...
...
src/transformers/models/albert/tokenization_albert_fast.py
View file @
45fc8c79
...
@@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
...
@@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
return
cls
+
token_ids_0
+
sep
return
cls
+
token_ids_0
+
sep
return
cls
+
token_ids_0
+
sep
+
token_ids_1
+
sep
return
cls
+
token_ids_0
+
sep
+
token_ids_1
+
sep
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
not
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
def
create_token_type_ids_from_sequences
(
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
)
->
List
[
int
]:
...
...
src/transformers/models/barthez/tokenization_barthez.py
View file @
45fc8c79
...
@@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
...
@@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formated with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/barthez/tokenization_barthez_fast.py
View file @
45fc8c79
...
@@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
...
@@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
sep
=
[
self
.
sep_token_id
]
sep
=
[
self
.
sep_token_id
]
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
,
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
def
create_token_type_ids_from_sequences
(
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
)
->
List
[
int
]:
...
...
src/transformers/models/bert/tokenization_bert.py
View file @
45fc8c79
...
@@ -290,12 +290,9 @@ class BertTokenizer(PreTrainedTokenizer):
...
@@ -290,12 +290,9 @@ class BertTokenizer(PreTrainedTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
not
None
:
if
token_ids_1
is
not
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
...
...
src/transformers/models/bertweet/tokenization_bertweet.py
View file @
45fc8c79
...
@@ -220,12 +220,9 @@ class BertweetTokenizer(PreTrainedTokenizer):
...
@@ -220,12 +220,9 @@ class BertweetTokenizer(PreTrainedTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/big_bird/tokenization_big_bird.py
View file @
45fc8c79
...
@@ -219,12 +219,9 @@ class BigBirdTokenizer(PreTrainedTokenizer):
...
@@ -219,12 +219,9 @@ class BigBirdTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/camembert/tokenization_camembert.py
View file @
45fc8c79
...
@@ -178,12 +178,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
...
@@ -178,12 +178,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/camembert/tokenization_camembert_fast.py
View file @
45fc8c79
...
@@ -162,36 +162,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
...
@@ -162,36 +162,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
sep
=
[
self
.
sep_token_id
]
sep
=
[
self
.
sep_token_id
]
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
,
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
def
create_token_type_ids_from_sequences
(
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
)
->
List
[
int
]:
...
...
src/transformers/models/deberta/tokenization_deberta.py
View file @
45fc8c79
...
@@ -174,12 +174,9 @@ class DebertaTokenizer(GPT2Tokenizer):
...
@@ -174,12 +174,9 @@ class DebertaTokenizer(GPT2Tokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/deberta_v2/tokenization_deberta_v2.py
View file @
45fc8c79
...
@@ -187,16 +187,8 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
...
@@ -187,16 +187,8 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
,
)
)
)
if
token_ids_1
is
not
None
:
if
token_ids_1
is
not
None
:
...
...
src/transformers/models/fsmt/tokenization_fsmt.py
View file @
45fc8c79
...
@@ -437,16 +437,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
...
@@ -437,16 +437,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
,
)
)
)
# no bos used in fairseq
# no bos used in fairseq
if
token_ids_1
is
not
None
:
if
token_ids_1
is
not
None
:
...
...
src/transformers/models/herbert/tokenization_herbert_fast.py
View file @
45fc8c79
...
@@ -126,12 +126,9 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
...
@@ -126,12 +126,9 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/m2m_100/tokenization_m2m_100.py
View file @
45fc8c79
...
@@ -207,12 +207,10 @@ class M2M100Tokenizer(PreTrainedTokenizer):
...
@@ -207,12 +207,10 @@ class M2M100Tokenizer(PreTrainedTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
...
...
src/transformers/models/mbart/tokenization_mbart.py
View file @
45fc8c79
...
@@ -149,12 +149,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
...
@@ -149,12 +149,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
...
...
src/transformers/models/mbart/tokenization_mbart50.py
View file @
45fc8c79
...
@@ -241,12 +241,10 @@ class MBart50Tokenizer(PreTrainedTokenizer):
...
@@ -241,12 +241,10 @@ class MBart50Tokenizer(PreTrainedTokenizer):
"""
"""
if
already_has_special_tokens
:
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
return
super
().
get_special_tokens_mask
(
raise
ValueError
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
"You should not supply a second sequence if the provided sequence of "
)
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
if
token_ids_1
is
None
:
...
...
src/transformers/models/mbart/tokenization_mbart50_fast.py
View file @
45fc8c79
...
@@ -160,38 +160,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
...
@@ -160,38 +160,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
self
.
_src_lang
=
new_src_lang
self
.
_src_lang
=
new_src_lang
self
.
set_src_lang_special_tokens
(
self
.
_src_lang
)
self
.
set_src_lang_special_tokens
(
self
.
_src_lang
)
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
suffix_ones
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
([
0
]
*
len
(
token_ids_1
))
+
suffix_ones
def
build_inputs_with_special_tokens
(
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
)
->
List
[
int
]:
...
...
src/transformers/models/mbart/tokenization_mbart_fast.py
View file @
45fc8c79
...
@@ -131,38 +131,6 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
...
@@ -131,38 +131,6 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
self
.
_src_lang
=
new_src_lang
self
.
_src_lang
=
new_src_lang
self
.
set_src_lang_special_tokens
(
self
.
_src_lang
)
self
.
set_src_lang_special_tokens
(
self
.
_src_lang
)
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
suffix_ones
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
([
0
]
*
len
(
token_ids_1
))
+
suffix_ones
def
build_inputs_with_special_tokens
(
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
)
->
List
[
int
]:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment