Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
45fc8c79
Unverified
Commit
45fc8c79
authored
Apr 09, 2021
by
Sylvain Gugger
Committed by
GitHub
Apr 09, 2021
Browse files
Make `get_special_tokens_mask` consider all tokens (#11163)
parent
60607465
Changes
36
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
42 additions
and
245 deletions
+42
-245
docs/source/model_doc/convbert.rst
docs/source/model_doc/convbert.rst
+1
-2
docs/source/model_doc/led.rst
docs/source/model_doc/led.rst
+1
-2
src/transformers/models/albert/tokenization_albert.py
src/transformers/models/albert/tokenization_albert.py
+3
-6
src/transformers/models/albert/tokenization_albert_fast.py
src/transformers/models/albert/tokenization_albert_fast.py
+0
-31
src/transformers/models/barthez/tokenization_barthez.py
src/transformers/models/barthez/tokenization_barthez.py
+3
-6
src/transformers/models/barthez/tokenization_barthez_fast.py
src/transformers/models/barthez/tokenization_barthez_fast.py
+0
-30
src/transformers/models/bert/tokenization_bert.py
src/transformers/models/bert/tokenization_bert.py
+3
-6
src/transformers/models/bertweet/tokenization_bertweet.py
src/transformers/models/bertweet/tokenization_bertweet.py
+3
-6
src/transformers/models/big_bird/tokenization_big_bird.py
src/transformers/models/big_bird/tokenization_big_bird.py
+3
-6
src/transformers/models/camembert/tokenization_camembert.py
src/transformers/models/camembert/tokenization_camembert.py
+3
-6
src/transformers/models/camembert/tokenization_camembert_fast.py
...nsformers/models/camembert/tokenization_camembert_fast.py
+0
-30
src/transformers/models/deberta/tokenization_deberta.py
src/transformers/models/deberta/tokenization_deberta.py
+3
-6
src/transformers/models/deberta_v2/tokenization_deberta_v2.py
...transformers/models/deberta_v2/tokenization_deberta_v2.py
+2
-10
src/transformers/models/fsmt/tokenization_fsmt.py
src/transformers/models/fsmt/tokenization_fsmt.py
+2
-10
src/transformers/models/herbert/tokenization_herbert_fast.py
src/transformers/models/herbert/tokenization_herbert_fast.py
+3
-6
src/transformers/models/m2m_100/tokenization_m2m_100.py
src/transformers/models/m2m_100/tokenization_m2m_100.py
+4
-6
src/transformers/models/mbart/tokenization_mbart.py
src/transformers/models/mbart/tokenization_mbart.py
+4
-6
src/transformers/models/mbart/tokenization_mbart50.py
src/transformers/models/mbart/tokenization_mbart50.py
+4
-6
src/transformers/models/mbart/tokenization_mbart50_fast.py
src/transformers/models/mbart/tokenization_mbart50_fast.py
+0
-32
src/transformers/models/mbart/tokenization_mbart_fast.py
src/transformers/models/mbart/tokenization_mbart_fast.py
+0
-32
No files found.
docs/source/model_doc/convbert.rst
View file @
45fc8c79
...
...
@@ -56,8 +56,7 @@ ConvBertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertTokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
:members:
ConvBertModel
...
...
docs/source/model_doc/led.rst
View file @
45fc8c79
...
...
@@ -73,8 +73,7 @@ LEDTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDTokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
:members:
LED specific outputs
...
...
src/transformers/models/albert/tokenization_albert.py
View file @
45fc8c79
...
...
@@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
not
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
...
...
src/transformers/models/albert/tokenization_albert_fast.py
View file @
45fc8c79
...
...
@@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
return
cls
+
token_ids_0
+
sep
return
cls
+
token_ids_0
+
sep
+
token_ids_1
+
sep
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
not
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
...
...
src/transformers/models/barthez/tokenization_barthez.py
View file @
45fc8c79
...
...
@@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/barthez/tokenization_barthez_fast.py
View file @
45fc8c79
...
...
@@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
sep
=
[
self
.
sep_token_id
]
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
,
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
...
...
src/transformers/models/bert/tokenization_bert.py
View file @
45fc8c79
...
...
@@ -290,12 +290,9 @@ class BertTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
not
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
...
...
src/transformers/models/bertweet/tokenization_bertweet.py
View file @
45fc8c79
...
...
@@ -220,12 +220,9 @@ class BertweetTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/big_bird/tokenization_big_bird.py
View file @
45fc8c79
...
...
@@ -219,12 +219,9 @@ class BigBirdTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/camembert/tokenization_camembert.py
View file @
45fc8c79
...
...
@@ -178,12 +178,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/camembert/tokenization_camembert_fast.py
View file @
45fc8c79
...
...
@@ -162,36 +162,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
sep
=
[
self
.
sep_token_id
]
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
,
1
]
+
([
0
]
*
len
(
token_ids_1
))
+
[
1
]
def
create_token_type_ids_from_sequences
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
...
...
src/transformers/models/deberta/tokenization_deberta.py
View file @
45fc8c79
...
...
@@ -174,12 +174,9 @@ class DebertaTokenizer(GPT2Tokenizer):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/deberta_v2/tokenization_deberta_v2.py
View file @
45fc8c79
...
...
@@ -187,16 +187,8 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
,
)
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
if
token_ids_1
is
not
None
:
...
...
src/transformers/models/fsmt/tokenization_fsmt.py
View file @
45fc8c79
...
...
@@ -437,16 +437,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
,
)
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
# no bos used in fairseq
if
token_ids_1
is
not
None
:
...
...
src/transformers/models/herbert/tokenization_herbert_fast.py
View file @
45fc8c79
...
...
@@ -126,12 +126,9 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
if
token_ids_1
is
None
:
return
[
1
]
+
([
0
]
*
len
(
token_ids_0
))
+
[
1
]
...
...
src/transformers/models/m2m_100/tokenization_m2m_100.py
View file @
45fc8c79
...
...
@@ -207,12 +207,10 @@ class M2M100Tokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
...
...
src/transformers/models/mbart/tokenization_mbart.py
View file @
45fc8c79
...
...
@@ -149,12 +149,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
...
...
src/transformers/models/mbart/tokenization_mbart50.py
View file @
45fc8c79
...
...
@@ -241,12 +241,10 @@ class MBart50Tokenizer(PreTrainedTokenizer):
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
return
super
().
get_special_tokens_mask
(
token_ids_0
=
token_ids_0
,
token_ids_1
=
token_ids_1
,
already_has_special_tokens
=
True
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
...
...
src/transformers/models/mbart/tokenization_mbart50_fast.py
View file @
45fc8c79
...
...
@@ -160,38 +160,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
self
.
_src_lang
=
new_src_lang
self
.
set_src_lang_special_tokens
(
self
.
_src_lang
)
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
suffix_ones
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
([
0
]
*
len
(
token_ids_1
))
+
suffix_ones
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
...
...
src/transformers/models/mbart/tokenization_mbart_fast.py
View file @
45fc8c79
...
...
@@ -131,38 +131,6 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
self
.
_src_lang
=
new_src_lang
self
.
set_src_lang_special_tokens
(
self
.
_src_lang
)
def
get_special_tokens_mask
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
,
already_has_special_tokens
:
bool
=
False
)
->
List
[
int
]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if
already_has_special_tokens
:
if
token_ids_1
is
not
None
:
raise
ValueError
(
"You should not supply a second sequence if the provided sequence of "
"ids is already formatted with special tokens for the model."
)
return
list
(
map
(
lambda
x
:
1
if
x
in
[
self
.
sep_token_id
,
self
.
cls_token_id
]
else
0
,
token_ids_0
))
prefix_ones
=
[
1
]
*
len
(
self
.
prefix_tokens
)
suffix_ones
=
[
1
]
*
len
(
self
.
suffix_tokens
)
if
token_ids_1
is
None
:
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
suffix_ones
return
prefix_ones
+
([
0
]
*
len
(
token_ids_0
))
+
([
0
]
*
len
(
token_ids_1
))
+
suffix_ones
def
build_inputs_with_special_tokens
(
self
,
token_ids_0
:
List
[
int
],
token_ids_1
:
Optional
[
List
[
int
]]
=
None
)
->
List
[
int
]:
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment