Unverified Commit 45fc8c79 authored by Sylvain Gugger, committed by GitHub

Make `get_special_tokens_mask` consider all tokens (#11163)

parent 60607465
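
Every hunk below makes the same change: with `already_has_special_tokens=True`, the per-tokenizer overrides of `get_special_tokens_mask` used to reject a second sequence and then flag only `sep_token_id` and `cls_token_id`; they now delegate to the base class, whose mask covers every special token the tokenizer defines (padding, mask, language codes, and so on). The fast-tokenizer overrides are deleted outright, since the implementation inherited from the base class already behaves this way. A minimal sketch of the delegated behavior, assuming the base class derives the mask from the `all_special_ids` property (a sketch, not a verbatim copy of the library code):

    # Sketch of the shared base-class fallback the overrides now delegate to.
    # Assumes self.all_special_ids lists every special token id, not just cls/sep.
    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            all_special_ids = self.all_special_ids  # avoid recomputing the property per token
            return [1 if token in all_special_ids else 0 for token in token_ids_0]
        # Without special tokens added, no position in the raw ids is special.
        return [0] * (len(token_ids_0) + (len(token_ids_1) if token_ids_1 else 0))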
@@ -56,8 +56,7 @@ ConvBertTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.ConvBertTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:

 ConvBertModel
...
@@ -73,8 +73,7 @@ LEDTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LEDTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:

 LED specific outputs
...
@@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is not None:
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
...
@@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Set to True if the token list is already formatted with special tokens for the model
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
...
@@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
...
@@ -290,12 +290,9 @@ class BertTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is not None:
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
...
@@ -220,12 +220,9 @@ class BertweetTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -219,12 +219,9 @@ class BigBirdTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -178,12 +178,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -162,36 +162,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
...
@@ -174,12 +174,9 @@ class DebertaTokenizer(GPT2Tokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -187,16 +187,8 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(
-                map(
-                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
-                    token_ids_0,
-                )
-            )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is not None:
...
@@ -437,16 +437,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(
-                map(
-                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
-                    token_ids_0,
-                )
-            )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
         # no bos used in fairseq
         if token_ids_1 is not None:
...
@@ -126,12 +126,9 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -207,12 +207,10 @@ class M2M100Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         prefix_ones = [1] * len(self.prefix_tokens)
         suffix_ones = [1] * len(self.suffix_tokens)
         if token_ids_1 is None:
...
@@ -149,12 +149,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         prefix_ones = [1] * len(self.prefix_tokens)
         suffix_ones = [1] * len(self.suffix_tokens)
         if token_ids_1 is None:
...
@@ -241,12 +241,10 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

         prefix_ones = [1] * len(self.prefix_tokens)
         suffix_ones = [1] * len(self.suffix_tokens)
         if token_ids_1 is None:
...
@@ -160,38 +160,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
...
@@ -131,38 +131,6 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
...
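
From the user side, the change only shows up with `already_has_special_tokens=True`. A small illustrative example (the checkpoint name and the exact mask values are assumptions for illustration, not taken from this commit):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint
    ids = tokenizer.encode("hello [MASK]")  # -> [CLS] hello [MASK] [SEP]
    mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
    # Before this commit, only [CLS] and [SEP] were flagged: [1, 0, 0, 1]
    # Afterwards [MASK] is flagged as well:                  [1, 0, 1, 1]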