"vscode:/vscode.git/clone" did not exist on "edc1e734bfc01109b8c66881d950ebbda032a6d2"
Unverified Commit 45fc8c79 authored by Sylvain Gugger's avatar Sylvain Gugger Committed by GitHub
Browse files

Make `get_special_tokens_mask` consider all tokens (#11163)

parent 60607465
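The overrides shown below all followed the same pattern: when `already_has_special_tokens=True`, they rebuilt the mask by checking each id against only `sep_token_id` and `cls_token_id` (or `bos`/`eos` for Speech2Text), so other special tokens such as pad or mask were reported as regular sequence tokens. Delegating to the base class fixes that. A before/after sketch of the behavior change — the checkpoint name and the exact token ids are illustrative, not part of this commit:

```python
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# encode() adds <s> ... </s>; <mask> is a special token, but it is
# neither the cls token nor the sep token.
ids = tokenizer.encode("Hello <mask>")  # e.g. [0, 31414, 50264, 2]

mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
# Before this commit: [1, 0, 0, 1] -- <mask> missed by the cls/sep-only check.
# After this commit:  [1, 0, 1, 1] -- every id in tokenizer.all_special_ids is flagged.
```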
@@ -266,12 +266,9 @@ class MPNetTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
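For context, the `super()` call resolves to the base-class implementation, which computes the mask against every special token the tokenizer knows about rather than just cls/sep. A minimal standalone sketch of that logic (the helper name is hypothetical; the real method lives on the tokenizer base class and reads `self.all_special_ids`):

```python
from typing import List

def special_tokens_mask(token_ids_0: List[int], all_special_ids: List[int]) -> List[int]:
    """Flag every id that is any registered special token, not just cls/sep."""
    return [1 if token in all_special_ids else 0 for token in token_ids_0]

# e.g. with cls=0, sep=2 and mask=50264 all registered as special ids:
assert special_tokens_mask([0, 31414, 50264, 2], [0, 1, 2, 3, 50264]) == [1, 0, 1, 1]
```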
@@ -201,12 +201,9 @@ class PhobertTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -203,12 +203,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is None:
             return ([0] * len(token_ids_0)) + [1]
...
@@ -215,12 +215,9 @@ class RobertaTokenizer(GPT2Tokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -199,12 +199,10 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.bos_token_id, self.eos_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         prefix_ones = [1] * len(self.prefix_tokens)
         suffix_ones = [1]
         if token_ids_1 is None:
...
@@ -157,12 +157,10 @@ class T5Tokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         # normal case: some special tokens
         if token_ids_1 is None:
             return ([0] * len(token_ids_0)) + [1]
...
@@ -510,12 +510,9 @@ class TapasTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is not None:
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
...
@@ -906,16 +906,8 @@ class XLMTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(
-                map(
-                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
-                    token_ids_0,
-                )
-            )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is not None:
...
@@ -200,12 +200,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is None:
             return ([0] * len(token_ids_0)) + [1]
...
@@ -206,12 +206,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -172,37 +172,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
...
@@ -270,12 +270,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is not None:
             return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
...
@@ -190,37 +190,6 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
             return token_ids_0 + sep + cls
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
-        return ([0] * len(token_ids_0)) + [1, 1]
-
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
...
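The two fast tokenizers above simply drop their overrides, so lookups now resolve to the shared implementation inherited from the base class. A quick sanity check, assuming a transformers checkout that includes this commit:

```python
from transformers import XLMRobertaTokenizerFast, XLNetTokenizerFast

# Neither class defines its own get_special_tokens_mask any more; the method
# is inherited rather than found in the subclass __dict__.
for cls in (XLMRobertaTokenizerFast, XLNetTokenizerFast):
    assert "get_special_tokens_mask" not in cls.__dict__
```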
@@ -670,6 +670,16 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         Returns:
             A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
         return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
 
     @overload
...
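Note that `PreTrainedTokenizer` keeps the two-sequence guard: asking for a mask over an already-formatted pair is still an error. A sketch of that behavior — the checkpoint choice is illustrative; any slow tokenizer updated by this commit behaves the same:

```python
from transformers import BertTokenizer  # a PreTrainedTokenizer subclass

tok = BertTokenizer.from_pretrained("bert-base-uncased")
try:
    # already_has_special_tokens=True plus a second sequence is rejected.
    tok.get_special_tokens_mask([101, 102], [101, 102], already_has_special_tokens=True)
except ValueError as err:
    print(err)  # "You should not supply a second sequence if the provided sequence of ids ..."
```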
@@ -225,12 +225,9 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
         if token_ids_1 is None:
             return [1] + ([0] * len(token_ids_0)) + [1]
...
@@ -46,8 +46,7 @@ Tips:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:
 
 {% if "PyTorch" in cookiecutter.generate_tensorflow_and_pytorch -%}
...