Make `get_special_tokens_mask` consider all tokens (#11163)

45fc8c79 · Sylvain Gugger · GitHub · 60607465
Unverified commit 45fc8c79, authored Apr 09, 2021 by Sylvain Gugger; committed by GitHub Apr 09, 2021.
20 changed files
--- a/docs/source/model_doc/convbert.rst
+++ b/docs/source/model_doc/convbert.rst
@@ -56,8 +56,7 @@ ConvBertTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.ConvBertTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:
 ConvBertModel

--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -73,8 +73,7 @@ LEDTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autoclass:: transformers.LEDTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:
 LED specific outputs

--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Set to True if the token list is already formatted with special tokens for the model
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:

--- a/src/transformers/models/barthez/tokenization_barthez.py
+++ b/src/transformers/models/barthez/tokenization_barthez.py
@@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]

--- a/src/transformers/models/barthez/tokenization_barthez_fast.py
+++ b/src/transformers/models/barthez/tokenization_barthez_fast.py
@@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:

--- a/src/transformers/models/bert/tokenization_bert.py
+++ b/src/transformers/models/bert/tokenization_bert.py
@@ -290,12 +290,9 @@ class BertTokenizer(PreTrainedTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -220,12 +220,9 @@ class BertweetTokenizer(PreTrainedTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]

--- a/src/transformers/models/big_bird/tokenization_big_bird.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird.py
@@ -219,12 +219,9 @@ class BigBirdTokenizer(PreTrainedTokenizer):
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]

--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -178,12 +178,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]

--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -162,36 +162,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:

--- a/src/transformers/models/deberta/tokenization_deberta.py
+++ b/src/transformers/models/deberta/tokenization_deberta.py
@@ -174,12 +174,9 @@ class DebertaTokenizer(GPT2Tokenizer):
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]

--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -187,16 +187,8 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(
-                map(
-                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
-                    token_ids_0,
-                )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        if token_ids_1 is not None:

--- a/src/transformers/models/fsmt/tokenization_fsmt.py
+++ b/src/transformers/models/fsmt/tokenization_fsmt.py
@@ -437,16 +437,8 @@ class FSMTTokenizer(PreTrainedTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(
-                map(
-                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
-                    token_ids_0,
-                )
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:

--- a/src/transformers/models/herbert/tokenization_herbert_fast.py
+++ b/src/transformers/models/herbert/tokenization_herbert_fast.py
@@ -126,12 +126,9 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]

--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -207,12 +207,10 @@ class M2M100Tokenizer(PreTrainedTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:

--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -149,12 +149,10 @@ class MBartTokenizer(XLMRobertaTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:

--- a/src/transformers/models/mbart/tokenization_mbart50.py
+++ b/src/transformers/models/mbart/tokenization_mbart50.py
@@ -241,12 +241,10 @@ class MBart50Tokenizer(PreTrainedTokenizer):
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:

--- a/src/transformers/models/mbart/tokenization_mbart50_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart50_fast.py
@@ -160,38 +160,6 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:

--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -131,38 +131,6 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]: