"awq/git@developer.sourcefind.cn:OpenDAS/autoawq.git" did not exist on "48be2ee2531e5cdd8d9b2fe5c825cdaf95601908"
Unverified commit 3eed5530, authored by Guillaume Klein and committed by GitHub
Browse files

Fix properties of unset special tokens in non verbose mode (#17797)


Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
parent b2fdbacc
......@@ -229,8 +229,9 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
......
......@@ -139,8 +139,9 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
comprise the space before the *[MASK]*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
......
......@@ -163,8 +163,9 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
......
......@@ -235,8 +235,9 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
......
......@@ -968,8 +968,9 @@ class SpecialTokensMixin:
"""
`str`: Beginning of sentence token. Log an error if used while not having been set.
"""
if self._bos_token is None and self.verbose:
logger.error("Using bos_token, but it is not set yet.")
if self._bos_token is None:
if self.verbose:
logger.error("Using bos_token, but it is not set yet.")
return None
return str(self._bos_token)
......@@ -978,8 +979,9 @@ class SpecialTokensMixin:
"""
`str`: End of sentence token. Log an error if used while not having been set.
"""
if self._eos_token is None and self.verbose:
logger.error("Using eos_token, but it is not set yet.")
if self._eos_token is None:
if self.verbose:
logger.error("Using eos_token, but it is not set yet.")
return None
return str(self._eos_token)
......@@ -988,8 +990,9 @@ class SpecialTokensMixin:
"""
`str`: Unknown token. Log an error if used while not having been set.
"""
if self._unk_token is None and self.verbose:
logger.error("Using unk_token, but it is not set yet.")
if self._unk_token is None:
if self.verbose:
logger.error("Using unk_token, but it is not set yet.")
return None
return str(self._unk_token)
......@@ -999,8 +1002,9 @@ class SpecialTokensMixin:
`str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
having been set.
"""
if self._sep_token is None and self.verbose:
logger.error("Using sep_token, but it is not set yet.")
if self._sep_token is None:
if self.verbose:
logger.error("Using sep_token, but it is not set yet.")
return None
return str(self._sep_token)
......@@ -1009,8 +1013,9 @@ class SpecialTokensMixin:
"""
`str`: Padding token. Log an error if used while not having been set.
"""
if self._pad_token is None and self.verbose:
logger.error("Using pad_token, but it is not set yet.")
if self._pad_token is None:
if self.verbose:
logger.error("Using pad_token, but it is not set yet.")
return None
return str(self._pad_token)
......@@ -1020,8 +1025,9 @@ class SpecialTokensMixin:
`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
depth of the model. Log an error if used while not having been set.
"""
if self._cls_token is None and self.verbose:
logger.error("Using cls_token, but it is not set yet.")
if self._cls_token is None:
if self.verbose:
logger.error("Using cls_token, but it is not set yet.")
return None
return str(self._cls_token)
......@@ -1031,8 +1037,9 @@ class SpecialTokensMixin:
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set.
"""
if self._mask_token is None and self.verbose:
logger.error("Using mask_token, but it is not set yet.")
if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None
return str(self._mask_token)
......@@ -1042,8 +1049,9 @@ class SpecialTokensMixin:
`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
set.
"""
if self._additional_special_tokens is None and self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.")
if self._additional_special_tokens is None:
if self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.")
return None
return [str(tok) for tok in self._additional_special_tokens]
......
......@@ -31,6 +31,7 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token
from parameterized import parameterized
from requests.exceptions import HTTPError
from transformers import (
AlbertTokenizer,
......@@ -578,6 +579,25 @@ class TokenizerTesterMixin:
self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters])
self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters])
@parameterized.expand([(True,), (False,)])
def test_tokenizers_special_tokens_properties_unset(self, verbose):
    """Check that unset special-token properties read back as ``None``.

    The test is expanded to run once with ``verbose=True`` and once with
    ``verbose=False``. For every tokenizer returned by
    ``self.get_tokenizers(verbose=verbose)``, each special-token attribute
    is assigned ``None`` and then read back, asserting the getter returns
    ``None``.
    """
    # The set of properties under test is the same for every tokenizer.
    special_token_properties = (
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    )

    for tok in self.get_tokenizers(verbose=verbose):
        # One sub-test per tokenizer class so failures are reported separately.
        with self.subTest(f"{tok.__class__.__name__}"):
            for prop_name in special_token_properties:
                setattr(tok, prop_name, None)
                self.assertIsNone(getattr(tok, prop_name))
def test_save_and_load_tokenizer(self):
# safety check on max_len default value so we are sure the test works
tokenizers = self.get_tokenizers()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment