"benchmark/git@developer.sourcefind.cn:change/sglang.git" did not exist on "9c121f2a45dca269c47812379f851f9ca9478852"
Unverified Commit 3eed5530 authored by Guillaume Klein, committed by GitHub
Browse files

Fix properties of unset special tokens in non verbose mode (#17797)


Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
parent b2fdbacc
...@@ -229,7 +229,8 @@ class BartTokenizerFast(PreTrainedTokenizerFast): ...@@ -229,7 +229,8 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*. comprise the space before the *<mask>*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.") logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)
......
...@@ -139,7 +139,8 @@ class DebertaTokenizerFast(GPT2TokenizerFast): ...@@ -139,7 +139,8 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
comprise the space before the *[MASK]*. comprise the space before the *[MASK]*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.") logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)
......
...@@ -163,7 +163,8 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast): ...@@ -163,7 +163,8 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*. comprise the space before the *<mask>*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.") logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)
......
...@@ -235,7 +235,8 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast): ...@@ -235,7 +235,8 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*. comprise the space before the *<mask>*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.") logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)
......
...@@ -968,7 +968,8 @@ class SpecialTokensMixin: ...@@ -968,7 +968,8 @@ class SpecialTokensMixin:
""" """
`str`: Beginning of sentence token. Log an error if used while not having been set. `str`: Beginning of sentence token. Log an error if used while not having been set.
""" """
if self._bos_token is None and self.verbose: if self._bos_token is None:
if self.verbose:
logger.error("Using bos_token, but it is not set yet.") logger.error("Using bos_token, but it is not set yet.")
return None return None
return str(self._bos_token) return str(self._bos_token)
...@@ -978,7 +979,8 @@ class SpecialTokensMixin: ...@@ -978,7 +979,8 @@ class SpecialTokensMixin:
""" """
`str`: End of sentence token. Log an error if used while not having been set. `str`: End of sentence token. Log an error if used while not having been set.
""" """
if self._eos_token is None and self.verbose: if self._eos_token is None:
if self.verbose:
logger.error("Using eos_token, but it is not set yet.") logger.error("Using eos_token, but it is not set yet.")
return None return None
return str(self._eos_token) return str(self._eos_token)
...@@ -988,7 +990,8 @@ class SpecialTokensMixin: ...@@ -988,7 +990,8 @@ class SpecialTokensMixin:
""" """
`str`: Unknown token. Log an error if used while not having been set. `str`: Unknown token. Log an error if used while not having been set.
""" """
if self._unk_token is None and self.verbose: if self._unk_token is None:
if self.verbose:
logger.error("Using unk_token, but it is not set yet.") logger.error("Using unk_token, but it is not set yet.")
return None return None
return str(self._unk_token) return str(self._unk_token)
...@@ -999,7 +1002,8 @@ class SpecialTokensMixin: ...@@ -999,7 +1002,8 @@ class SpecialTokensMixin:
`str`: Separation token, to separate context and query in an input sequence. Log an error if used while not `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
having been set. having been set.
""" """
if self._sep_token is None and self.verbose: if self._sep_token is None:
if self.verbose:
logger.error("Using sep_token, but it is not set yet.") logger.error("Using sep_token, but it is not set yet.")
return None return None
return str(self._sep_token) return str(self._sep_token)
...@@ -1009,7 +1013,8 @@ class SpecialTokensMixin: ...@@ -1009,7 +1013,8 @@ class SpecialTokensMixin:
""" """
`str`: Padding token. Log an error if used while not having been set. `str`: Padding token. Log an error if used while not having been set.
""" """
if self._pad_token is None and self.verbose: if self._pad_token is None:
if self.verbose:
logger.error("Using pad_token, but it is not set yet.") logger.error("Using pad_token, but it is not set yet.")
return None return None
return str(self._pad_token) return str(self._pad_token)
...@@ -1020,7 +1025,8 @@ class SpecialTokensMixin: ...@@ -1020,7 +1025,8 @@ class SpecialTokensMixin:
`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
depth of the model. Log an error if used while not having been set. depth of the model. Log an error if used while not having been set.
""" """
if self._cls_token is None and self.verbose: if self._cls_token is None:
if self.verbose:
logger.error("Using cls_token, but it is not set yet.") logger.error("Using cls_token, but it is not set yet.")
return None return None
return str(self._cls_token) return str(self._cls_token)
...@@ -1031,7 +1037,8 @@ class SpecialTokensMixin: ...@@ -1031,7 +1037,8 @@ class SpecialTokensMixin:
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set. having been set.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
if self.verbose:
logger.error("Using mask_token, but it is not set yet.") logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)
...@@ -1042,7 +1049,8 @@ class SpecialTokensMixin: ...@@ -1042,7 +1049,8 @@ class SpecialTokensMixin:
`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
set. set.
""" """
if self._additional_special_tokens is None and self.verbose: if self._additional_special_tokens is None:
if self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.") logger.error("Using additional_special_tokens, but it is not set yet.")
return None return None
return [str(tok) for tok in self._additional_special_tokens] return [str(tok) for tok in self._additional_special_tokens]
......
...@@ -31,6 +31,7 @@ from pathlib import Path ...@@ -31,6 +31,7 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token
from parameterized import parameterized
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import ( from transformers import (
AlbertTokenizer, AlbertTokenizer,
...@@ -578,6 +579,25 @@ class TokenizerTesterMixin: ...@@ -578,6 +579,25 @@ class TokenizerTesterMixin:
self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters]) self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters])
self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters]) self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters])
@parameterized.expand([(True,), (False,)])
def test_tokenizers_special_tokens_properties_unset(self, verbose):
    """Check that special-token properties read back as ``None`` once unset.

    Runs once with ``verbose=True`` and once with ``verbose=False``. For every
    tokenizer returned by ``self.get_tokenizers(verbose=verbose)``, each
    special-token attribute is assigned ``None`` and the getter is then
    asserted to return ``None``.
    """
    # Names of the properties exercised; identical for every tokenizer.
    special_token_names = (
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    )

    for tok in self.get_tokenizers(verbose=verbose):
        # One sub-test per tokenizer class so a failure names the culprit.
        with self.subTest(tok.__class__.__name__):
            for name in special_token_names:
                setattr(tok, name, None)
                self.assertIsNone(getattr(tok, name))
def test_save_and_load_tokenizer(self): def test_save_and_load_tokenizer(self):
# safety check on max_len default value so we are sure the test works # safety check on max_len default value so we are sure the test works
tokenizers = self.get_tokenizers() tokenizers = self.get_tokenizers()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment