"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "bcedd0a47197d7587cda97376ac8f6da301191cc"
Unverified commit e4dad4fe, authored by Arthur, committed by GitHub

Remove-warns (#26483)

* fix stripping

* remove some warnings and update others

* revert changes that belong in another PR
parent 1b8decb0
src/transformers/models/llama/tokenization_llama.py:

```diff
@@ -125,7 +125,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
         if legacy is None:
             logger.warning_once(
-                f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
+                f"You are using the default legacy behaviour of the {self.__class__}. This is"
                 " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
                 " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
                 " means, and thouroughly read the reason why this was added as explained in"
```
```diff
@@ -138,7 +138,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
         self.add_bos_token = add_bos_token
         self.add_eos_token = add_eos_token
         self.use_default_system_prompt = use_default_system_prompt
-        self.sp_model = self.get_spm_processor()
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))

         super().__init__(
             bos_token=bos_token,
```
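`kwargs.pop("from_slow", False)` both reads and removes the flag, so the later `super().__init__(...)` call never sees it. A toy illustration of that contract (the helper name is mine, not from the diff):

```python
def consume_from_slow(**kwargs):
    # Pop with a default: returns False when the key is absent,
    # and removes it so downstream code never receives it.
    from_slow = kwargs.pop("from_slow", False)
    return from_slow, kwargs

assert consume_from_slow(from_slow=True) == (True, {})
assert consume_from_slow(unrelated=1) == (False, {"unrelated": 1})
```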
```diff
@@ -160,9 +160,9 @@ class LlamaTokenizer(PreTrainedTokenizer):
         return len(self.sp_model.encode(str(self.unk_token)))

     # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
-    def get_spm_processor(self):
+    def get_spm_processor(self, from_slow=False):
         tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        if self.legacy:  # no dependency on protobuf
+        if self.legacy or from_slow:  # no dependency on protobuf
             tokenizer.Load(self.vocab_file)
             return tokenizer
```
...
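When neither `legacy` nor `from_slow` is set, the branch collapsed above loads a patched, serialized copy of the model rather than the raw file. Purely as an illustration of the two SentencePiece entry points involved, not the elided code itself (the path below is a placeholder):

```python
import sentencepiece as spm

VOCAB_FILE = "tokenizer.model"  # placeholder path

sp = spm.SentencePieceProcessor()

# Legacy / from_slow path: load the model file directly, no protobuf needed.
sp.Load(VOCAB_FILE)

# Non-legacy path (sketch): load from bytes, which lets the caller rewrite the
# serialized proto (e.g. normalizer options) before the processor sees it.
with open(VOCAB_FILE, "rb") as f:
    sp.LoadFromSerializedProto(f.read())
```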
src/transformers/models/t5/tokenization_t5.py:

```diff
@@ -186,7 +186,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         if legacy is None:
             logger.warning_once(
-                f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
+                f"You are using the default legacy behaviour of the {self.__class__}. This is"
                 " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
                 " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
                 " means, and thouroughly read the reason why this was added as explained in"
```
```diff
@@ -195,7 +195,7 @@ class T5Tokenizer(PreTrainedTokenizer):
             legacy = True

         self.legacy = legacy
-        self.sp_model = self.get_spm_processor()
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
         self.vocab_file = vocab_file
         self._extra_ids = extra_ids
```
```diff
@@ -210,9 +210,10 @@ class T5Tokenizer(PreTrainedTokenizer):
             **kwargs,
         )

-    def get_spm_processor(self):
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
+    def get_spm_processor(self, from_slow=False):
         tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        if self.legacy:  # no dependency on protobuf
+        if self.legacy or from_slow:  # no dependency on protobuf
             tokenizer.Load(self.vocab_file)
             return tokenizer
```
...
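For background on what the `legacy` flag toggles (independent of this diff): the non-legacy mode changes how text immediately after a special token is split. A hedged sketch; the exact pieces depend on the checkpoint, so none are asserted here:

```python
from transformers import T5Tokenizer

text = "Hello <extra_id_0>. How are you?"

legacy_tok = T5Tokenizer.from_pretrained("t5-small", legacy=True)
new_tok = T5Tokenizer.from_pretrained("t5-small", legacy=False)

# Legacy mode effectively inserts an extra prefix space after special tokens,
# so the piece right after <extra_id_0> can differ between the two modes.
print(legacy_tok.tokenize(text))
print(new_tok.tokenize(text))
```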
src/transformers/tokenization_utils.py:

```diff
@@ -979,11 +979,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     ) -> str:
         self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

-        if spaces_between_special_tokens:
-            logger.warning_once(
-                "spaces_between_special_tokens is deprecated and will be removed in transformers v5. It was adding spaces between `added_tokens`, not special tokens, "
-                "and does not exist in our fast implementation. Future tokenizers will handle the decoding process on a per-model rule."
-            )
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

         legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
             token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
```
...
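With the deprecation notice dropped, `spaces_between_special_tokens` remains an ordinary `decode` argument on slow tokenizers. A usage sketch, assuming an example checkpoint:

```python
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")  # example checkpoint
ids = tok.encode("Hello world")

# Controls whether decoded added tokens are joined with spaces (default True).
print(tok.decode(ids, spaces_between_special_tokens=True))
print(tok.decode(ids, spaces_between_special_tokens=False))
```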
src/transformers/tokenization_utils_base.py:

```diff
@@ -2204,11 +2204,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                     f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary."
                 )
         else:
-            logger.warning_once(
-                "Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`, "
-                " it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again."
-                " You will see the new `added_tokens_decoder` attribute that will store the relevant information."
-            )
             # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
             if special_tokens_map_file is not None:
                 with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
```
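The `added_tokens_decoder` mapping the removed notice pointed to is exposed on every tokenizer, and it is what supersedes `added_tokens.json`. A small inspection sketch (example checkpoint):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")  # example checkpoint

# Maps token id -> AddedToken (content plus lstrip/rstrip/normalized/special flags).
for index, token in sorted(tok.added_tokens_decoder.items()):
    print(index, repr(token))
```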
```diff
@@ -2277,16 +2272,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # uses the information stored in `added_tokens_decoder`. Checks after addition that we have the same ids
         if init_kwargs.get("slow_to_fast", False):
             tokenizer.add_tokens([token for _, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])])
-            warnings = ""
-            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]):
-                if tokenizer.convert_tokens_to_ids(str(token)) != index:
-                    warnings += f"\texpected id: {tokenizer.convert_tokens_to_ids(str(token))}, found: {index}, token: `{token}`,\n"
-            if len(warnings) > 1:
-                logger.warn(
-                    f"You are converting a {slow_tokenizer.__class__.__name__} to a {cls.__name__}, but"
-                    f" wrong indexes were founds when adding the `added_tokens` from the `slow` tokenizer to the `fast`. "
-                    f" The following tokens had unexpected id :\n{warnings}. You should try using `from_slow`."
-                )
         # finally we add all the special_tokens to make sure eveything is initialized
         tokenizer.add_tokens(tokenizer.all_special_tokens_extended, special_tokens=True)
```
...
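The removed block above only warned; it did not repair anything. Keeping the same check as a user-side helper is straightforward. A hedged sketch with the same logic, where the function name and call convention are mine:

```python
def find_index_mismatches(tokenizer, added_tokens_decoder):
    """Return (expected_id, saved_index, token) triples where the round-trip disagrees."""
    mismatches = []
    for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]):
        resolved = tokenizer.convert_tokens_to_ids(str(token))
        if resolved != index:
            mismatches.append((resolved, index, str(token)))
    return mismatches

# A non-empty result suggests the fast tokenizer re-assigned ids and that
# reloading with `from_slow=True` may be worth trying.
```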