"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "bfd5da8e28dd05397df3149c707b2197092784d7"
Unverified Commit 3b39b906 authored by Arthur, committed by GitHub

[`TokenizerFast`] `can_save_slow_tokenizer` as a property for when `vocab_file`'s folder was removed (#25626)

* pad token should be None by default

* fix tests

* nits

* check if isfile vocabfile

* add warning if sp model folder was deleted

* save SPM when missing folder for slow

* update the `can_save_slow_tokenizer` to be a property

* first batch

* second batch

* missing one
parent 99fc3ac8
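The core of the change: `can_save_slow_tokenizer` used to be a plain attribute computed once in `__init__`, so it stayed `True` even if the folder holding the sentencepiece vocab file was deleted after the tokenizer was loaded. As a property it re-checks the filesystem on every access. A minimal runnable sketch of the difference (the `AttrTok`/`PropTok` classes are illustrative, not the actual transformers code):

import os
import tempfile

class AttrTok:
    """Old behavior: the flag is computed once at init and can go stale."""
    def __init__(self, vocab_file):
        self.vocab_file = vocab_file
        self.can_save_slow_tokenizer = False if not vocab_file else True

class PropTok:
    """New behavior: the flag is re-checked on every access."""
    def __init__(self, vocab_file):
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "sentencepiece.model")
    open(path, "wb").close()
    a, p = AttrTok(path), PropTok(path)
    assert a.can_save_slow_tokenizer and p.can_save_slow_tokenizer

# The temporary folder (and the model file inside it) is now gone:
assert a.can_save_slow_tokenizer        # stale attribute still says True
assert not p.can_save_slow_tokenizer    # property correctly reports False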
@@ -134,7 +134,10 @@ class XGLMTokenizerFast(PreTrainedTokenizerFast):
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
......
@@ -189,6 +189,10 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
         for k in self.fairseq_tokens_to_ids.keys():
             self.unique_no_split_tokens.append(k)

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     def __getstate__(self):
         state = self.__dict__.copy()
         state["sp_model"] = None
......
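The `__getstate__` visible at the end of this hunk follows the usual pattern for sentencepiece-backed tokenizers: the `SentencePieceProcessor` object is not picklable, so it is dropped from the pickled state and rebuilt from `vocab_file` on load, which is exactly why a stale or deleted `vocab_file` path matters. A sketch of that pattern (the `SpTokenizer` class is illustrative, not the transformers implementation):

import sentencepiece as spm

class SpTokenizer:
    def __init__(self, vocab_file: str):
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    def __getstate__(self):
        # The underlying C++ SentencePieceProcessor cannot be pickled,
        # so drop it from the state dict.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        # Rebuild the processor from the vocab file path on unpickling;
        # this fails if the file's folder was deleted in the meantime.
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)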
@@ -166,7 +166,10 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
......
@@ -169,7 +169,10 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
......
@@ -90,7 +90,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     vocab_files_names = VOCAB_FILES_NAMES
     slow_tokenizer_class: PreTrainedTokenizer = None
-    can_save_slow_tokenizer: bool = True

     def __init__(self, *args, **kwargs):
         tokenizer_object = kwargs.pop("tokenizer_object", None)
@@ -159,6 +158,14 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     def is_fast(self) -> bool:
         return True

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        """
+        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
+        can only be `True` if the original `"sentencepiece.model"` was not deleted.
+        """
+        return True
+
     @property
     def vocab_size(self) -> int:
         """
......
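In practice this property is what saving consults when deciding whether the slow (sentencepiece) files can be written next to the fast tokenizer files; per the commit message, a warning is now emitted instead of a hard failure when the vocab file's folder is gone. A hedged usage sketch (the checkpoint name is just one example of a sentencepiece-based fast tokenizer, and the warning behavior is taken from the commit message, not shown in this diff):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")  # fast tokenizer
print(tok.can_save_slow_tokenizer)  # True while the sentencepiece file exists on disk

# If the cached sentencepiece file is deleted afterwards, the property now
# flips to False, and saving the legacy (slow) format is skipped with a
# warning rather than crashing on the missing file.
tok.save_pretrained("./saved-tok")  # fast tokenizer files are still written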