Unverified commit 8fae93ca authored by Stas Bekman, committed by GitHub

[t5 tokenizer] add info logs (#9897)

* save fast tokenizer + add info logs

* fix tests

* remove the saving of fast tokenizer
parent 80349831
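
The effect of the patch is easiest to see end to end. A minimal sketch, assuming this patch is applied and info-level logging is enabled ("t5-small" and "./t5-ckpt" are illustrative placeholders, not part of the commit):

    from transformers import T5Tokenizer
    from transformers.utils import logging

    logging.set_verbosity_info()  # surface logger.info messages

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    tokenizer.save_pretrained("./t5-ckpt")
    # With this patch, saving now reports where each file landed, e.g.:
    #   tokenizer config file saved in ./t5-ckpt/tokenizer_config.json
    #   Special tokens file saved in ./t5-ckpt/special_tokens_map.json
    #   Copy vocab file to ./t5-ckpt/spiece.model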
...@@ -308,7 +308,7 @@ class PretrainedConfig(object):
         output_config_file = os.path.join(save_directory, CONFIG_NAME)
 
         self.to_json_file(output_config_file, use_diff=True)
-        logger.info("Configuration saved in {}".format(output_config_file))
+        logger.info(f"Configuration saved in {output_config_file}")
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
...
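
The hunk above does not change behavior; it only modernizes the log call from str.format to an f-string, so the emitted message is identical. A quick check (the path is illustrative):

    output_config_file = "/tmp/config.json"  # illustrative path
    # old style
    msg_old = "Configuration saved in {}".format(output_config_file)
    # new style; renders to the same string
    msg_new = f"Configuration saved in {output_config_file}"
    assert msg_old == msg_new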
...@@ -295,5 +295,6 @@ class T5Tokenizer(PreTrainedTokenizer):
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
+            logger.info(f"Copy vocab file to {out_vocab_file}")
 
         return (out_vocab_file,)
...@@ -160,6 +160,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
+            logger.info(f"Copy vocab file to {out_vocab_file}")
 
         return (out_vocab_file,)
...
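
In both the slow and fast T5 tokenizers the new log line sits inside the existing guard, so it only fires when a copy actually happens; saving back into the directory that already holds the vocab file stays silent. A standalone sketch of that pattern (the function name and paths are illustrative, not from the commit):

    import logging
    import os
    from shutil import copyfile

    logger = logging.getLogger(__name__)

    def save_vocab(vocab_file: str, out_vocab_file: str):
        # Copy (and log) only when source and destination differ;
        # shutil.copyfile raises SameFileError when they are the same file.
        if os.path.abspath(vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(vocab_file, out_vocab_file)
            logger.info(f"Copy vocab file to {out_vocab_file}")
        return (out_vocab_file,)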
...@@ -1977,11 +1977,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
         tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)
         with open(tokenizer_config_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(tokenizer_config, ensure_ascii=False))
+        logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
 
         # Sanitize AddedTokens in special_tokens_map
         write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
         with open(special_tokens_map_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(write_dict, ensure_ascii=False))
+        logger.info(f"Special tokens file saved in {special_tokens_map_file}")
 
         file_names = (tokenizer_config_file, special_tokens_map_file)
...@@ -2020,6 +2022,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
             with open(added_tokens_file, "w", encoding="utf-8") as f:
                 out_str = json.dumps(added_vocab, ensure_ascii=False)
                 f.write(out_str)
+            logger.info(f"added tokens file saved in {added_tokens_file}")
 
         vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
...
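
The added-tokens log in the last hunk is conditional: save_pretrained only writes (and now logs) the added-tokens file when the tokenizer actually carries added tokens. A hedged continuation of the earlier sketch ("<new_token>" and the paths are illustrative):

    tokenizer.add_tokens(["<new_token>"])  # give the tokenizer an added token
    tokenizer.save_pretrained("./t5-ckpt")
    # the save now additionally reports, e.g.:
    #   added tokens file saved in ./t5-ckpt/added_tokens.json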