Unverified Commit 9a5b84a0 authored by Yih-Dar, committed by GitHub

Use updated `model_max_length` when saving tokenizers (#20401)



* Use updated values
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent ad654e44
@@ -2082,6 +2082,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             )
         tokenizer_config = copy.deepcopy(self.init_kwargs)
+        # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
+        # target_keys = self.init_kwargs.keys()
+        target_keys = ["model_max_length"]
+        for k in target_keys:
+            if hasattr(self, k):
+                tokenizer_config[k] = getattr(self, k)
         if len(self.init_inputs) > 0:
             tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
         for file_id in self.vocab_files_names.keys():
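For context, a minimal usage sketch (not part of the commit) of the behaviour this change enables: if `model_max_length` is modified after a tokenizer is loaded, `save_pretrained` now writes the updated value into `tokenizer_config.json` rather than the original value kept in `init_kwargs`. The checkpoint name and save path below are illustrative assumptions, not taken from the commit.

    # Sketch assuming a standard transformers installation; any checkpoint works.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.model_max_length = 256  # update the attribute after loading

    # With this change, save_pretrained copies the updated attribute into
    # tokenizer_config.json instead of the stale value from init_kwargs.
    tokenizer.save_pretrained("./saved_tokenizer")

    reloaded = AutoTokenizer.from_pretrained("./saved_tokenizer")
    assert reloaded.model_max_length == 256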