Unverified Commit 9a5b84a0 authored by Yih-Dar, committed by GitHub

Use updated `model_max_length` when saving tokenizers (#20401)



* Use updated values
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent ad654e44
@@ -2082,6 +2082,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             )
         tokenizer_config = copy.deepcopy(self.init_kwargs)
+        # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
+        # target_keys = self.init_kwargs.keys()
+        target_keys = ["model_max_length"]
+        for k in target_keys:
+            if hasattr(self, k):
+                tokenizer_config[k] = getattr(self, k)
         if len(self.init_inputs) > 0:
             tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
         for file_id in self.vocab_files_names.keys():
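For context, a minimal usage sketch (not part of the commit) of the behaviour this change enables: if `model_max_length` is modified after a tokenizer is loaded, `save_pretrained` now writes the updated value into `tokenizer_config.json` rather than the original value kept in `init_kwargs`. The checkpoint name and save path below are illustrative assumptions, not taken from the commit.

    # Sketch assuming a standard transformers installation; any checkpoint works.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.model_max_length = 256  # update the attribute after loading

    # With this change, save_pretrained copies the updated attribute into
    # tokenizer_config.json instead of the stale value from init_kwargs.
    tokenizer.save_pretrained("./saved_tokenizer")

    reloaded = AutoTokenizer.from_pretrained("./saved_tokenizer")
    assert reloaded.model_max_length == 256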