[`Tokenizer Serialization`] Fix the broken serialisation (#27099)

* nits * nits * actual fix * style * ze fix * fix fix fix style

[`Tokenizer Serialization`] Fix the broken serialisation (#27099)
* nits * nits * actual fix * style * ze fix * fix fix fix style
230ac352 · Arthur · GitHub · f4db565b · 230ac352 · 230ac352
Unverified Commit 230ac352 authored Dec 13, 2023 by Arthur Committed by GitHub Dec 13, 2023
Showing with 7 additions and 5 deletions

src/transformers/models/pegasus/tokenization_pegasus_fast.py src/transformers/models/pegasus/tokenization_pegasus_fast.py +2 -0

src/transformers/tokenization_utils_base.py src/transformers/tokenization_utils_base.py +5 -5

No files found.
--- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -145,6 +145,8 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
        from_slow = kwargs.pop("from_slow", None)
        from_slow = from_slow or str(pad_token) != "<pad>" or str(eos_token) != "</s>" or str(unk_token) != "<unk>"

+        kwargs.pop("added_tokens_decoder", {})
+
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,

--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2235,7 +2235,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):

            # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
            # if `tokenizer_config.json` is `None`
-            if "Fast" not in cls.__name__ and tokenizer_file is not None:
+            if tokenizer_file is not None:
                # This is for slow so can be done before
                with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
                    tokenizer_file_handle = json.load(tokenizer_file_handle)
@@ -2247,14 +2247,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
            # end legacy

        # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
+        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
+        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
        for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
            if added_tokens_map != {} and init_kwargs[key] is not None:
                if key != "additional_special_tokens":
-                    init_kwargs[key] = added_tokens_map.get(init_kwargs[key], init_kwargs[key])
+                    init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])

-        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
-        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
-        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
        # Instantiate the tokenizer.
        try:
            tokenizer = cls(*init_inputs, **init_kwargs)