Unverified Commit ef153425 authored by Arthur's avatar Arthur Committed by GitHub
Browse files

[`TokenizerFast`] Fix setting prefix space in __init__ (#25563)

* properly support Sequence of pretokenizers

* actual fix

* make sure the fix works. Tests are not working for sure!

* hacky way

* add TODO

* update

* add a todo
parent 636acc75
...@@ -15,11 +15,9 @@ ...@@ -15,11 +15,9 @@
"""Tokenization classes for Bloom.""" """Tokenization classes for Bloom."""
import json import pickle
from typing import TYPE_CHECKING, List, Optional, Tuple from typing import TYPE_CHECKING, List, Optional, Tuple
from tokenizers import pre_tokenizers
from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging from ...utils import logging
...@@ -130,11 +128,16 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): ...@@ -130,11 +128,16 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
clean_up_tokenization_spaces=clean_up_tokenization_spaces, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: # check this as they were green before.
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
pre_tok_state["add_prefix_space"] = add_prefix_space decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
if add_prefix_space:
pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
self.backend_tokenizer.decoder = pickle.loads(decoder_state)
self.add_prefix_space = add_prefix_space self.add_prefix_space = add_prefix_space
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.