Unverified Commit ef153425 authored by Arthur's avatar Arthur Committed by GitHub
Browse files

[`TokenizerFast`] Fix setting prefix space in __init__ (#25563)

* properly support Sequence of pretokenizers

* actual fix

* make sure the fix works. Tests are not working for sure!

* hacky way

* add TODO

* update

* add a todo
parent 636acc75
...@@ -15,11 +15,9 @@ ...@@ -15,11 +15,9 @@
"""Tokenization classes for Bloom.""" """Tokenization classes for Bloom."""
import json import pickle
from typing import TYPE_CHECKING, List, Optional, Tuple from typing import TYPE_CHECKING, List, Optional, Tuple
from tokenizers import pre_tokenizers
from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging from ...utils import logging
...@@ -130,11 +128,16 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): ...@@ -130,11 +128,16 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
clean_up_tokenization_spaces=clean_up_tokenization_spaces, clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: # check this as they were green before.
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
pre_tok_state["add_prefix_space"] = add_prefix_space decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
if add_prefix_space:
pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
self.backend_tokenizer.decoder = pickle.loads(decoder_state)
self.add_prefix_space = add_prefix_space self.add_prefix_space = add_prefix_space
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.