"vscode:/vscode.git/clone" did not exist on "575c9791445531a5b4a42af0f5028b92489c2669"
Unverified commit ef153425, authored by Arthur, committed by GitHub
Browse files

[`TokenizerFast`] Fix setting prefix space in __init__ (#25563)

* properly support Sequence of pretokenizers

* actual fix

* make sure the fix works. Tests are not working for sure!

* hacky way

* add TODO

* update

* add a todo
parent 636acc75
......@@ -15,11 +15,9 @@
"""Tokenization classes for Bloom."""
import json
import pickle
from typing import TYPE_CHECKING, List, Optional, Tuple
from tokenizers import pre_tokenizers
from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
......@@ -130,11 +128,16 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
# TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
# check this as they were green before.
pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
if add_prefix_space:
pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
decoder_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
self.backend_tokenizer.decoder = pickle.loads(decoder_state)
self.add_prefix_space = add_prefix_space
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment