"...git@developer.sourcefind.cn:chenpangpang/open-webui.git" did not exist on "51b594036d028adf62573aa825473fd10286d7af"
Unverified Commit e20fab0b authored by Arthur's avatar Arthur Committed by GitHub
Browse files

Fix bloom add prefix space (#25652)

* properly support Sequence of pretokenizers

* actual fix

* make sure the fix works. Tests are not working for sure!

* hacky way

* add TODO

* update

* add a todo

* nits

* rename test

* nits

* rename test
parent 62396cff
......@@ -135,7 +135,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
if add_prefix_space:
pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
decoder_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
self.backend_tokenizer.decoder = pickle.loads(decoder_state)
......
......@@ -133,3 +133,10 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# maximum sequence length of the positoonal embeddings.
self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
def test_add_prefix_space_fast(self):
tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment