Unverified commit 7f6e8741, authored by Ita Zaporozhets, committed by GitHub
Browse files

add prefix space ignored in llama #29625 (#30964)



* add prefix space ignored in llama #29625

* adding test with add_prefix_space=False

* ruff

---------
Co-authored-by: Ita Zaporozhets <itazaporozhets@Itas-MBP.localdomain>
parent 6657fb5f
...@@ -163,6 +163,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): ...@@ -163,6 +163,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
add_bos_token=add_bos_token, add_bos_token=add_bos_token,
add_eos_token=add_eos_token, add_eos_token=add_eos_token,
use_default_system_prompt=use_default_system_prompt, use_default_system_prompt=use_default_system_prompt,
add_prefix_space=add_prefix_space,
legacy=legacy, legacy=legacy,
**kwargs, **kwargs,
) )
......
...@@ -602,6 +602,10 @@ class LlamaIntegrationTest(unittest.TestCase): ...@@ -602,6 +602,10 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertEqual(decoded_tokens, "hello") self.assertEqual(decoded_tokens, "hello")
def test_no_prefix_space(self): def test_no_prefix_space(self):
tokenizer_no_prefix_space = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", add_prefix_space=False)
no_prefix_space_tokens = tokenizer_no_prefix_space.tokenize("Hey")
self.assertEqual(no_prefix_space_tokens, ["H", "ey"])
tokenizer = LlamaTokenizerFast.from_pretrained( tokenizer = LlamaTokenizerFast.from_pretrained(
"huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment