Unverified Commit a1844a32 authored by Ita Zaporozhets, committed by GitHub

gguf conversion add_prefix_space=None for llama3 (#31937)

* gguf conversion forces add_prefix_space=False for llama3; this is not required and forces from_slow, which fails. Change it to None and add a test.

* typo

* clean test
parent 2e113422
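For context, the kwarg in question is forwarded when a GGUF checkpoint's tokenizer is loaded. Below is a minimal sketch of the affected call; the repo id and GGUF file name are illustrative only, not taken from this commit:

```python
from transformers import AutoTokenizer

# Illustrative ids only; any Llama-3 GGUF checkpoint hits the same code path.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    gguf_file="Meta-Llama-3-8B.Q4_0.gguf",
)
# Before this commit the converter injected add_prefix_space=False here,
# which forced a from_slow conversion and failed; with None, the fast
# tokenizer built from the GGUF metadata is used directly.
print(tokenizer.decode(tokenizer.encode("สวัสดี")))
```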
```diff
@@ -609,7 +609,7 @@ class GGUFLlamaConverter(LlamaConverter):
         self.additional_kwargs["bos_token"] = eos_token
         if self.is_llama_3_tokenizer:
-            self.additional_kwargs["add_prefix_space"] = False
+            self.additional_kwargs["add_prefix_space"] = None
             self.additional_kwargs["clean_up_tokenization_spaces"] = True
             self.additional_kwargs["legacy"] = False
```
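Why False and None behave differently here: an explicit add_prefix_space value makes the loader rebuild the fast tokenizer via the slow path, while None leaves the GGUF-built fast tokenizer untouched. A hypothetical, paraphrased sketch of that dispatch (not the actual transformers source; rebuild_from_slow is a stand-in name):

```python
def rebuild_from_slow(fast_tokenizer, add_prefix_space):
    # Stand-in for the from_slow conversion path. A Llama-3 GGUF
    # checkpoint ships no slow (SentencePiece) tokenizer to convert
    # from, so this path fails -- the bug this commit fixes.
    raise ValueError("no slow tokenizer available for this checkpoint")

def build_tokenizer(fast_tokenizer, additional_kwargs):
    add_prefix_space = additional_kwargs.get("add_prefix_space")
    if add_prefix_space is not None:
        # An explicit True/False forces the from_slow rebuild.
        return rebuild_from_slow(fast_tokenizer, add_prefix_space)
    return fast_tokenizer  # None: keep the fast tokenizer as-is

# With the patched kwargs, the fast tokenizer passes through unchanged:
assert build_tokenizer("fast", {"add_prefix_space": None}) == "fast"
```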
```diff
@@ -174,10 +174,13 @@ class GgufIntegrationTests(unittest.TestCase):
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
 
     def test_llama3_q4_0_tokenizer(self):
-        tokenizer_gguf = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
+        tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            tokenizer.save_pretrained(tmpdirname)
+            tokenizer = AutoTokenizer.from_pretrained(tmpdirname)
         special_sentence = "สวัสดี"
-        predicted_text = tokenizer_gguf.decode(tokenizer_gguf.encode(special_sentence, return_tensors="pt")[0])
+        predicted_text = tokenizer.decode(tokenizer.encode(special_sentence, return_tensors="pt")[0])
         self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence)
 
     def test_llama3_q4_0(self):
         tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
```
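The reworked test also round-trips the tokenizer through save_pretrained/from_pretrained, so the regression is checked against the serialized tokenizer files, not just the in-memory GGUF conversion. The same pattern outside the unittest harness, with the same illustrative checkpoint ids as above:

```python
import tempfile

from transformers import AutoTokenizer

# Illustrative ids, as above; not taken from this diff.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    gguf_file="Meta-Llama-3-8B.Q4_0.gguf",
)

with tempfile.TemporaryDirectory() as tmpdirname:
    tokenizer.save_pretrained(tmpdirname)  # serialize tokenizer files to disk
    tokenizer = AutoTokenizer.from_pretrained(tmpdirname)  # reload from disk

special_sentence = "สวัสดี"
predicted_text = tokenizer.decode(tokenizer.encode(special_sentence))
assert predicted_text == "<|begin_of_text|>" + special_sentence
```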