Unverified commit c9785d95, authored by Matt and committed by GitHub

Disable default system prompt for LLaMA (#26765)

* Disable default system prompt for LLaMA

* Update test to not expect default prompt
parent 6df9179c
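
For context, this change makes the built-in Llama 2 system prompt opt-in rather than opt-out. A minimal sketch of how a caller restores the old behavior after this commit (the checkpoint name is illustrative, not part of this diff):

from transformers import LlamaTokenizerFast

# Opt back in explicitly; as of this commit the default is False.
tokenizer = LlamaTokenizerFast.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # illustrative checkpoint
    use_default_system_prompt=True,
)
chat = [{"role": "user", "content": "Hello!"}]
ids = tokenizer.apply_chat_template(chat)  # now includes the <<SYS>> block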
@@ -104,7 +104,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
         clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
             Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
             extra spaces.
-        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
             Whether or not the default system prompt for Llama should be used.
         spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
             Whether or not to add spaces between special tokens.
@@ -149,7 +149,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
         add_bos_token=True,
         add_eos_token=False,
         clean_up_tokenization_spaces=False,
-        use_default_system_prompt=True,
+        use_default_system_prompt=False,
         spaces_between_special_tokens=False,
         legacy=None,
         **kwargs,
...
@@ -98,7 +98,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
             Whether or not to add an `bos_token` at the start of sequences.
         add_eos_token (`bool`, *optional*, defaults to `False`):
             Whether or not to add an `eos_token` at the end of sequences.
-        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
             Whether or not the default system prompt for Llama should be used.
     """
@@ -118,7 +118,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
         eos_token="</s>",
         add_bos_token=True,
         add_eos_token=False,
-        use_default_system_prompt=True,
+        use_default_system_prompt=False,
         **kwargs,
     ):
         super().__init__(
...
@@ -615,7 +615,7 @@ class LlamaIntegrationTest(unittest.TestCase):
         expected_tokens = [
             [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962],
             [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2],
-            [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962]
+            [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962]
         ]
         # fmt: on
         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
...
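
Reading the test change: the first two cases pass an explicit system message and keep their <<SYS>> block, so only the third expectation, which relied on the former default prompt, shrinks. A rough sketch of what the new ids encode (tokenizer loading is illustrative, and the decoded string is an approximation):

from transformers import LlamaTokenizerFast

tokenizer = LlamaTokenizerFast.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf"  # illustrative checkpoint
)

# The shortened expectation should decode to roughly "<s>[INST] Hello! [/INST]",
# with no <<SYS>> ... <</SYS>> header preceding the user turn.
new_ids = [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962]
print(tokenizer.decode(new_ids))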