Unverified commit edd68f4e authored by Matt, committed by GitHub

🚨 No more default chat templates (#31733)

* No more default chat templates

* Add the template to the GPT-SW3 tests since it's not available by default now

* Fix GPT2 test

* Fix Bloom test

* Fix Bloom test

* Remove default templates again
parent 1c122a46
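For downstream users, the practical effect of this commit is that `apply_chat_template` now requires a template to be set explicitly or shipped with the checkpoint. A minimal sketch of the migration, using `gpt2` as an illustrative checkpoint that ships no chat template:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# After this change, calling apply_chat_template() with no template set raises a
# ValueError instead of silently falling back to a class-level default.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{ message['content'] }}{{ eos_token }}"
    "{% endfor %}"
)

chat = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
]
print(tokenizer.apply_chat_template(chat, tokenize=False))
# -> Hello!<|endoftext|>Hi there.<|endoftext|>
```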
@@ -241,61 +241,6 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
         return (out_vocab_file,)
 
-    @property
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template
-    def default_chat_template(self):
-        """
-        LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
-        Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
-        user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
-        rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
-        results in an unusual token ordering when it is present. This template should definitely be changed if you wish
-        to fine-tune a model with more flexible role ordering!
-
-        The output should look something like:
-
-        <bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
-        <bos>[INST] Prompt [/INST]
-
-        The reference for this chat template is [this code
-        snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
-        in the original repository.
-        """
-        template = (
-            "{% if messages[0]['role'] == 'system' %}"
-            "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present
-            "{% set system_message = messages[0]['content'] %}"
-            "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
-            "{% set loop_messages = messages %}"  # Or use the default system message if the flag is set
-            "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
-            "{% else %}"
-            "{% set loop_messages = messages %}"
-            "{% set system_message = false %}"
-            "{% endif %}"
-            "{% for message in loop_messages %}"  # Loop over all non-system messages
-            "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
-            "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
-            "{% endif %}"
-            "{% if loop.index0 == 0 and system_message != false %}"  # Embed system message in first message
-            "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
-            "{% else %}"
-            "{% set content = message['content'] %}"
-            "{% endif %}"
-            "{% if message['role'] == 'user' %}"  # After all of that, handle messages/roles in a fairly normal way
-            "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
-            "{% elif message['role'] == 'system' %}"
-            "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
-            "{% elif message['role'] == 'assistant' %}"
-            "{{ ' ' + content.strip() + ' ' + eos_token }}"
-            "{% endif %}"
-            "{% endfor %}"
-        )
-        template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
-        default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
-        template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
-        return template
-
     # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
     # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
...
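For reference, a standalone sketch that renders a trimmed version of the removed template above with plain `jinja2` (the default-system-prompt branch and alternation check are omitted, and the bos/eos strings are illustrative):

```python
from jinja2 import Environment

# Trimmed reconstruction of the removed LLaMA chat template.
template = Environment().from_string(
    "{% if messages[0]['role'] == 'system' %}"
    "{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}"
    "{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}"
    "{% for message in loop_messages %}"
    "{% if loop.index0 == 0 and system_message != false %}"
    "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
    "{% else %}{% set content = message['content'] %}{% endif %}"
    "{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
    "{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}"
    "{% endfor %}"
)

print(template.render(
    messages=[
        {"role": "system", "content": "You are helpful."},
        {"role": "user", "content": "Hi!"},
    ],
    bos_token="<s>",
    eos_token="</s>",
))
# -> <s>[INST] <<SYS>>
#    You are helpful.
#    <</SYS>>
#
#    Hi! [/INST]
```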
@@ -159,63 +159,3 @@ class LlavaNextVideoProcessor(ProcessorMixin):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
-
-    @property
-    def default_chat_template(self):
-        """
-        This default Vicuna template formats inputs in the form of a chat history. For each message in the chat history:
-
-        * the template outputs the role of the speaker followed by the content of the message.
-        * content is a list of text, image, and video chunks.
-        * If a content element is an image or video, the template outputs a sequence of <image> or <video> tokens respectively.
-
-        Example:
-
-        ```python
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What's the content of this video?"},
-                    {"type": "video"},
-                ],
-            },
-            {
-                "role": "assistant",
-                "content": [{"type": "text", "text": "This picture shows a red stop sign."}],
-            },
-        ]
-        ```
-
-        Will create outputs like:
-
-        ```
-        USER: <video>\nWhat's the content of this video?
-        ASSISTANT: This picture shows a red stop sign.
-        ```
-        """
-        # fmt: off
-        return (
-            "{% for message in messages %}"
-            "{% if message['role'] == 'system' %}"
-            "{{ message['content'][0]['text'] }}"
-            "{% else %}"
-            "{{ message['role'].upper() + ': ' }}"
-            "{% endif %}"
-            "{# Render all images first #}"
-            "{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
-            "{{ '<image>\n' }}"
-            "{% endfor %}"
-            "{# Render all videos next #}"
-            "{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}"
-            "{{ '<video>\n' }}"
-            "{% endfor %}"
-            "{# Render all text finally #}"
-            "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
-            "{{ content['text'] + ' ' }}"
-            "{% endfor %}"
-            "{% endfor %}"
-            "{% if add_generation_prompt %}"
-            "{{ 'ASSISTANT:' }}"
-            "{% endif %}"
-        )
-        # fmt: on
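Since the processor no longer falls back to this property, callers can supply the same template explicitly through the `chat_template` argument instead. A sketch, with an illustrative checkpoint name and the template abbreviated to its video and text branches:

```python
from transformers import LlavaNextVideoProcessor

# Illustrative checkpoint; recent Hub checkpoints save their own chat template.
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")

vicuna_template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'system' %}{{ message['content'][0]['text'] }}"
    "{% else %}{{ message['role'].upper() + ': ' }}{% endif %}"
    "{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>\n' }}{% endfor %}"
    "{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' ' }}{% endfor %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}ASSISTANT:{% endif %}"
)

messages = [{"role": "user", "content": [{"type": "text", "text": "What's in this video?"}, {"type": "video"}]}]
prompt = processor.apply_chat_template(messages, chat_template=vicuna_template, add_generation_prompt=True)
print(prompt)  # USER: <video>\nWhat's in this video? ASSISTANT:
```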
@@ -810,14 +810,6 @@ class WhisperTokenizer(PreTrainedTokenizer):
             text = " " + text
         return (text, kwargs)
 
-    @property
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template
-    def default_chat_template(self):
-        """
-        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
-        """
-        return "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
-
     def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
         # prefix tokens are of the form: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
...
@@ -539,14 +539,6 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
             return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
         return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
 
-    @property
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template
-    def default_chat_template(self):
-        """
-        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
-        """
-        return "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
-
     # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids
     def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
...
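Both Whisper tokenizers inherited the same role-agnostic template from GPT-2; as a sketch, it is equivalent to this plain-Python concatenation:

```python
def concat_with_eos(messages, eos_token):
    # Equivalent of the removed template: ignore roles, append EOS to each content.
    return "".join(message["content"] + eos_token for message in messages)

print(concat_with_eos([{"role": "user", "content": "Hi"}], "<|endoftext|>"))
# -> Hi<|endoftext|>
```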
@@ -971,8 +971,8 @@ class ProcessorMixin(PushToHubMixin):
             conversation (`List[Dict, str, str]`):
                 The conversation to format.
             chat_template (`Optional[str]`, *optional*):
-                The Jinja template to use for formatting the conversation. If not provided, the default chat template
-                is used.
+                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
+                chat template is used.
             tokenize (`bool`, *optional*, defaults to `False`):
                 Whether to tokenize the output or not.
             **kwargs:
@@ -982,15 +982,6 @@
         if chat_template is None:
             if self.chat_template is not None:
                 chat_template = self.chat_template
-            elif getattr(self, "default_chat_template", None) is not None:
-                logger.warning_once(
-                    "No chat template is set for this processor, falling back to a default class-level template. This is "
-                    "very error-prone, because models are often trained with templates different from the class default! "
-                    "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
-                    "point any code depending on them will stop working. We recommend setting a valid chat template before "
-                    "then to ensure that this model continues working without issues."
-                )
-                chat_template = self.default_chat_template
             else:
                 raise ValueError(
                     "No chat template is set for this processor. Please either set the `chat_template` attribute, "
...
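With the fallback gone, a processor that has no saved template now fails fast. A sketch of the new failure mode (the checkpoint name is illustrative, and clearing the attribute simulates a checkpoint that ships no template):

```python
from transformers import LlavaNextVideoProcessor

processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
processor.chat_template = None  # simulate a checkpoint without a saved template

try:
    processor.apply_chat_template([{"role": "user", "content": [{"type": "text", "text": "Hi"}]}])
except ValueError as err:
    print(err)  # "No chat template is set for this processor. ..."
```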
@@ -1704,8 +1704,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         """
         Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
         ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
-        determine the format and control tokens to use when converting. When chat_template is None, it will fall back
-        to the default_chat_template specified at the class level.
+        determine the format and control tokens to use when converting.
 
         Args:
             conversation (Union[List[Dict[str, str]], List[List[Dict[str, str]]]]): A list of dicts
@@ -1986,22 +1985,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Returns:
             `str`: The chat template string.
         """
-        using_default_template = False
         # First, handle the cases when the model has a dict of multiple templates
-        if isinstance(self.chat_template, dict) or (
-            self.chat_template is None and isinstance(self.default_chat_template, dict)
-        ):
-            if self.chat_template is not None:
-                template_dict = self.chat_template
-                using_default_dict = False
-            else:
-                template_dict = self.default_chat_template
-                using_default_dict = True
+        if isinstance(self.chat_template, dict):
+            template_dict = self.chat_template
             if chat_template is not None and chat_template in template_dict:
                 # The user can pass the name of a template to the chat template argument instead of an entire template
                 chat_template = template_dict[chat_template]
-                if using_default_dict:
-                    using_default_template = True
             elif chat_template is None:
                 if tools is not None and "tool_use" in template_dict:
                     chat_template = template_dict["tool_use"]
@@ -2013,44 +2002,23 @@
                     "template or the name of the template you wish to use to the `chat_template` argument. Available "
                     f"template names are {sorted(template_dict.keys())}."
                 )
-                if using_default_dict:
-                    using_default_template = True
         elif chat_template is None:
             # These are the cases when the model has a single template
-            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
+            # priority: `chat_template` argument > `tokenizer.chat_template`
             if self.chat_template is not None:
                 chat_template = self.chat_template
-            else:
-                chat_template = self.default_chat_template
-                using_default_template = True
-
-        if using_default_template:
-            logger.warning_once(
-                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
-                "very error-prone, because models are often trained with templates different from the class default! "
-                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
-                "point any code depending on them will stop working. We recommend setting a valid chat template before "
-                "then to ensure that this model continues working without issues."
-            )
+            else:
+                raise ValueError(
+                    "Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template "
+                    "argument was passed! For information about writing templates and setting the "
+                    "tokenizer.chat_template attribute, please see the documentation at "
+                    "https://huggingface.co/docs/transformers/main/en/chat_templating"
+                )
 
         return chat_template
 
-    @property
-    def default_chat_template(self):
-        """
-        This template formats inputs in the standard ChatML format. See
-        https://github.com/openai/openai-python/blob/main/chatml.md
-        """
-        return (
-            "{% for message in messages %}"
-            "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}"
-            "{% endfor %}"
-            "{% if add_generation_prompt %}"
-            "{{ '<|im_start|>assistant\n' }}"
-            "{% endif %}"
-        )
-
     @classmethod
     def from_pretrained(
         cls,
...
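Code that relied on the removed class-level ChatML fallback can keep working by setting the template once and saving it with the tokenizer. A sketch, with the ChatML string copied from the removed property and an illustrative checkpoint and output directory:

```python
from transformers import AutoTokenizer

# ChatML template, as returned by the removed base-class property.
chatml = (
    "{% for message in messages %}"
    "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
tokenizer.chat_template = chatml
tokenizer.save_pretrained("gpt2-with-chatml")  # the template persists in tokenizer_config.json
```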
@@ -135,6 +135,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     @require_jinja
     def test_tokenization_for_chat(self):
         tokenizer = self.get_rust_tokenizer()
+        tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
         test_chats = [
             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
             [
...
@@ -280,6 +280,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     @require_jinja
     def test_tokenization_for_chat(self):
         tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname)
+        tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
         test_chats = [
             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
             [
...
@@ -131,6 +131,15 @@ class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     @require_jinja
     def test_tokenization_for_chat(self):
         tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB)
+        tokenizer.chat_template = (
+            "{{ eos_token }}{{ bos_token }}"
+            "{% for message in messages %}"
+            "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
+            "{% else %}{{ 'Bot: ' + message['content'] }}{% endif %}"
+            "{{ message['text'] }}{{ bos_token }}"
+            "{% endfor %}"
+            "Bot:"
+        )
         # This is in English, but it's just here to make sure the chat control tokens are being added properly
         test_chats = [
             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
...