Unverified commit edd68f4e authored by Matt, committed by GitHub

🚨 No more default chat templates (#31733)

* No more default chat templates

* Add the template to the GPT-SW3 tests since it's not available by default now

* Fix GPT2 test

* Fix Bloom test

* Fix Bloom test

* Remove default templates again
parent 1c122a46
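Migration note: code that previously relied on a class-level default should now set `chat_template` explicitly before calling `apply_chat_template`. A minimal sketch under stated assumptions (the checkpoint name is illustrative; the template string reproduces the old GPT-2-style default):

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; the point is that the template must now be set by hand.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reproduces the removed GPT-2 default: concatenate messages with EOS tokens.
tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"

chat = [{"role": "user", "content": "Hello!"}]
ids = tokenizer.apply_chat_template(chat, tokenize=True)

# Saving persists the template in tokenizer_config.json, so later loads need no fallback.
tokenizer.save_pretrained("./gpt2-with-template")
```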
@@ -241,61 +241,6 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
return (out_vocab_file,)
@property
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template
def default_chat_template(self):
"""
LLaMA uses [INST] and [/INST] to indicate user messages, and <<SYS>> and <</SYS>> to indicate system messages.
Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict
user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering
rather than needing special tokens. The system message is partly 'embedded' in the first user message, which
results in an unusual token ordering when it is present. This template should definitely be changed if you wish
to fine-tune a model with more flexible role ordering!
The output should look something like:
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
The reference for this chat template is [this code
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository.
"""
template = (
"{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present
"{% set system_message = messages[0]['content'] %}"
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}"
"{% set loop_messages = messages %}" # Or use the default system message if the flag is set
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}"
"{% else %}"
"{% set loop_messages = messages %}"
"{% set system_message = false %}"
"{% endif %}"
"{% for message in loop_messages %}" # Loop over all non-system messages
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
"{% endif %}"
"{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}"
"{% else %}"
"{% set content = message['content'] %}"
"{% endif %}"
"{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}"
"{% elif message['role'] == 'system' %}"
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}"
"{% elif message['role'] == 'assistant' %}"
"{{ ' ' + content.strip() + ' ' + eos_token }}"
"{% endif %}"
"{% endfor %}"
)
template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false")
default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'")
template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message)
return template
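After this removal, the template must live on the checkpoint itself. A hedged usage sketch (the Llama 2 chat checkpoints on the Hub ship this template in `tokenizer_config.json`; the repo name below is illustrative and gated):

```python
from transformers import AutoTokenizer

# Gated, illustrative checkpoint; its saved template matches the one removed above.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

messages = [
    {"role": "system", "content": "You are a pirate."},
    {"role": "user", "content": "Hi!"},
]
text = tokenizer.apply_chat_template(messages, tokenize=False)
# Expected shape: '<s>[INST] <<SYS>>\nYou are a pirate.\n<</SYS>>\n\nHi! [/INST]'
```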
# TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -159,63 +159,3 @@ class LlavaNextVideoProcessor(ProcessorMixin):
tokenizer_input_names = self.tokenizer.model_input_names
image_processor_input_names = self.image_processor.model_input_names
return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
@property
def default_chat_template(self):
"""
This default vicuna template formats inputs in the form of a chat history. For each message in the chat history:
* the template will output the role of the speaker followed by the content of the message.
* content is a list of strings and images.
* If the content element is an image or a video, the template will output a sequence of `<image>` or `<video>` tokens.
Example:
```python
messages = [{
"role": "user",
"content": [
{"type": "text", "text": "What’s the content of this video?"},
{"type": "video"},
],
},
{
"role": "assistant",
"content": [{"type": "text", "text": "This picture shows a red stop sign."},]
}]
```
Will create outputs like:
```
USER: <video>\nWhat’s the content of this video?
ASSISTANT: This picture shows a red stop sign.
```
"""
# fmt: off
return (
"{% for message in messages %}"
"{% if message['role'] == 'system' %}"
"{{ message['content'][0]['text'] }}"
"{% else %}"
"{{ message['role'].upper() + ': '}}"
"{% endif %}"
"{# Render all images first #}"
"{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}"
"{{ '<image>\n' }}"
"{% endfor %}"
"{# Render all videos next #}"
"{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}"
"{{ '<video>\n' }}"
"{% endfor %}"
"{# Render all text finally #}"
"{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}"
"{{ content['text'] + ' '}}"
"{% endfor %}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"{{ 'ASSISTANT:' }}"
"{% endif %}"
)
# fmt: on
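A hedged sketch of the replacement workflow: load a checkpoint whose processor config saves its own chat template (the repo name is illustrative), then call `apply_chat_template` as before:

```python
from transformers import LlavaNextVideoProcessor

# Illustrative checkpoint; current Hub repos save their own chat template,
# so the class-level default removed above is no longer needed.
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")

messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "What’s the content of this video?"},
        {"type": "video"},
    ],
}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# Expected shape: "USER: <video>\nWhat’s the content of this video? ASSISTANT:"
```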
@@ -810,14 +810,6 @@ class WhisperTokenizer(PreTrainedTokenizer):
text = " " + text
return (text, kwargs)
@property
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template
def default_chat_template(self):
"""
A simple chat template that ignores role information and just concatenates messages with EOS tokens.
"""
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
# prefix tokens are of the form: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
@@ -539,63 +539,3 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
@property
# Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template
def default_chat_template(self):
"""
A simple chat template that ignores role information and just concatenates messages with EOS tokens.
"""
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
@@ -971,8 +971,8 @@ class ProcessorMixin(PushToHubMixin):
conversation (`List[Dict[str, str]]`):
The conversation to format.
chat_template (`Optional[str]`, *optional*):
The Jinja template to use for formatting the conversation. If not provided, the default chat template
is used.
The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
chat template is used.
tokenize (`bool`, *optional*, defaults to `False`):
Whether to tokenize the output or not.
**kwargs:
@@ -982,15 +982,6 @@ class ProcessorMixin(PushToHubMixin):
if chat_template is None:
if self.chat_template is not None:
chat_template = self.chat_template
elif getattr(self, "default_chat_template", None) is not None:
logger.warning_once(
"No chat template is set for this processor, falling back to a default class-level template. This is "
"very error-prone, because models are often trained with templates different from the class default! "
"Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
"point any code depending on them will stop working. We recommend setting a valid chat template before "
"then to ensure that this model continues working without issues."
)
chat_template = self.default_chat_template
else:
raise ValueError(
"No chat template is set for this processor. Please either set the `chat_template` attribute, "
@@ -1704,8 +1704,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
"""
Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
determine the format and control tokens to use when converting. When chat_template is None, it will fall back
to the default_chat_template specified at the class level.
determine the format and control tokens to use when converting.
Args:
conversation (Union[List[Dict[str, str]], List[List[Dict[str, str]]]]): A list of dicts
@@ -1986,22 +1985,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
Returns:
`str`: The chat template string.
"""
using_default_template = False
# First, handle the cases when the model has a dict of multiple templates
if isinstance(self.chat_template, dict) or (
self.chat_template is None and isinstance(self.default_chat_template, dict)
):
if self.chat_template is not None:
template_dict = self.chat_template
using_default_dict = False
else:
template_dict = self.default_chat_template
using_default_dict = True
if isinstance(self.chat_template, dict):
template_dict = self.chat_template
if chat_template is not None and chat_template in template_dict:
# The user can pass the name of a template to the chat template argument instead of an entire template
chat_template = template_dict[chat_template]
if using_default_dict:
using_default_template = True
elif chat_template is None:
if tools is not None and "tool_use" in template_dict:
chat_template = template_dict["tool_use"]
@@ -2013,44 +2002,23 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
"template or the name of the template you wish to use to the `chat_template` argument. Available "
f"template names are {sorted(template_dict.keys())}."
)
if using_default_dict:
using_default_template = True
elif chat_template is None:
# These are the cases when the model has a single template
# priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
# priority: `chat_template` argument > `tokenizer.chat_template`
if self.chat_template is not None:
chat_template = self.chat_template
else:
chat_template = self.default_chat_template
using_default_template = True
if using_default_template:
logger.warning_once(
"No chat template is set for this tokenizer, falling back to a default class-level template. This is "
"very error-prone, because models are often trained with templates different from the class default! "
"Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
"point any code depending on them will stop working. We recommend setting a valid chat template before "
"then to ensure that this model continues working without issues."
)
else:
raise ValueError(
"Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template "
"argument was passed! For information about writing templates and setting the "
"tokenizer.chat_template attribute, please see the documentation at "
"https://huggingface.co/docs/transformers/main/en/chat_templating"
)
return chat_template
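The dict branch above enables named templates. A hedged sketch of selection by name (the checkpoint name and template strings are toy examples):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative

# chat_template may be a dict mapping template names to template strings.
tokenizer.chat_template = {
    "default": "{% for m in messages %}{{ m['content'] }}{{ eos_token }}{% endfor %}",
    "tool_use": "{% for m in messages %}[TOOL] {{ m['content'] }}{{ eos_token }}{% endfor %}",
}

chat = [{"role": "user", "content": "Hello!"}]

tokenizer.apply_chat_template(chat, tokenize=False)                            # uses "default"
tokenizer.apply_chat_template(chat, chat_template="tool_use", tokenize=False)  # selects by name
```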
@property
def default_chat_template(self):
"""
This template formats inputs in the standard ChatML format. See
https://github.com/openai/openai-python/blob/main/chatml.md
"""
return (
"{% for message in messages %}"
"{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
"{% endfor %}"
"{% if add_generation_prompt %}"
"{{ '<|im_start|>assistant\n' }}"
"{% endif %}"
)
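For reference, the removed ChatML default rendered standalone (a sketch using Jinja2 directly, outside of any tokenizer):

```python
from jinja2 import Environment

chatml = Environment().from_string(
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)
print(chatml.render(messages=[{"role": "user", "content": "Hello!"}], add_generation_prompt=True))
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```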
@classmethod
def from_pretrained(
cls,
@@ -135,6 +135,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@require_jinja
def test_tokenization_for_chat(self):
tokenizer = self.get_rust_tokenizer()
tokenizer.chat_template = "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
[
@@ -280,6 +280,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@require_jinja
def test_tokenization_for_chat(self):
tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname)
tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
[
@@ -131,6 +131,15 @@ class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@require_jinja
def test_tokenization_for_chat(self):
tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB)
tokenizer.chat_template = (
"{{ eos_token }}{{ bos_token }}"
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
"{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
"{{ message['text'] }}{{ bos_token }}"
"{% endfor %}"
"Bot:"
)
# This is in English, but it's just here to make sure the chat control tokens are being added properly
test_chats = [
[{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],