"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "314bc6bb4e4bd2ede16cd7c04b3b2a419611d190"
Unverified Commit d933818d authored by Matt, committed by GitHub
Browse files

Add default template warning (#26637)

* Add default template warnings

* make fixup

* Move warnings to FutureWarning

* Move warnings to FutureWarning

* fix make fixup

* Remove futurewarning
parent de55ead1
...@@ -423,6 +423,12 @@ class BlenderbotTokenizer(PreTrainedTokenizer): ...@@ -423,6 +423,12 @@ class BlenderbotTokenizer(PreTrainedTokenizer):
""" """
A very simple chat template that just adds whitespace between messages. A very simple chat template that just adds whitespace between messages.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{% for message in messages %}" "{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
......
...@@ -305,6 +305,12 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast): ...@@ -305,6 +305,12 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
""" """
A very simple chat template that just adds whitespace between messages. A very simple chat template that just adds whitespace between messages.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{% for message in messages %}" "{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
......
...@@ -242,6 +242,12 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer): ...@@ -242,6 +242,12 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
""" """
A very simple chat template that just adds whitespace between messages. A very simple chat template that just adds whitespace between messages.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{% for message in messages %}" "{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
......
...@@ -124,6 +124,12 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast): ...@@ -124,6 +124,12 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
""" """
A very simple chat template that just adds whitespace between messages. A very simple chat template that just adds whitespace between messages.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{% for message in messages %}" "{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
......
...@@ -168,4 +168,10 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): ...@@ -168,4 +168,10 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
""" """
A simple chat template that ignores role information and just concatenates messages with EOS tokens. A simple chat template that ignores role information and just concatenates messages with EOS tokens.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
...@@ -469,7 +469,12 @@ class CodeLlamaTokenizer(PreTrainedTokenizer): ...@@ -469,7 +469,12 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository. in the original repository.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = ( template = (
"{% if messages[0]['role'] == 'system' %}" "{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
......
...@@ -367,7 +367,12 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast): ...@@ -367,7 +367,12 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository. in the original repository.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = ( template = (
"{% if messages[0]['role'] == 'system' %}" "{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
......
...@@ -363,4 +363,10 @@ class GPT2Tokenizer(PreTrainedTokenizer): ...@@ -363,4 +363,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
""" """
A simple chat template that ignores role information and just concatenates messages with EOS tokens. A simple chat template that ignores role information and just concatenates messages with EOS tokens.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
...@@ -181,4 +181,10 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast): ...@@ -181,4 +181,10 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
""" """
A simple chat template that ignores role information and just concatenates messages with EOS tokens. A simple chat template that ignores role information and just concatenates messages with EOS tokens.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
...@@ -135,4 +135,10 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast): ...@@ -135,4 +135,10 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
""" """
A simple chat template that ignores role information and just concatenates messages with EOS tokens. A simple chat template that ignores role information and just concatenates messages with EOS tokens.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
...@@ -180,6 +180,12 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer): ...@@ -180,6 +180,12 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
""" """
A simple chat template that just adds BOS/EOS tokens around messages while discarding role information. A simple chat template that just adds BOS/EOS tokens around messages while discarding role information.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{% for message in messages %}" "{% for message in messages %}"
"{{ bos_token + eos_token + message.content + eos_token }}" "{{ bos_token + eos_token + message.content + eos_token }}"
......
...@@ -321,6 +321,12 @@ class GPTSw3Tokenizer(PreTrainedTokenizer): ...@@ -321,6 +321,12 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings
preceding messages. BOS tokens are added between all messages. preceding messages. BOS tokens are added between all messages.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{{ eos_token }}{{ bos_token }}" "{{ eos_token }}{{ bos_token }}"
"{% for message in messages %}" "{% for message in messages %}"
......
...@@ -261,6 +261,12 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer): ...@@ -261,6 +261,12 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role
information. information.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{% for message in messages %}" "{% for message in messages %}"
"{% if not loop.first %}{{ bos_token}}{% endif %}" "{% if not loop.first %}{{ bos_token}}{% endif %}"
......
...@@ -430,7 +430,12 @@ class LlamaTokenizer(PreTrainedTokenizer): ...@@ -430,7 +430,12 @@ class LlamaTokenizer(PreTrainedTokenizer):
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository. in the original repository.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = ( template = (
"{% if messages[0]['role'] == 'system' %}" "{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
......
...@@ -224,7 +224,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): ...@@ -224,7 +224,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository. in the original repository.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = ( template = (
"{% if messages[0]['role'] == 'system' %}" "{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
......
...@@ -795,6 +795,12 @@ class WhisperTokenizer(PreTrainedTokenizer): ...@@ -795,6 +795,12 @@ class WhisperTokenizer(PreTrainedTokenizer):
""" """
A simple chat template that ignores role information and just concatenates messages with EOS tokens. A simple chat template that ignores role information and just concatenates messages with EOS tokens.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True): def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
......
...@@ -563,6 +563,12 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast): ...@@ -563,6 +563,12 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
""" """
A simple chat template that ignores role information and just concatenates messages with EOS tokens. A simple chat template that ignores role information and just concatenates messages with EOS tokens.
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
# Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids
......
...@@ -1780,6 +1780,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): ...@@ -1780,6 +1780,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
This template formats inputs in the standard ChatML format. See This template formats inputs in the standard ChatML format. See
https://github.com/openai/openai-python/blob/main/chatml.md https://github.com/openai/openai-python/blob/main/chatml.md
""" """
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using a default chat template "
"that implements the ChatML format. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return ( return (
"{% for message in messages %}" "{% for message in messages %}"
"{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment