Add default template warning (#26637)

* Add default template warnings * make fixup * Move warnings to FutureWarning * Move warnings to FutureWarning * fix make fixup * Remove futurewarning

Add default template warning (#26637)
* Add default template warnings * make fixup * Move warnings to FutureWarning * Move warnings to FutureWarning * fix make fixup * Remove futurewarning
d933818d · Matt · GitHub · de55ead1 · d933818d · d933818d
Unverified Commit d933818d authored Oct 18, 2023 by Matt Committed by GitHub Oct 18, 2023
18 changed files
--- a/src/transformers/models/blenderbot/tokenization_blenderbot.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -423,6 +423,12 @@ class BlenderbotTokenizer(PreTrainedTokenizer):
        """
        A very simple chat template that just adds whitespace between messages.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

--- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
@@ -305,6 +305,12 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
        """
        A very simple chat template that just adds whitespace between messages.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -242,6 +242,12 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
        """
        A very simple chat template that just adds whitespace between messages.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
@@ -124,6 +124,12 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
        """
        A very simple chat template that just adds whitespace between messages.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

--- a/src/transformers/models/bloom/tokenization_bloom_fast.py
+++ b/src/transformers/models/bloom/tokenization_bloom_fast.py
@@ -168,4 +168,10 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
--- a/src/transformers/models/code_llama/tokenization_code_llama.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama.py
@@ -469,7 +469,12 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
        snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
        in the original repository.
        """
-
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        template = (
            "{% if messages[0]['role'] == 'system' %}"
            "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present

--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -367,7 +367,12 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
        snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
        in the original repository.
        """
-
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        template = (
            "{% if messages[0]['role'] == 'system' %}"
            "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present

--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -363,4 +363,10 @@ class GPT2Tokenizer(PreTrainedTokenizer):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -181,4 +181,10 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
--- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
+++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
@@ -135,4 +135,10 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"
--- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -180,6 +180,12 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
        """
        A simple chat template that just adds BOS/EOS tokens around messages while discarding role information.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{% for message in messages %}"
            "{{ bos_token + eos_token + message.content + eos_token }}"

--- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
+++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
@@ -321,6 +321,12 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
        This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings
        preceding messages. BOS tokens are added between all messages.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{{ eos_token }}{{ bos_token }}"
            "{% for message in messages %}"

--- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
+++ b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -261,6 +261,12 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
        A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role
        information.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{% for message in messages %}"
            "{% if not loop.first %}{{ bos_token}}{% endif %}"

--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -430,7 +430,12 @@ class LlamaTokenizer(PreTrainedTokenizer):
        snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
        in the original repository.
        """
-
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        template = (
            "{% if messages[0]['role'] == 'system' %}"
            "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present

--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -224,7 +224,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
        snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
        in the original repository.
        """
-
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        template = (
            "{% if messages[0]['role'] == 'system' %}"
            "{% set loop_messages = messages[1:] %}"  # Extract system message if it's present

--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -795,6 +795,12 @@ class WhisperTokenizer(PreTrainedTokenizer):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):

--- a/src/transformers/models/whisper/tokenization_whisper_fast.py
+++ b/src/transformers/models/whisper/tokenization_whisper_fast.py
@@ -563,6 +563,12 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using the default template "
+            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids

--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1780,6 +1780,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        This template formats inputs in the standard ChatML format. See
        https://github.com/openai/openai-python/blob/main/chatml.md
        """
+        logger.warning_once(
+            "\nNo chat template is defined for this tokenizer - using a default chat template "
+            "that implements the ChatML format. If the default is not appropriate for "
+            "your model, please set `tokenizer.chat_template` to an appropriate template. "
+            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
+        )
        return (
            "{% for message in messages %}"
            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"