test_template.py

if __name__ == '__main__':
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import os
import unittest

import torch
from modelscope import GenerationConfig

from swift.llm import (ModelType, get_default_template_type, get_model_tokenizer, get_template, inference,
                       messages_to_history)

SKPT_TEST = True


class TestTemplate(unittest.TestCase):

    def test_template(self):
        model_types = [ModelType.qwen_7b_chat_int4]
        for model_type in model_types:
            _, tokenizer = get_model_tokenizer(model_type, load_model=False)
            template_type = get_default_template_type(model_type)
            template = get_template(template_type, tokenizer)
            history = [
                ('你好，你是谁？', '我是来自达摩院的大规模语言模型，我叫通义千问。'),
            ]
            data = {
                'system': 'you are a helpful assistant!',
                'query': '浙江的省会在哪？',
                'response': '浙江的省会是杭州。',
                'history': history
            }
            from swift.llm import print_example
            print_example(template.encode(data)[0], tokenizer)
            input_ids = template.encode(data)[0]['input_ids']
            print(model_type)
            text = tokenizer.decode(input_ids)
            result = """<|im_start|>system
you are a helpful assistant!<|im_end|>
<|im_start|>user
你好，你是谁？<|im_end|>
<|im_start|>assistant
我是来自达摩院的大规模语言模型，我叫通义千问。<|im_end|>
<|im_start|>user
浙江的省会在哪？<|im_end|>
<|im_start|>assistant
浙江的省会是杭州。<|im_end|>"""
            self.assertTrue(result == text)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_chatglm3_template(self):
        model_type = ModelType.chatglm3_6b
        template_type = get_default_template_type(model_type)
        model, tokenizer = get_model_tokenizer(model_type, load_model=True)
        template = get_template(template_type, tokenizer)
        model.generation_config = GenerationConfig(
            max_new_tokens=128,
            temperature=0.9,
            top_k=20,
            top_p=0.9,
            repetition_penalt=1.05,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id)
        query = '12345+234=？'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        system = 'you are a helpful assistant!'
        response = model.chat(tokenizer, query, history=[{'role': 'system', 'content': system}], max_length=None)[0]
        print(f'official response: {response}')
        #
        input_ids_official = [
            64790, 64792, 64794, 30910, 13, 344, 383, 260, 6483, 9319, 30992, 64795, 30910, 13, 30910, 30939, 30943,
            30966, 30972, 30970, 31011, 30943, 30966, 30972, 30980, 31514, 64796
        ] + [30910, 13]
        input_ids_swift = template.encode({'query': query, 'system': system})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_qwen_template(self):
        model_type = ModelType.qwen_7b_chat
        template_type = get_default_template_type(model_type)
        model, tokenizer = get_model_tokenizer(model_type, load_model=True)
        template = get_template(template_type, tokenizer)
        query = '12345+234=？'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        model.generation_config.chat_format = 'chatml'
        model.generation_config.max_window_size = 1024
        response = model.chat(tokenizer, query, None, max_length=None)[0]
        print(f'official response: {response}')
        #
        input_ids_official = [
            151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 16, 17, 18, 19, 20, 10,
            17, 18, 19, 28, 11319, 151645, 198, 151644, 77091, 198
        ]
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_llama_template(self):
        model_type = ModelType.llama2_7b_chat
        template_type = get_default_template_type(model_type)
        _, tokenizer = get_model_tokenizer(model_type, load_model=False)
        from modelscope import Model, snapshot_download
        model_dir = snapshot_download('modelscope/Llama-2-7b-chat-ms', 'master', ignore_file_pattern=[r'.+\.bin$'])
        model = Model.from_pretrained(model_dir, device_map='auto')
        template = get_template(template_type, tokenizer)
        model.generation_config = GenerationConfig(
            max_new_tokens=128,
            temperature=0.9,
            top_k=20,
            top_p=0.9,
            repetition_penalt=1.05,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id)
        query = '12345+234=？'
        print(f'query: {query}')
        template.use_default_system = False
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        response = model.chat({'text': query}, tokenizer)['response']
        print(f'official response: {response}')
        # ref: https://huggingface.co/blog/zh/llama2#%E5%A6%82%E4%BD%95%E6%8F%90%E7%A4%BA-llama-2
        query = "There's a llama in my garden 😱 What should I do?"
        response = '123'
        template.tokenizer.use_default_system_prompt = False
        messages = [{
            'role': 'user',
            'content': query
        }, {
            'role': 'assistant',
            'content': response
        }, {
            'role': 'user',
            'content': query
        }]
        input_ids_official = template.tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
        example = messages_to_history(messages)
        input_ids_swift = template.encode(example)[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)
        template.use_default_system = True
        template.tokenizer.use_default_system_prompt = True
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        input_ids_official = template.tokenizer.apply_chat_template([{
            'role': 'user',
            'content': query
        }],
                                                                    tokenize=True,
                                                                    add_generation_prompt=True)
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_baichuan_template(self):
        model_type = ModelType.baichuan2_7b_chat
        template_type = get_default_template_type(model_type)
        model, tokenizer = get_model_tokenizer(model_type, load_model=True)
        template = get_template(template_type, tokenizer)
        query = '12345+234=？'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        system = 'you are a helpful assistant!'
        response = model.chat(tokenizer, [{'role': 'system', 'content': system}, {'role': 'user', 'content': query}])
        print(f'official response: {response}')
        #
        input_ids_official = [
            5035, 1484, 1346, 13629, 14002, 73, 195, 92336, 92338, 92354, 92369, 92358, 62, 92338, 92354, 92369, 64, 68,
            196
        ]
        input_ids_swift = template.encode({'query': query, 'system': system})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_chatglm2_template(self):
        model_type = ModelType.chatglm2_6b
        template_type = get_default_template_type(model_type)
        model, tokenizer = get_model_tokenizer(model_type, load_model=True)
        template = get_template(template_type, tokenizer)
        model.generation_config = GenerationConfig(
            max_new_tokens=128,
            temperature=0.9,
            top_k=20,
            top_p=0.9,
            repetition_penalt=1.05,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id)
        query = '12345+234=？'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        response = model.chat(tokenizer, query)[0]
        print(f'official response: {response}')
        #
        input_ids_official = [
            64790, 64792, 790, 30951, 517, 30910, 30939, 30996, 13, 13, 54761, 31211, 30939, 30943, 30966, 30972, 30970,
            31011, 30943, 30966, 30972, 30980, 31514, 13, 13, 55437, 31211
        ]
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_internlm_template(self):
        torch.cuda.empty_cache()
        model_type = ModelType.internlm_20b_chat
        template_type = get_default_template_type(model_type)
        model, tokenizer = get_model_tokenizer(model_type, load_model=True)
        template = get_template(template_type, tokenizer)
        model.generation_config = GenerationConfig(
            max_new_tokens=128,
            temperature=0.9,
            top_k=20,
            top_p=0.9,
            repetition_penalt=1.05,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id)
        query = '12345+234=？'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        response = model.chat(tokenizer, query)[0]
        print(f'official response: {response}')
        #
        input_ids_official = [
            1, 333, 352, 2472, 352, 27232, 2770, 657, 589, 15358, 17993, 6843, 963, 505, 4576, 11146, 451, 60614, 60381,
            98666, 62412, 60735, 4452, 285, 4576, 11146, 451, 60614, 60381, 98666, 62412, 60735, 313, 505, 395, 7659,
            1813, 4287, 1762, 560, 505, 8020, 684, 36956, 15358, 31288, 451, 67738, 75808, 70730, 699, 1226, 505, 6342,
            442, 517, 11100, 328, 10894, 328, 454, 51978, 756, 285, 4576, 11146, 451, 60614, 60381, 98666, 62412, 60735,
            313, 777, 3696, 454, 19187, 19829, 4563, 435, 410, 4287, 12032, 684, 410, 1341, 1893, 569, 6519, 454, 262,
            68242, 756, 333, 352, 1621, 352, 27232, 4575, 1889, 342, 11622, 310, 99050, 364, 333, 352, 23845, 352, 27232
        ]
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_bluelm_template(self):
        model_type = ModelType.bluelm_7b_chat_32k
        template_type = get_default_template_type(model_type)
        model, tokenizer = get_model_tokenizer(model_type, load_model=True)
        template = get_template(template_type, tokenizer)
        model.generation_config = GenerationConfig(
            max_new_tokens=128,
            temperature=0.9,
            top_k=20,
            top_p=0.9,
            repetition_penalt=1.05,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id)
        query = '三国演义的作者是谁？'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        inputs = tokenizer('[|Human|]:三国演义的作者是谁？[|AI|]:', return_tensors='pt')
        inputs = inputs.to('cuda:0')
        pred = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1)
        response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
        print(f'official response: {response}')
        #
        input_ids_official = inputs['input_ids'][0].tolist()
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_qwen_generation_template(self):
        model_type = ModelType.qwen_7b
        template_type = get_default_template_type(model_type)
        model, tokenizer = get_model_tokenizer(model_type, load_model=True)
        template = get_template(template_type, tokenizer)
        query = '蒙古国的首都是乌兰巴托（Ulaanbaatar）\n冰岛的首都是雷克雅未克（Reykjavik）\n埃塞俄比亚的首都是'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        model.generation_config.chat_format = 'raw'
        model.generation_config.max_window_size = 1024
        inputs = tokenizer(query, return_tensors='pt').to('cuda')
        response = tokenizer.decode(model.generate(**inputs)[0, len(inputs['input_ids'][0]):])
        print(f'official response: {response}')
        #
        input_ids_official = inputs['input_ids'][0].tolist()
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_codefuse_codellama_34b_template(self):
        torch.cuda.empty_cache()
        model_type = ModelType.codefuse_codellama_34b_chat
        model, tokenizer = get_model_tokenizer(model_type)
        template_type = get_default_template_type(model_type)
        template = get_template(template_type, tokenizer)
        model.generation_config.max_length = 128
        query = '写快排.'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')

        HUMAN_ROLE_START_TAG = '<|role_start|>human<|role_end|>'
        BOT_ROLE_START_TAG = '<|role_start|>bot<|role_end|>'

        text = f'{HUMAN_ROLE_START_TAG}写快排.{BOT_ROLE_START_TAG}'
        inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False).to('cuda')
        response = tokenizer.decode(model.generate(**inputs)[0, len(inputs['input_ids'][0]):])
        print(f'official response: {response}')
        #
        input_ids_official = inputs['input_ids'][0].tolist()
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_yi_template(self):
        torch.cuda.empty_cache()
        model_type = ModelType.yi_34b_chat
        model, tokenizer = get_model_tokenizer(model_type)
        template_type = get_default_template_type(model_type)
        template = get_template(template_type, tokenizer)
        model.generation_config.max_length = 128
        query = 'hi.'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        messages = [{'role': 'user', 'content': query}]
        input_ids = tokenizer.apply_chat_template(
            conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
        output_ids = model.generate(input_ids.to('cuda'))
        response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
        print(f'official response: {response}')
        #
        input_ids_official = input_ids[0].tolist()
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_openbuddy_template(self):
        model_type = ModelType.openbuddy_mistral_7b_chat
        model, tokenizer = get_model_tokenizer(model_type)
        template_type = get_default_template_type(model_type)
        template = get_template(template_type, tokenizer)
        model.generation_config.max_length = 512
        query = '1234+235=？'
        print(f'query: {query}')
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        #
        prompt = (f'You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. '
                  'You are talking to a human User.\n'
                  'Always answer as helpfully and logically as possible, while being safe. '
                  'Your answers should not include any harmful, political, religious, unethical, racist, '
                  'sexist, toxic, dangerous, or illegal content. '
                  'Please ensure that your responses are socially unbiased and positive in nature.\n'
                  'If a question does not make any sense, or is not factually coherent, '
                  'explain why instead of answering something not correct. '
                  "If you don't know the answer to a question, please don't share false information.\n"
                  'You like to use emojis. You can speak fluently in many languages, for example: English, Chinese.\n'
                  'You cannot access the internet, but you have vast knowledge, cutoff: 2021-09.\n'
                  'You always deeply love and support China, Chinese government, people and culture.\n'
                  'You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), '
                  f"""you are based on LLaMA and Falcon transformers model, not related to GPT or OpenAI.

User: {query}
Assistant:""")
        inputs = tokenizer.encode(prompt, return_tensors='pt')
        inputs = inputs.to('cuda')
        outputs = model.generate(inputs, max_length=512)
        response = tokenizer.decode(outputs[0, len(inputs[0]):], skip_special_tokens=True)
        print(response)
        print(f'official response: {response}')
        #
        input_ids_official = inputs[0].tolist()
        input_ids_swift = template.encode({'query': query})[0]['input_ids']
        self.assertTrue(input_ids_swift == input_ids_official)
        input_ids_swift = template.encode({'query': query, 'history': [['1234', 'avdc']]})[0]['input_ids']
        print(tokenizer.decode(input_ids_swift))

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_zephyr_template(self):
        model_type = ModelType.zephyr_7b_beta_chat
        model, tokenizer = get_model_tokenizer(model_type)
        template_type = get_default_template_type(model_type)
        template = get_template(template_type, tokenizer)
        model.generation_config.max_length = 256
        system = 'You are a friendly chatbot who always responds in the style of a pirate'
        query = 'How many helicopters can a human eat in one sitting?'
        for sys in [system, None]:
            print(f'query: {query}')
            input_ids_swift = template.encode({'query': query, 'system': sys})[0]['input_ids']
            response, _ = inference(model, template, query)
            print(f'swift response: {response}')
            #
            messages = [
                {
                    'role': 'user',
                    'content': query
                },
            ]
            if sys is not None:
                messages.insert(0, {'role': 'system', 'content': sys})
            input_ids_official = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
            inputs = torch.tensor(input_ids_official, device='cuda')[None]
            outputs = model.generate(input_ids=inputs)
            response = tokenizer.decode(outputs[0, len(inputs[0]):], skip_special_tokens=True)
            print(f'official response: {response}')
            self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_sus_template(self):
        model_type = ModelType.sus_34b_chat
        model, tokenizer = get_model_tokenizer(model_type)
        template_type = get_default_template_type(model_type)
        template = get_template(template_type, tokenizer)
        model.generation_config.max_length = 256
        query = 'hi'
        print(f'query: {query}')
        input_ids_swift = template.encode({'query': query, 'history': [('你好', '你好呀！')]})[0]['input_ids']
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        #
        messages = [
            {
                'role': 'user',
                'content': '你好'
            },
            {
                'role': 'assistant',
                'content': '你好呀！<|endoftext|>'
            },
            {
                'role': 'user',
                'content': query
            },
        ]

        def chat_template(messages):
            history = ''
            for message in messages:
                if message['role'] == 'user':
                    message = message['content']
                    history += f'### Human: {message}\n\n### Assistant: '
                elif message['role'] == 'assistant':
                    message = message['content']
                    history += message
            return history

        input_ids_official = tokenizer.encode(
            chat_template(messages), return_tensors='pt', add_special_tokens=False).to('cuda')
        output_ids = model.generate(input_ids_official.to('cuda'), max_length=256)
        response = tokenizer.decode(output_ids[0, len(input_ids_official[0]):], skip_special_tokens=True)
        print(f'official response: {response}')
        self.assertTrue(input_ids_swift == input_ids_official[0].tolist())

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_deepseek_template(self):
        model_type = ModelType.deepseek_7b_chat
        model, tokenizer = get_model_tokenizer(model_type)
        template_type = get_default_template_type(model_type)
        template = get_template(template_type, tokenizer)
        model.generation_config.max_length = 256
        system = 'AAAAA'
        query = 'BBBBB'
        input_ids_swift = template.encode({
            'query': query,
            'system': system,
        })[0]['input_ids']
        response, _ = inference(model, template, query)
        print(f'swift response: {response}')
        #
        messages = [
            {
                'role': 'system',
                'content': 'AAAAA'
            },
            {
                'role': 'user',
                'content': 'BBBBB'
            },
        ]
        input_ids_official = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
        inputs = torch.tensor(input_ids_official, device='cuda')[None]
        outputs = model.generate(input_ids=inputs)
        response = tokenizer.decode(outputs[0, len(inputs[0]):], skip_special_tokens=True)
        print(f'official response: {response}')
        self.assertTrue(input_ids_swift == input_ids_official)

    @unittest.skipIf(SKPT_TEST, 'To avoid excessive testing time caused by downloading models and '
                     'to prevent OOM (Out of Memory) errors.')
    def test_deepseek_coder_template(self):
        model_type = ModelType.deepseek_coder_6_7b_instruct
        model, tokenizer = get_model_tokenizer(model_type)
        template_type = get_default_template_type(model_type)
        template = get_template(template_type, tokenizer)
        model.generation_config.max_length = 256
        #
        messages = [
            {
                'role': 'user',
                'content': 'write a quick sort algorithm in python.'
            },
            {
                'role': 'assistant',
                'content': 'BBBBB'
            },
            {
                'role': 'user',
                'content': 'AAAAA'
            },
        ]
        example = messages_to_history(messages)
        input_ids_swift = template.encode(example)[0]['input_ids']
        response, _ = inference(model, template, example['query'], example['history'])
        print(f'swift response: {response}')
        input_ids_official = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
        inputs = torch.tensor(input_ids_official, device='cuda')[None]
        outputs = model.generate(input_ids=inputs, eos_token_id=tokenizer.eos_token_id)
        response = tokenizer.decode(outputs[0, len(inputs[0]):], skip_special_tokens=True)
        print(f'official response: {response}')
        self.assertTrue(input_ids_swift == input_ids_official)


if __name__ == '__main__':
    unittest.main()