cli_demo_chatglm3.py 1.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""
使用python的分词构造指令方式实现chatglm3,需要修改hf_model.py中的chatglm3模型转换时model.config.model_type的赋值实现,不推荐外部使用
"""
import argparse
from fastllm_pytools import llm
import time
from transformers import AutoTokenizer, AutoModel

def args_parser():
    """Parse command-line arguments for the chat demo.

    Returns:
        argparse.Namespace with one attribute:
            path (str): path to the model files (required).
    """
    parser = argparse.ArgumentParser(description = 'fastllm_chat_demo')
    # NOTE: the original also passed default='' here, but argparse ignores
    # defaults on required arguments, so the dead default has been removed.
    parser.add_argument('-p', '--path', type = str, required = True, help = '模型文件的路径')
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = args_parser()
    model_path = args.path
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
    # model = llm.model(args.path)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = llm.from_hf(model, tokenizer, dtype = "float16")

    history = []
    print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
    while True:
        query = input("\n用户:")
        if query.strip() == "stop":
            break
        if query.strip() == "clear":
            history = []
            print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
            continue
        print("AI:", end = "")
        current_length = 0
        token_count = 0
        t0 = time.time()
        for response, history in model.stream_chat(tokenizer, query, history=history):
            print(response[current_length:], end="", flush=True)
            token_count += 1
            current_length = len(response)

        t1 = time.time()
        print("\ntoken/s: {:.2f}, character/s: {:.2f}".format(token_count/(t1-t0), current_length/(t1-t0)))