# cli_demo.py — interactive fastllm chat demo (web-viewer blame residue removed)
import argparse
from fastllm_pytools import llm
import time

def args_parser():
    """Build and parse the command-line arguments for the chat demo.

    Returns:
        argparse.Namespace with a single ``path`` attribute holding the
        model file path (``-p`` / ``--path``, required).
    """
    cli = argparse.ArgumentParser(description = 'fastllm_chat_demo')
    cli.add_argument('-p', '--path', type = str, required = True, default = '', help = '模型文件的路径')
    return cli.parse_args()

if __name__ == "__main__":
    args = args_parser()
    # Load the model from the user-supplied path via the fastllm wrapper.
    model = llm.model(args.path)

    history = []
    print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
    while True:
        query = input("\n用户:")
        if query.strip() == "stop":
            break
        if query.strip() == "clear":
            # Reset the conversation and re-show the usage banner.
            history = []
            print("输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
            continue

        print("AI:", end = "")
        curResponse = ""

        # Tokenize the fully templated prompt so the input-token count
        # reflects what the model actually receives, not just the raw query.
        prompt = model.get_prompt(query, history)
        tokens = model.tokenizer_encode_string(prompt)
        token_input_count = len(tokens)

        token_count = 0
        t0 = time.time()
        # Stream tokens and echo them as they arrive.
        for response in model.stream_response(query, history = history, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.01):
            curResponse += response
            print(response, flush = True, end = "")
            token_count += 1

        t1 = time.time()
        # Guard against a zero-length interval (empty/instant generation or
        # coarse clock resolution) that would raise ZeroDivisionError below.
        elapsed = max(t1 - t0, 1e-8)
        word_len = len(curResponse)
        print("\n\ntoken_input_count", token_input_count)
        print("\ntoken/s: {:.2f}, character/s: {:.2f}".format(token_count / elapsed, word_len / elapsed))

        history.append((query, curResponse))
    model.release_memory()