# import multiprocessing as mp
import time

from queue import Queue
from threading import Thread

import fire
import numpy as np
from transformers import AutoTokenizer

from lmdeploy.model import MODELS
from lmdeploy.turbomind import TurboMind


def infer(model, session_id: int, input_ids: list, output_seqlen: int,
          test_round: int, que: Queue):
    """Profile one session: run ``test_round`` generations and collect stats.

    Args:
        model: inference engine exposing ``create_instance()``.
        session_id: unique id of this concurrent session.
        input_ids: prompt token ids fed to every round (the original
            ``str`` annotation was wrong — a token-id list is passed).
        output_seqlen: number of tokens to generate per round.
        test_round: how many generation rounds to run.
        que: result queue; receives ``(session_id, stats)`` where each
            stats row is ``[first_token_latency, n_tokens, total_latency]``.
    """
    chatbot = model.create_instance()
    stats = []
    for _ in range(test_round):
        start = time.perf_counter()
        timestamps = [start]
        tokens = [0]
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True):
            # each item of `outputs` is a (response, n_generated_tokens) pair;
            # the response text is not needed for profiling
            _, token = outputs[0]
            timestamps.append(time.perf_counter())
            tokens.append(token)

        # TODO: ignore first token
        # latency until the first streamed chunk arrived
        first_token_latency = timestamps[1] - start
        # total generation wall time and total generated tokens of this round
        token_latency = timestamps[-1] - timestamps[0]
        token = tokens[-1] - tokens[0]
        stats.append([first_token_latency, token, token_latency])
    que.put((session_id, stats))


def warmup(model,
           concurrency: int,
           session_len: int,
           output_seqlen: int,
           warmup_round: int = 4):
    """Warm the engine up with ``concurrency`` dummy generation threads.

    Args:
        model: inference engine exposing ``create_instance()``.
        concurrency: number of concurrent warmup sessions.
        session_len: kept for interface compatibility; currently unused here.
        output_seqlen: number of tokens to generate per warmup request.
        warmup_round: generation rounds per session.
    """
    print('start to warmup ...')

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
            # single dummy token prompt; output is discarded
            for _ in chatbot.stream_infer(session_id,
                                          input_ids=[1],
                                          request_output_len=output_seqlen,
                                          sequence_start=True,
                                          sequence_end=True,
                                          ignore_eos=True):
                continue

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        # daemon threads so exit(1) below can actually terminate the process
        proc = Thread(target=_infer, args=(model, i + 1), daemon=True)
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except Exception:
        # threading.Thread has no stop() method — the original
        # `proc.stop()` cleanup would itself raise AttributeError.
        # Daemon threads die with the process.
        exit(1)
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def main(model_path: str,
         model_name: str,
         tokenlizer: str,
         concurrency: int = 1,
         session_len: int = 2056,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10):
    """Benchmark generation latency and throughput of a TurboMind model.

    Args:
        model_path: path of the turbomind model workspace.
        model_name: model name registered in ``lmdeploy.model.MODELS``.
        tokenlizer: huggingface tokenizer path (parameter name kept,
            typo and all, for CLI backward compatibility).
        concurrency: number of concurrent test sessions.
        session_len: max session length; forwarded to ``warmup``.
        input_seqlen: approximate prompt length in tokens.
        output_seqlen: number of tokens generated per round.
        test_round: generation rounds per session.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenlizer)
    model = MODELS.get(model_name)()
    stop_words = model.stop_words
    tm_model = TurboMind(model_path=model_path, stop_words=stop_words)

    warmup(tm_model, concurrency, session_len, output_seqlen)

    # make up a prompt that can be tokenized into {input_seqlen} tokens
    # NOTE(review): tokenizers may add special tokens or merge ' hi'
    # differently, so the actual token count can deviate — confirm.
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
    input_ids = tokenizer.encode(prompt)
    que = Queue()
    procs = []
    _start = time.perf_counter()

    # TODO: update to the multithread version
    for i in range(concurrency):
        # daemon threads so exit(1) below can actually terminate the process
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, output_seqlen,
                            test_round, que),
                      daemon=True)
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except Exception:
        # threading.Thread has no stop() method — the original
        # `proc.stop()` cleanup would itself raise AttributeError.
        # Daemon threads die with the process.
        exit(1)
    _end = time.perf_counter()
    elapsed_time = _end - _start

    stats = []
    while not que.empty():
        session_id, _stats = que.get()
        print(f'\n{"-" * 50}\n'
              f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
        stats.append(_stats)

    # each row: [first_token_latency, generated_tokens, total_latency]
    stats = np.array(stats).reshape(-1, 3)

    first_token_latency_min = np.min(stats[:, 0], axis=0)
    first_token_latency_max = np.max(stats[:, 0], axis=0)
    first_token_latency_ave = np.mean(stats[:, 0], axis=0)
    token_latency_min = np.min(stats[:, 2], axis=0)
    token_latency_max = np.max(stats[:, 2], axis=0)
    token_latency_ave = np.mean(stats[:, 2], axis=0)
    # overall throughput: total generated tokens / total generation time
    throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2], axis=0)
    print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
          f'{input_seqlen}, output_tokens: {output_seqlen}\n'
          f'elapsed_time: {elapsed_time:.2f}s\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput} token/s\n{"-" * 50}')


if __name__ == '__main__':
    # CLI entry point: python-fire turns `main`'s signature into flags,
    # e.g. `python profile_generation.py MODEL_PATH NAME TOKENIZER --concurrency 4`
    fire.Fire(main)