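"""Profile TurboMind generation: first-token latency, per-round token
latency and overall throughput under concurrent generation sessions."""
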
# import multiprocessing as mp
import time
from queue import Queue
from threading import Thread

import fire
import numpy as np
from transformers import AutoTokenizer

from lmdeploy.model import MODELS
from lmdeploy.turbomind import TurboMind


def infer(model, session_id: int, input_ids: list, output_seqlen: int,
          test_round: int, que: Queue):
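    """Run `test_round` timed generations on one session and push per-round
    [first_token_latency, generated_tokens, total_latency] stats to `que`."""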
    chatbot = model.create_instance()
    stats = []
    for i in range(test_round):
        start = time.perf_counter()
        timestamps = [start]
        tokens = [0]
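        # each streamed output carries the cumulative token count;
        # ignore_eos forces generation to run the full output_seqlen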
        for outputs in chatbot.stream_infer(
                session_id,
                input_ids,
                request_output_len=output_seqlen,
                sequence_start=True,
                sequence_end=True,
                ignore_eos=True):
            _, token = outputs[0]
            timestamps.append(time.perf_counter())
            tokens.append(token)

        # TODO: ignore first token
        first_token_latency = timestamps[1] - start
        token_latency = timestamps[-1] - timestamps[0]
        token = tokens[-1] - tokens[0]
        stats.append([first_token_latency, token, token_latency])
    que.put((session_id, stats))


def warmup(model,
           concurrency: int,
           session_len: int,
           output_seqlen: int,
           warmup_round: int = 4):
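    """Run `warmup_round` dummy generations on each concurrent session so
    that one-time initialization cost stays out of the timed runs."""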
    print('start to warmup ...')

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
            for _ in chatbot.stream_infer(
                    session_id,
                    input_ids=[1],
                    request_output_len=output_seqlen,
                    sequence_start=True,
                    sequence_end=True,
                    ignore_eos=True):
                continue

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1), daemon=True)
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except KeyboardInterrupt:
        # threading.Thread has no stop(); the threads are daemons, so exit
        exit(1)
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def main(model_path: str,
         model_name: str,
         tokenizer_path: str,
         concurrency: int = 1,
         session_len: int = 2056,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10):
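    """Benchmark `concurrency` parallel sessions, each generating
    `output_seqlen` tokens from an `input_seqlen`-token prompt, repeated
    `test_round` times."""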
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = MODELS.get(model_name)()
    stop_words = model.stop_words
    tm_model = TurboMind(model_path=model_path, stop_words=stop_words)

    warmup(tm_model, concurrency, session_len, output_seqlen)

    # make up a prompt that can be tokenized into {input_seqlen} tokens
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
    input_ids = tokenizer.encode(prompt)
    que = Queue()
    procs = []
    _start = time.perf_counter()

    # spawn one inference thread per concurrent session
    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, output_seqlen,
                            test_round, que),
                      daemon=True)
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except KeyboardInterrupt:
        # threading.Thread has no stop(); the threads are daemons, so exit
        exit(1)
    _end = time.perf_counter()
    elapsed_time = _end - _start

    stats = []
    while not que.empty():
        session_id, _stats = que.get()
        print(f'\n{"-" * 50}\n'
              f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
        stats.append(_stats)

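    # collapse to an (N, 3) array with one row per test round across all
    # sessions: [first_token_latency, generated_tokens, total_latency]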
    stats = np.array(stats).reshape(-1, 3)

    first_token_latency_min = np.min(stats[:, 0], axis=0)
    first_token_latency_max = np.max(stats[:, 0], axis=0)
    first_token_latency_ave = np.mean(stats[:, 0], axis=0)
    token_latency_min = np.min(stats[:, 2], axis=0)
    token_latency_max = np.max(stats[:, 2], axis=0)
    token_latency_ave = np.mean(stats[:, 2], axis=0)
    throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2], axis=0)
    print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
          f'{input_seqlen}, output_tokens: {output_seqlen}\n'
          f'elapsed_time: {elapsed_time:.2f}s\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput:.2f} token/s\n{"-" * 50}')


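# Example invocation (paths and the model name are illustrative):
#   python profile_generation.py ./workspace llama ./tokenizer_dir \
#       --concurrency 8 --input_seqlen 64 --output_seqlen 512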
if __name__ == '__main__':
    fire.Fire(main)