# profile_generation.py
import os.path as osp
import time
from queue import Queue
from threading import Thread
from typing import List

import fire
import numpy as np

from lmdeploy.turbomind import Tokenizer, TurboMind


def infer(model, session_id: int, input_ids: List[int], output_seqlen: int,
          test_round: int, que: Queue):
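    """Profile the generation speed of one session.

    Runs `test_round` rounds of streaming inference and puts
    `(session_id, stats)` into `que`, where each per-round entry is
    [first_token_latency, generated_tokens, token_latency].
    """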
    chatbot = model.create_instance()
    stats = []
    for i in range(test_round):
        start = time.perf_counter()
        timestamps = []
        tokens = []
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True):
            res, token = outputs[0]
            timestamps.append(time.perf_counter())
            tokens.append(token)

        # TODO: ignore first token
        first_token_latency = timestamps[0] - start
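        # with a single output chunk, fall back to the full round latency;
        # otherwise measure the decode window after the first chunk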
        if len(timestamps) == 1:
            token_latency = timestamps[0] - start
            token = tokens[0]
        else:
            token_latency = timestamps[-1] - timestamps[0]
            token = tokens[-1] - tokens[0]
        stats.append([first_token_latency, token, token_latency])
    que.put((session_id, stats))


def warmup(model, concurrency: int, output_seqlen: int, warmup_round: int = 4):
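    """Warm up the engine with `warmup_round` dummy generations on each of
    `concurrency` concurrent sessions."""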
    print('start to warmup ...')

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
            for _ in chatbot.stream_infer(session_id,
                                          input_ids=[1],
                                          request_output_len=output_seqlen,
                                          sequence_start=True,
                                          sequence_end=True,
                                          ignore_eos=True):
                continue

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except Exception:
        # threading.Thread has no stop(); abort the whole process instead
        exit(1)
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def main(model_path: str,
         concurrency: int = 1,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10):
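    """Benchmark the token generation speed of a TurboMind model.

    Args:
        model_path: path to the converted model workspace.
        concurrency: number of concurrent sessions.
        input_seqlen: prompt length in tokens.
        output_seqlen: number of tokens to generate per round.
        test_round: number of rounds per session.
    """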
    tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
    tokenizer = Tokenizer(tokenizer_model_path)
    tm_model = TurboMind(model_path=model_path)

    warmup(tm_model, concurrency, output_seqlen)

    # make up a prompt that should tokenize into {input_seqlen} tokens,
    # assuming 'hi' maps to a single token
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
    input_ids = tokenizer.encode(prompt)
    que = Queue()
    procs = []
    _start = time.perf_counter()

    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, output_seqlen,
                            test_round, que))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except Exception:
        # threading.Thread has no stop(); abort the whole process instead
        exit(1)
    _end = time.perf_counter()
    elapsed_time = _end - _start

    stats = []
    while not que.empty():
        session_id, _stats = que.get()
        print(f'\n{"-" * 50}\n'
              f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
        stats.append(_stats)

    stats = np.array(stats).reshape(-1, 3)
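    # one row per (session, round):
    # [first_token_latency, generated_tokens, token_latency]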

    first_token_latency_min = np.min(stats[:, 0])
    first_token_latency_max = np.max(stats[:, 0])
    first_token_latency_ave = np.mean(stats[:, 0])
    token_latency_min = np.min(stats[:, 2])
    token_latency_max = np.max(stats[:, 2])
    token_latency_ave = np.mean(stats[:, 2])
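    # aggregate throughput: total generated tokens over total decode time,
    # scaled by the number of concurrent sessions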
    throughput = np.sum(stats[:, 1]) / np.sum(stats[:, 2]) * concurrency
    print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
          f'{input_seqlen}, output_tokens: {output_seqlen}\n'
          f'elapsed_time: {elapsed_time:.2f}s\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput:.2f} token/s\n{"-" * 50}')


if __name__ == '__main__':
    fire.Fire(main)
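
# Example invocation (`./workspace` is a hypothetical converted-model path):
#   python profile_generation.py ./workspace --concurrency 8 \
#       --input_seqlen 32 --output_seqlen 512 --test_round 10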