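"""Benchmark the generation speed of a TurboMind model.

Each of `concurrency` worker threads streams `output_seqlen` tokens for
`test_round` rounds and records first-token latency, generated-token count
and decoding latency; the script then reports aggregate latency and
throughput. Example invocation (the model path below is a placeholder for a
converted TurboMind workspace):

    python profile_generation.py /path/to/turbomind_model --concurrency 8
"""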
# import multiprocessing as mp
import os.path as osp
import time
from queue import Queue
from threading import Thread
from typing import List

import fire
import numpy as np

from lmdeploy.turbomind import Tokenizer, TurboMind


def infer(model, session_id: int, input_ids: List[int], output_seqlen: int,
          test_round: int, que: Queue):
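    # each worker thread drives its own generator instance of the shared model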
    chatbot = model.create_instance()
    stats = []
    for i in range(test_round):
        start = time.perf_counter()
        timestamps = []
        tokens = []
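        # stream_infer yields incrementally: each output carries the response
        # decoded so far and the cumulative number of generated tokens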
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True):
            res, token = outputs[0]
            timestamps.append(time.perf_counter())
            tokens.append(token)

        # TODO: ignore first token
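        # first_token_latency: request start -> first streamed chunk;
        # token_latency: first chunk -> last chunk; token: tokens generated
        # after the first chunk (the counts reported above are cumulative)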
        first_token_latency = timestamps[0] - start
        if len(timestamps) == 1:
            token_latency = timestamps[0] - start
            token = tokens[0]
        else:
            token_latency = timestamps[-1] - timestamps[0]
            token = tokens[-1] - tokens[0]
        stats.append([first_token_latency, token, token_latency])
    que.put((session_id, stats))


def warmup(model, concurrency: int, output_seqlen: int, warmup_round: int = 4):
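    # run a few dummy generations per session so that one-time initialization
    # cost does not leak into the measured rounds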
    print('start to warmup ...')

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
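            # generate output_seqlen tokens from a single-token prompt and
            # discard the results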
            for _ in chatbot.stream_infer(session_id,
                                          input_ids=[1],
                                          request_output_len=output_seqlen,
                                          sequence_start=True,
                                          sequence_end=True,
                                          ignore_eos=True):
                continue

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1))
        procs.append(proc)
        proc.start()

    # Thread objects cannot be force-stopped; just wait for every worker
    for proc in procs:
        proc.join()
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def main(model_path: str,
         concurrency: int = 1,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10,
         tp: int = 1):
    tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
    tokenizer = Tokenizer(tokenizer_model_path)
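    # tp sets the tensor-parallel degree, i.e. the number of GPUs to use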
    tm_model = TurboMind(model_path=model_path, tp=tp)

    warmup(tm_model, concurrency, output_seqlen)

    # make up a prompt that can be tokenized into {input_seqlen} tokens
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
    input_ids = tokenizer.encode(prompt)
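    # worker threads report their per-session stats through a thread-safe queue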
    que = Queue()
    procs = []
    _start = time.perf_counter()

    # TODO: update to the multithread version
    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, output_seqlen,
                            test_round, que))
        procs.append(proc)
        proc.start()

    # wait for all benchmark threads to finish (Thread has no stop())
    for proc in procs:
        proc.join()
    _end = time.perf_counter()
    elapsed_time = _end - _start

    stats = []
    while not que.empty():
        session_id, _stats = que.get()
        print(f'\n{"-" * 50}\n'
              f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
        stats.append(_stats)

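    # flatten to a (concurrency * test_round, 3) array whose columns are
    # [first_token_latency, generated_tokens, decoding_latency]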
    stats = np.array(stats).reshape(-1, 3)

    first_token_latency_min = np.min(stats[:, 0], axis=0)
    first_token_latency_max = np.max(stats[:, 0], axis=0)
    first_token_latency_ave = np.mean(stats[:, 0], axis=0)
    token_latency_min = np.min(stats[:, 2], axis=0)
    token_latency_max = np.max(stats[:, 2], axis=0)
    token_latency_ave = np.mean(stats[:, 2], axis=0)
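    # average per-session decoding rate (total generated tokens over total
    # decoding time), scaled by the number of concurrent sessions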
    throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2],
                                                      axis=0) * concurrency
    print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
          f'{input_seqlen}, output_tokens: {output_seqlen}\n'
          f'elapsed_time: {elapsed_time:.2f}s\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput:.2f} token/s\n{"-" * 50}')


if __name__ == '__main__':
    fire.Fire(main)