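"""Profile the generation speed of a deployed TurboMind model.

Usage sketch (`fire` turns `main`'s signature into the CLI; the workspace
path below is a placeholder):

    python profile_generation.py ./workspace --concurrency 8 \
        --input_seqlen 32 --output_seqlen 512 --test_round 10
"""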
# import multiprocessing as mp
import os.path as osp
import time
from queue import Queue
from threading import Thread
from typing import List

import fire
import numpy as np

from lmdeploy.turbomind import Tokenizer, TurboMind


def infer(model, session_id: int, input_ids: List[int], output_seqlen: int,
          test_round: int, que: Queue):
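    """Run `test_round` rounds of generation in one session and push the
    per-round stats to `que`."""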
    chatbot = model.create_instance()
    stats = []
    for i in range(test_round):
        start = time.perf_counter()
        timestamps = []
        tokens = []
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True):
            _, token = outputs[0]  # cumulative #tokens generated so far
            timestamps.append(time.perf_counter())
            tokens.append(token)

        # TODO: ignore first token
        first_token_latency = np.round(timestamps[0] - start, 2)
        if len(timestamps) == 1:
            token_latency = np.round(timestamps[0] - start, 2)
            token = tokens[0]
        else:
            token_latency = np.round(timestamps[-1] - timestamps[0], 2)
            token = tokens[-1] - tokens[0]
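        # each record: [first-token latency(s), #tokens, decode time(s)]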
        stats.append([first_token_latency, token, token_latency])
    que.put((session_id, stats))


def warmup(model,
           concurrency: int,
           input_ids: List[int],
           output_seqlen: int,
           warmup_round: int = 2):
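    """Run `warmup_round` untimed generations per session so that one-off
    initialization cost doesn't leak into the measurement."""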
    print('start to warm up ...')

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
            for _ in chatbot.stream_infer(session_id,
                                          input_ids=input_ids,
                                          request_output_len=output_seqlen,
                                          sequence_start=True,
                                          sequence_end=True,
                                          ignore_eos=True):
                continue

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except (Exception, KeyboardInterrupt):
        # threading.Thread has no stop() method, so just abort the process
        exit(1)
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def main(model_path: str,
         concurrency: int = 1,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10,
         tp: int = 1):
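    """Benchmark generation with `concurrency` parallel sessions and print
    latency/throughput statistics."""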
    tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
    tokenizer = Tokenizer(tokenizer_model_path)
    tm_model = TurboMind(model_path=model_path, tp=tp)

    # make up a prompt that is tokenized into {input_seqlen} tokens
    # (assumes each ' hi' maps to one token; a BOS token may add one more)
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
    input_ids = tokenizer.encode(prompt)

    warmup(tm_model, concurrency, input_ids, output_seqlen)

    que = Queue()
    procs = []
    _start = time.perf_counter()

    # TODO: update to the multithread version
    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, output_seqlen,
                            test_round, que))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except (Exception, KeyboardInterrupt):
        # threading.Thread has no stop() method, so just abort the process
        exit(1)
    _end = time.perf_counter()
    elapsed_time = _end - _start

    stats = []
    while not que.empty():
        session_id, _stats = que.get()
        print(f'\n{"-" * 50}\n'
              f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
        stats.append(_stats)

    stats = np.array(stats).reshape(-1, 3)
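    # one row per (session, round): [first_token_latency, #tokens, latency]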

    first_token_latency_min = np.min(stats[:, 0])
    first_token_latency_max = np.max(stats[:, 0])
    first_token_latency_ave = np.mean(stats[:, 0])
    token_latency_min = np.min(stats[:, 2])
    token_latency_max = np.max(stats[:, 2])
    token_latency_ave = np.mean(stats[:, 2])
    # overall throughput: mean per-session decode rate scaled by the number
    # of concurrent sessions
    throughput = np.sum(stats[:, 1]) / np.sum(stats[:, 2]) * concurrency
    print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
          f'{input_seqlen}, output_tokens: {output_seqlen}\n'
          f'elapsed_time: {elapsed_time:.2f}s\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput:.2f} token/s\n{"-" * 50}')


if __name__ == '__main__':
    fire.Fire(main)