# profile_generation.py
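"""Benchmark the token generation speed of a TurboMind model.

Spawns `concurrency` worker threads; each runs `test_round` generation
requests and records first-token latency, decode latency and the number
of generated tokens.
"""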
# import multiprocessing as mp
import os.path as osp
import time
from queue import Queue
from threading import Thread

import fire
import numpy as np

from lmdeploy.model import MODELS
from lmdeploy.turbomind import Tokenizer, TurboMind


def infer(model, session_id: int, input_ids: list, output_seqlen: int,
          test_round: int, que: Queue):
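    """Run `test_round` requests in one session and push per-round stats
    [first_token_latency, generated_tokens, decode_latency] to `que`."""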
    chatbot = model.create_instance()
    stats = []
    for i in range(test_round):
        start = time.perf_counter()
        timestamps = []
        tokens = []
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True):
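            # each element of `outputs` pairs the generated ids with the
            # cumulative token count of this request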
            res, token = outputs[0]
            timestamps.append(time.perf_counter())
            tokens.append(token)

        # TODO: ignore first token
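        # first-token latency runs from request start to the first
        # streamed output; decode latency covers the remaining tokens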
        first_token_latency = timestamps[0] - start
        if len(timestamps) == 1:
            token_latency = timestamps[0] - start
            token = tokens[0]
        else:
            token_latency = timestamps[-1] - timestamps[0]
            token = tokens[-1] - tokens[0]
        stats.append([first_token_latency, token, token_latency])
    que.put((session_id, stats))


def warmup(model, concurrency: int, output_seqlen: int, warmup_round: int = 4):
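    """Run a few throwaway requests per session so that one-time setup
    costs (e.g. allocation) don't distort the timed rounds."""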
    print('start to warmup ...')

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
            for _ in chatbot.stream_infer(session_id,
                                          input_ids=[1],
                                          request_output_len=output_seqlen,
                                          sequence_start=True,
                                          sequence_end=True,
                                          ignore_eos=True):
                continue

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except KeyboardInterrupt:
        # threading.Thread has no stop(); exit and let the interpreter
        # tear the worker threads down
        exit(1)
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def main(model_path: str,
         model_name: str,
         concurrency: int = 1,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10):
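    """Profile the generation speed of a TurboMind model.

    Args:
        model_path: path to the converted TurboMind model workspace.
        model_name: model type registered in `lmdeploy.model.MODELS`.
        concurrency: number of concurrent sessions.
        input_seqlen: number of input tokens per request.
        output_seqlen: number of tokens to generate per request.
        test_round: number of timed rounds per session.
    """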
    tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
    tokenizer = Tokenizer(tokenizer_model_path)
    model = MODELS.get(model_name)()
    stop_words = model.stop_words
    tm_model = TurboMind(model_path=model_path, stop_words=stop_words)

    warmup(tm_model, concurrency, output_seqlen)

    # make up a prompt that can be tokenized into {input_seqlen} tokens
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
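    # note: this assumes ' hi' encodes to exactly one token, which may
    # not hold for every tokenizer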
    input_ids = tokenizer.encode(prompt)
    que = Queue()
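    # a thread-safe queue collects the per-session stats from the workers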
    procs = []
    _start = time.perf_counter()

    # TODO: update to the multiprocessing version
    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, output_seqlen,
                            test_round, que))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except KeyboardInterrupt:
        # threading.Thread has no stop(); exit and let the interpreter
        # tear the worker threads down
        exit(1)
    _end = time.perf_counter()
    elapsed_time = _end - _start

    stats = []
    while not que.empty():
        session_id, _stats = que.get()
        print(f'\n{"-" * 50}\n'
              f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
        stats.append(_stats)

    stats = np.array(stats).reshape(-1, 3)
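    # stats columns: 0 = first-token latency (s), 1 = generated tokens,
    # 2 = decode latency (s)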

    first_token_latency_min = np.min(stats[:, 0], axis=0)
    first_token_latency_max = np.max(stats[:, 0], axis=0)
    first_token_latency_ave = np.mean(stats[:, 0], axis=0)
    token_latency_min = np.min(stats[:, 2], axis=0)
    token_latency_max = np.max(stats[:, 2], axis=0)
    token_latency_ave = np.mean(stats[:, 2], axis=0)
    throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2], axis=0)
    print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
          f'{input_seqlen}, output_tokens: {output_seqlen}\n'
          f'elapsed_time: {elapsed_time:.2f}s\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput:.2f} token/s\n{"-" * 50}')


if __name__ == '__main__':
    fire.Fire(main)
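
# Example invocation (the workspace path and model name below are
# placeholders):
#   python profile_generation.py ./workspace internlm \
#       --concurrency 8 --input_seqlen 64 --output_seqlen 512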