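"""Benchmark token-generation latency and throughput of a TurboMind model.

Each concurrent session repeatedly streams `output_seqlen` tokens and records
first-token latency, token latency and the number of generated tokens. The
script then reports min/max/average latency and overall throughput.

Example (model path and name are placeholders):

    python profile_generation.py ./workspace llama --concurrency 8 \
        --output_seqlen 512 --test_round 10
"""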
# import multiprocessing as mp
import os.path as osp
import time
from queue import Queue
from threading import Thread
from typing import List

import fire
import numpy as np
from transformers import AutoTokenizer

from lmdeploy.model import MODELS
from lmdeploy.turbomind import TurboMind


def infer(model, session_id: int, input_ids: List[int], output_seqlen: int,
          test_round: int, que: Queue):
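    """Stream `test_round` generation requests on one session.

    Each round records ``[first_token_latency, generated_tokens,
    token_latency]``; the per-session stats are pushed to ``que``.
    """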
    chatbot = model.create_instance()
    stats = []
    for i in range(test_round):
        start = time.perf_counter()
        timestamps = []
        tokens = []
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True):
            res, token = outputs[0]
            timestamps.append(time.perf_counter())
            tokens.append(token)

        # TODO: ignore first token
        first_token_latency = timestamps[0] - start
        if len(timestamps) == 1:
            token_latency = timestamps[0] - start
            token = tokens[0]
        else:
            token_latency = timestamps[-1] - timestamps[0]
            token = tokens[-1] - tokens[0]
        stats.append([first_token_latency, token, token_latency])
    que.put((session_id, stats))


def warmup(model,
           concurrency: int,
           session_len: int,
           output_seqlen: int,
           warmup_round: int = 4):
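    """Issue a few dummy requests per concurrent session so that one-time
    initialization does not skew the timed runs."""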
    print('start to warmup ...')

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
            for _ in chatbot.stream_infer(session_id,
                                          input_ids=[1],
                                          request_output_len=output_seqlen,
                                          sequence_start=True,
                                          sequence_end=True,
                                          ignore_eos=True):
                continue

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except Exception:
        # threading.Thread cannot be stopped from outside; just abort
        exit(1)
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def main(model_path: str,
         model_name: str,
         concurrency: int = 1,
         session_len: int = 2056,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10):
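    """Profile generation with `concurrency` sessions running in parallel.

    `model_path` is a TurboMind workspace whose tokenizer lives under
    ``triton_models/tokenizer``; `model_name` must be a model registered
    in ``lmdeploy.model.MODELS``.
    """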
    tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_path,
                                              trust_remote_code=True)
    model = MODELS.get(model_name)()
    stop_words = model.stop_words
    tm_model = TurboMind(model_path=model_path, stop_words=stop_words)

    warmup(tm_model, concurrency, session_len, output_seqlen)

    # make up a prompt that can be tokenized into {input_seqlen} tokens
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
    input_ids = tokenizer.encode(prompt)
    que = Queue()
    procs = []
    _start = time.perf_counter()

    # TODO: update to the multithread version
    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, output_seqlen,
                            test_round, que))
        procs.append(proc)
        proc.start()

    try:
        for proc in procs:
            proc.join()
    except Exception:
        # threading.Thread cannot be stopped from outside; just abort
        exit(1)
    _end = time.perf_counter()
    elapsed_time = _end - _start

    stats = []
    while not que.empty():
        session_id, _stats = que.get()
        print(f'\n{"-" * 50}\n'
              f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
        stats.append(_stats)

    stats = np.array(stats).reshape(-1, 3)
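    # columns: first_token_latency(s), generated tokens, token_latency(s)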

    first_token_latency_min = np.min(stats[:, 0], axis=0)
    first_token_latency_max = np.max(stats[:, 0], axis=0)
    first_token_latency_ave = np.mean(stats[:, 0], axis=0)
    token_latency_min = np.min(stats[:, 2], axis=0)
    token_latency_max = np.max(stats[:, 2], axis=0)
    token_latency_ave = np.mean(stats[:, 2], axis=0)
    throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2], axis=0)
    print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
          f'{input_seqlen}, output_tokens: {output_seqlen}\n'
          f'elapsed_time: {elapsed_time:.2f}s\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min:.2f}s, {first_token_latency_max:.2f}s, '
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput} token/s\n{"-" * 50}')


if __name__ == '__main__':
    fire.Fire(main)