# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import csv
import os
import time
from dataclasses import dataclass
from queue import Queue
from threading import Thread
from typing import List, Union

import numpy as np
from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
                    nvmlDeviceGetPowerState, nvmlDeviceGetTemperature,
                    nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
from tqdm import tqdm

from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter
from lmdeploy.messages import (EngineGenerationConfig, PytorchEngineConfig,
                               TurbomindEngineConfig)


def infer(model, session_id: int, input_ids: List,
          gen_config: EngineGenerationConfig, test_round: int, que: Queue):
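    """Generate tokens on one engine session and record per-token latency.

    Each of the `test_round` rounds generates `gen_config.max_new_tokens`
    tokens; the per-round latency lists are put into `que` as
    `(session_id, stats)` when all rounds finish.
    """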
    if session_id == 1:
        pbar = tqdm(total=test_round)
    chatbot = model.create_instance()
    output_seqlen = gen_config.max_new_tokens
    stats = []
    for _ in range(test_round):
        token_latency_stats = [0] * (output_seqlen + 1)
        prev = time.perf_counter()
        n_prev_token = 0
        """
        The iterator provided by `stream_infer` denotes the number of generated tokens so far,
        which is represented by the variable `n_token`.
        Please note that `n_token` is not a continuous value. In other words, during the iteration,
        its value might be 5, 7, 8, 16, and so on, rather than 1, 2, 3, 4, etc.
        So, it is quite difficult to get the latency of each generated token.
40
        As a work-around, we set the latency `now-prev` of each iteration to the first token of
41
42
43
44
45
        the new generated tokens, and leave the latency of the rest tokens being 0.
        For example, in the first iteration, 5 tokens are generated.
        The time elapsing in this iteration `now-prev` is set to the latency of first token of
        the 5 tokens, i.e. `token_latency_stats[0]`, and `token_latency_stats[1:4]` is set 0`
        """   # noqa: E501
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            gen_config=gen_config,
                                            sequence_start=True,
                                            sequence_end=True,
                                            stream_output=True):
            _, res, n_token = outputs
            now = time.perf_counter()
            if n_prev_token != n_token:
                token_latency_stats[n_prev_token] = np.round(now - prev, 3)
                n_prev_token = n_token
            prev = now
        # for pytorch engine to restart a session
        if hasattr(chatbot, 'end'):
            chatbot.end(session_id)
        if session_id == 1:
            pbar.update(1)

        assert output_seqlen <= n_token <= output_seqlen + 1, \
            f'Error. session_id({session_id}) requested {output_seqlen} ' \
            f'tokens, but generated {n_token} tokens'
        stats.append(token_latency_stats[:output_seqlen])
    que.put((session_id, stats))


def warmup(model, concurrency: int, input_ids: List[int], warmup_round: int,
           gen_config: EngineGenerationConfig):
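    """Run `warmup_round` dummy generation rounds on `concurrency` concurrent
    sessions so the engine is in a steady state before profiling starts."""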
    if not warmup_round:
        return

    print('start to warm up ...')
    output_seqlen = gen_config.max_new_tokens

    def _infer(model, session_id):
        chatbot = model.create_instance()
        for _ in range(warmup_round):
            for _ in chatbot.stream_infer(session_id,
                                          input_ids=input_ids,
                                          request_output_len=output_seqlen,
                                          sequence_start=True,
                                          sequence_end=True,
                                          ignore_eos=True,
                                          gen_config=gen_config):
                continue
            # for pytorch engine to restart a session
            if hasattr(chatbot, 'end'):
                chatbot.end(session_id)

    _start = time.perf_counter()
    procs = []
    for i in range(concurrency):
        proc = Thread(target=_infer, args=(model, i + 1), daemon=True)
        procs.append(proc)
        proc.start()

    for proc in procs:
        proc.join()

    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
                       engine_config: Union[PytorchEngineConfig,
                                            TurbomindEngineConfig],
                       gen_config: EngineGenerationConfig, test_round: int,
                       warmup_round: int):
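    """Profile token generation with the requested engine.

    Builds the engine from `model_path`, warms it up, then runs `test_round`
    generation rounds on `concurrency` concurrent sessions and reports
    first-token latency, per-token latency percentiles and throughput.
    """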
    output_seqlen = gen_config.max_new_tokens
    print(f'profiling ... concurrency: {concurrency}, '
          f'n_prompt_token: {input_seqlen}, '
          f'n_completion_token: {output_seqlen}, '
          f'test_round: {test_round}, warmup_round: {warmup_round}')
    if isinstance(engine_config, TurbomindEngineConfig):
        from lmdeploy.turbomind import TurboMind
        tm_model = TurboMind.from_pretrained(model_path,
                                             engine_config=engine_config)
    elif isinstance(engine_config, PytorchEngineConfig):
        from lmdeploy.pytorch.engine import Engine
        tm_model = Engine(model_path, engine_config)

    # make up a dummy `input_ids` whose length is exactly `input_seqlen`
    assert input_seqlen > 0, 'input_seqlen should be greater than 0'
    input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
    warmup(tm_model, concurrency, input_ids, warmup_round, gen_config)

    que = Queue()
    procs = []
    _start = time.perf_counter()

    for i in range(concurrency):
        proc = Thread(target=infer,
                      args=(tm_model, i + 1, input_ids, gen_config, test_round,
                            que))
        procs.append(proc)
        proc.start()

    for proc in procs:
        proc.join()

    _end = time.perf_counter()
    elapsed_time = _end - _start

    token_latency_stats = []
    while not que.empty():
        _, _stats = que.get()
        token_latency_stats += _stats

    # The shape is [concurrency*test_round, output_seqlen]
    token_latency_stats = np.stack(token_latency_stats, axis=0)

    first_token_latency_min = np.round(
        np.min(token_latency_stats[:, 0], axis=0), 3)
    first_token_latency_max = np.round(
        np.max(token_latency_stats[:, 0], axis=0), 3)
    first_token_latency_ave = np.round(
        np.mean(token_latency_stats[:, 0], axis=0), 3)
    token_latency_max = np.round(np.max(np.sum(token_latency_stats, axis=1)),
                                 3)
    token_latency_min = np.round(np.min(np.sum(token_latency_stats, axis=1)),
                                 3)
    token_latency_ave = np.round(np.mean(np.sum(token_latency_stats, axis=1)),
                                 3)
    # sort token_latency without the first token's latency
    sorted_token_latency = np.sort(token_latency_stats[:, 1:].flatten())
    percentiles = [
        np.round(
            sorted_token_latency[int(percent * len(sorted_token_latency))], 3)
        for percent in [0.5, 0.75, 0.95, 0.99]
    ]

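    # throughput counts every generated token, i.e.
    # concurrency * test_round * output_seqlen tokens over the wall-clock time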
    throughput = np.round(token_latency_stats.size / elapsed_time, 2)
    print(f'\n{"-" * 50}\ntotal time: {elapsed_time:.2f}s\n'
          f'concurrency: {concurrency}, test_round: {test_round}\n'
          f'input_tokens: {input_seqlen}, output_tokens: {output_seqlen}\n'
          f'first_token latency(min, max, ave): '
          f'{first_token_latency_min}s, {first_token_latency_max}s, '
          f'{first_token_latency_ave}s\ntotal_token latency(min, max, ave): '
          f'{token_latency_min}s, {token_latency_max}s, '
          f'{token_latency_ave}s\n'
          f'token_latency percentiles(50%,75%,95%,99%)(s): {percentiles}\n'
          f'throughput: {throughput} token/s\n{"-" * 50}')
    return tm_model.model_name, \
        [first_token_latency_min, first_token_latency_max,
         first_token_latency_ave], \
        percentiles, throughput, tm_model.gpu_count


class MemoryMonitor:
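    """Track peak GPU memory usage from a background process.

    A daemon process polls all visible GPUs via NVML and records the peak
    increase of used memory over the baseline captured at start, in GB.
    """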
    from multiprocessing import Manager
    max_mem = Manager().Value('f', 0)  # GB
    device_count = Manager().Value('f', 0)

    @staticmethod
    def nvidia_info():
        # pip install nvidia-ml-py
        nvidia_dict = {
            'state': True,
            'nvidia_version': '',
            'nvidia_count': 0,
            'gpus': []
        }
        try:
            nvmlInit()
            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
            for i in range(nvidia_dict['nvidia_count']):
                handle = nvmlDeviceGetHandleByIndex(i)
                memory_info = nvmlDeviceGetMemoryInfo(handle)
                gpu = {
                    'gpu_name': nvmlDeviceGetName(handle),
                    'total': memory_info.total,
                    'free': memory_info.free,
                    'used': memory_info.used,
                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
                    'powerStatus': nvmlDeviceGetPowerState(handle)
                }
                nvidia_dict['gpus'].append(gpu)
        except NVMLError as _:  # noqa
            nvidia_dict['state'] = False
        except Exception as _:  # noqa
            nvidia_dict['state'] = False
        finally:
            try:
                nvmlShutdown()
            except:  # noqa
                pass
        return nvidia_dict

    @classmethod
    def mem_monitor(cls):
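        """Poll GPU memory usage in a loop and keep the peak increase over
        the baseline measured at startup."""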
        info = cls.nvidia_info()
        max_mem = 0
        mem_start = 0
        cls.device_count.value = len(info['gpus'])
        for used_total in info['gpus']:
            mem_start += used_total['used']
        while True:
            info = cls.nvidia_info()
            used = 0
            for used_total in info['gpus']:
                used += used_total['used']
            if used > max_mem:
                max_mem = used
                cls.max_mem.value = (max_mem - mem_start) / (1 << 30)

    @classmethod
    def start(cls):
        cls._running = True
        from multiprocessing import Process
        cls.proc = Process(target=cls.mem_monitor, daemon=True)
        cls.proc.start()

    @classmethod
    def terminate(cls) -> float:
        """Terminate the subprocess and return maximum memory."""
        cls.proc.kill()
        return cls.max_mem.value


@dataclass
class ProfileResult:
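    """Profiling metrics of one (batch, prompt_tokens, completion_tokens)
    configuration."""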
    model_name: str
    batch: int
    prompt_tokens: int
    completion_tokens: int
    first_token_latency: List
    percentiles: List
    throughput_per_proc: float
    throughput_per_node: float
    mem_per_proc: float
    mem_per_gpu: float
    mem_per_node: float


def parse_args():
    parser = argparse.ArgumentParser(
        description='Profile the token generation performance with'
        ' pytorch or turbomind engine',
        formatter_class=DefaultsAndTypesHelpFormatter)
    parser.add_argument('model_path',
                        type=str,
                        help='the path of the model in localhost or '
                        'the repo_id of the model in huggingface.co')
    parser.add_argument('-c',
                        '--concurrency',
                        nargs='+',
                        type=int,
                        help='how many requests launched concurrently',
                        default=[1, 16, 32, 64])
    parser.add_argument(
        '-pt',
        '--prompt-tokens',
        nargs='+',
        type=int,
        help='how many tokens in the prompt. One-to-one '
        'correspondence with completion-tokens',
        default=[1, 128, 128, 2048, 2048])
    parser.add_argument('-ct',
                        '--completion-tokens',
                        nargs='+',
                        type=int,
                        help='how many tokens to be generated. One-to-one '
                        'correspondence with prompt-tokens',
                        default=[128, 128, 2048, 128, 2048])
    parser.add_argument('--csv',
                        type=str,
                        help='Where to save the result.',
                        default='profile_generation.csv')
    parser.add_argument('-tr',
                        '--test-round',
                        type=int,
                        help='number of test rounds',
                        default=3)
    parser.add_argument('-w',
                        '--warmup-round',
                        type=int,
                        help='number of warmup rounds',
                        default=1)
    # other args
    ArgumentHelper.top_p(parser)
    ArgumentHelper.temperature(parser)
    ArgumentHelper.top_k(parser)
    ArgumentHelper.log_level(parser)
    ArgumentHelper.backend(parser)
    # pytorch engine args
    pt_group = parser.add_argument_group('PyTorch engine arguments')
    tp_act = ArgumentHelper.tp(pt_group)
    cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
    session_len_act = ArgumentHelper.session_len(pt_group, default=2048)

    # turbomind engine args
    tb_group = parser.add_argument_group('TurboMind engine argument')
    tb_group._group_actions.append(tp_act)
    tb_group._group_actions.append(session_len_act)
    tb_group._group_actions.append(cache_count_act)
    ArgumentHelper.model_format(tb_group, default='hf')
    args = parser.parse_args()
    return args


def __proc_cb(*args, ret_pipe, target):
    try:
        ret = target(*args)
        ret_pipe[1].send(ret)
    except Exception as e:
        ret_pipe[1].send(e)


def _process_map(target, iterable):
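    """Run `target(*iterable)` in a spawned subprocess and return its result.

    Exceptions raised in the subprocess are re-raised in the caller. Each
    profiling configuration runs in its own process so that its GPU memory
    is released when the process exits.
    """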
    from multiprocessing import Pipe, get_context

    pipe = Pipe(False)
    spawn_context = get_context('spawn')
    proc = spawn_context.Process(target=__proc_cb,
                                 args=iterable,
                                 kwargs=dict(ret_pipe=pipe, target=target))
    proc.start()
    proc.join()

    ret = pipe[0].recv()
    if isinstance(ret, Exception):
        raise ret

    return ret


def main():
    args = parse_args()
    assert len(args.prompt_tokens) == len(args.completion_tokens), \
        f'mismatched size between `prompt-tokens` and `completion-tokens`' \
        f', {len(args.prompt_tokens)} vs {len(args.completion_tokens)}'

    os.environ['TM_LOG_LEVEL'] = args.log_level
    results: List[ProfileResult] = []

    for batch in args.concurrency:
        for prompt_tokens, completion_tokens in zip(args.prompt_tokens,
                                                    args.completion_tokens):
            MemoryMonitor.start()
            from functools import partial

            # make sure session_len >= prompt_tokens + completion_tokens
            session_len = max(args.session_len,
                              prompt_tokens + completion_tokens)
            if args.backend == 'turbomind':
                engine_config = TurbomindEngineConfig(
                    cache_max_entry_count=args.cache_max_entry_count,
                    model_format=args.model_format,
                    session_len=session_len,
                    tp=args.tp)
            elif args.backend == 'pytorch':
                engine_config = PytorchEngineConfig(
                    cache_max_entry_count=args.cache_max_entry_count,
                    session_len=session_len,
                    tp=args.tp,
                    thread_safe=True)
            gen_config = EngineGenerationConfig(
                top_k=args.top_k,
                top_p=args.top_p,
                temperature=args.temperature,
                max_new_tokens=completion_tokens,
                ignore_eos=True)
            profile_target = partial(
                profile_throughput,
                concurrency=batch,
                input_seqlen=prompt_tokens,
                engine_config=engine_config,
                gen_config=gen_config,
                test_round=args.test_round,
                warmup_round=args.warmup_round,
            )
            output = _process_map(profile_target, (args.model_path, ))
            model_name, first_token_latency, percentiles, \
                throughput_per_proc, tp = output
            time.sleep(5)  # wait a while for releasing GPU mem
            memory = MemoryMonitor.terminate()
            device_count = MemoryMonitor.device_count.value
            results.append(
                ProfileResult(model_name=model_name,
                              batch=batch,
                              prompt_tokens=prompt_tokens,
                              completion_tokens=completion_tokens,
                              first_token_latency=first_token_latency,
                              percentiles=percentiles,
                              throughput_per_proc=throughput_per_proc,
                              throughput_per_node=throughput_per_proc / tp *
                              device_count,
                              mem_per_proc=memory,
                              mem_per_gpu=memory / tp,
                              mem_per_node=memory / tp * device_count))
    if args.csv:
        with open(args.csv, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([
                'batch',
                'prompt_tokens',
                'completion_tokens',
                'throughput(out tok/s)',
                'mem(GB)',
                'FTL(ave)(s)',
                'FTL(min)(s)',
                'FTL(max)(s)',
                '50%(s)',
                '75%(s)',
                '95%(s)',
                '99%(s)',
            ])
            for re in results:
                writer.writerow([
                    re.batch, re.prompt_tokens, re.completion_tokens,
                    f'{re.throughput_per_proc:.2f}', f'{re.mem_per_gpu:.2f}',
                    re.first_token_latency[2], re.first_token_latency[0],
                    re.first_token_latency[1], re.percentiles[0],
                    re.percentiles[1], re.percentiles[2], re.percentiles[3]
                ])


if __name__ == '__main__':
    main()