Profile token generation with more settings (#364)

* better profiler * wait for releasing mem * remove fire * remove support for multiple model benchmark * comments * output more details * correct tp

Profile token generation with more settings (#364)
* better profiler * wait for releasing mem * remove fire * remove support for multiple model benchmark * comments * output more details * correct tp
dfa67e8c · AllentDan · GitHub · abe9f7bd · dfa67e8c · dfa67e8c
Unverified Commit dfa67e8c authored Sep 18, 2023 by AllentDan Committed by GitHub Sep 18, 2023
Show whitespace changes
Inline Side-by-side

Showing with 193 additions and 10 deletions

benchmark/README.md benchmark/README.md +6 -2

benchmark/profile_generation.py benchmark/profile_generation.py +187 -8

No files found.
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -23,10 +23,14 @@ python profile_throughput.py \
 `profile_generation.py` perform benchmark with dummy data.
+```shell
+pip install nvidia-ml-py
+```
 ```bash
 python profile_generation.py \
- /path/to/your/model \
+ --model-path /path/to/your/model \
- --concurrency 8 --input_seqlen 0 --output_seqlen 2048
+ --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
 ```
 ## profile serving

--- a/benchmark/profile_generation.py
+++ b/benchmark/profile_generation.py
+# Copyright (c) OpenMMLab. All rights reserved.
 # import multiprocessing as mp
+import argparse
+import csv
+import logging
+import os
 import os.path as osp
 import time
+from dataclasses import dataclass
 from queue import Queue
 from threading import Thread
 from typing import List
-import fire
 import numpy as np
+from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
+                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
+                    nvmlDeviceGetPowerState, nvmlDeviceGetTemperature,
+                    nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
+from tqdm import tqdm
 from lmdeploy.turbomind import Tokenizer, TurboMind
@@ -77,7 +87,7 @@ def warmup(model,
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')
-def main(model_path: str,
+def profile_throughput(model_path: str,
                       concurrency: int = 1,
                       input_seqlen: int = 0,
                       output_seqlen: int = 512,
@@ -141,7 +151,176 @@ def main(model_path: str,
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput:.2f} token/s\n{"-" * 50}')
+    return tm_model.model_name, throughput, tm_model.gpu_count
+class MemoryMonitor:
+    from multiprocessing import Manager
+    max_mem = Manager().Value('f', 0)  # GB
+    device_count = Manager().Value('f', 0)
+    @staticmethod
+    def nvidia_info():
+        # pip install nvidia-ml-py
+        nvidia_dict = {
+            'state': True,
+            'nvidia_version': '',
+            'nvidia_count': 0,
+            'gpus': []
+        }
+        try:
+            nvmlInit()
+            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
+            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
+            for i in range(nvidia_dict['nvidia_count']):
+                handle = nvmlDeviceGetHandleByIndex(i)
+                memory_info = nvmlDeviceGetMemoryInfo(handle)
+                gpu = {
+                    'gpu_name': nvmlDeviceGetName(handle),
+                    'total': memory_info.total,
+                    'free': memory_info.free,
+                    'used': memory_info.used,
+                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
+                    'powerStatus': nvmlDeviceGetPowerState(handle)
+                }
+                nvidia_dict['gpus'].append(gpu)
+        except NVMLError as _:  # noqa
+            nvidia_dict['state'] = False
+        except Exception as _:  # noqa
+            nvidia_dict['state'] = False
+        finally:
+            try:
+                nvmlShutdown()
+            except:  # noqa
+                pass
+        return nvidia_dict
+    @classmethod
+    def mem_monitor(cls):
+        info = cls.nvidia_info()
+        max_mem = 0
+        mem_start = 0
+        cls.device_count.value = len(info['gpus'])
+        for used_total in info['gpus']:
+            mem_start += used_total['used']
+        while True:
+            info = cls.nvidia_info()
+            used = 0
+            for used_total in info['gpus']:
+                used += used_total['used']
+            if used > max_mem:
+                max_mem = used
+                cls.max_mem.value = (max_mem - mem_start) / (1 << 30)
+    @classmethod
+    def start(cls):
+        cls._running = True
+        from multiprocessing import Process
+        cls.proc = Process(target=cls.mem_monitor)
+        cls.proc.start()
+    @classmethod
+    def terminate(cls) -> float:
+        """Terminate the subprocess and return maximum memory."""
+        cls.proc.kill()
+        return cls.max_mem.value
+@dataclass
+class ProfileResult:
+    model_name: str
+    batch: int
+    prompt_tokens: int
+    completion_tokens: int
+    throughput_per_proc: float
+    throughput_per_node: float
+    mem_per_proc: float
+    mem_per_gpu: float
+    mem_per_node: float
+def parse_args():
+    parser = argparse.ArgumentParser(description='Regression Test')
+    parser.add_argument('--model-path',
+                        type=str,
+                        help='benchmark test model path')
+    parser.add_argument('--concurrency',
+                        nargs='+',
+                        type=int,
+                        help='how many requests launched concurrently',
+                        default=[1, 8, 16, 32])
+    parser.add_argument(
+        '--prompt-tokens',
+        nargs='+',
+        type=int,
+        help='how many requests launched concurrently. One-to-one'
+        'correspondence with completion-tokens',
+        default=[64, 512, 512, 1024])
+    parser.add_argument('--completion-tokens',
+                        nargs='+',
+                        type=int,
+                        help='how many tokens to be generated. One-to-one'
+                        'correspondence with prompt-tokens',
+                        default=[512, 512, 1024, 1024])
+    parser.add_argument('--tp', type=int, help='Tensor parallel', default=1)
+    parser.add_argument('--dst-csv',
+                        type=str,
+                        help='Where to save the result.',
+                        default='profile_generation.csv')
+    parser.add_argument('--log-level',
+                        help='set log level',
+                        default='INFO',
+                        choices=list(logging._nameToLevel.keys()))
+    args = parser.parse_args()
+    return args
+def main():
+    args = parse_args()
+    os.environ['TM_LOG_LEVEL'] = args.log_level
+    results: List[ProfileResult] = []
+    for batch in tqdm(args.concurrency):
+        for prompt_tokens, completion_tokens in tqdm(
+                zip(args.prompt_tokens, args.completion_tokens)):
+            MemoryMonitor.start()
+            from functools import partial
+            from multiprocessing import Pool
+            profile_target = partial(profile_throughput,
+                                     concurrency=batch,
+                                     input_seqlen=prompt_tokens,
+                                     output_seqlen=completion_tokens,
+                                     tp=args.tp)
+            output = Pool(1).map(profile_target, (args.model_path, ))
+            model_name, throughput_per_proc, tp = output[0]
+            time.sleep(5)  # wait a while for releasing GPU mem
+            memory = MemoryMonitor.terminate()
+            device_count = MemoryMonitor.device_count.value
+            results.append(
+                ProfileResult(model_name=model_name,
+                              batch=batch,
+                              prompt_tokens=prompt_tokens,
+                              completion_tokens=completion_tokens,
+                              throughput_per_proc=throughput_per_proc,
+                              throughput_per_node=throughput_per_proc / tp *
+                              device_count,
+                              mem_per_proc=memory,
+                              mem_per_gpu=memory / tp,
+                              mem_per_node=memory / tp * device_count))
+    with open(args.dst_csv, 'w') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow([
+            'batch', 'prompt_tokens', 'completion_tokens',
+            'throughput_per_proc(token/s)', 'throughput_per_node(token/s)',
+            'mem_per_proc(GB)', 'mem_per_gpu(GB)', 'mem_per_node(GB)'
+        ])
+        for re in results:
+            writer.writerow([
+                re.batch, re.prompt_tokens, re.completion_tokens,
+                f'{re.throughput_per_proc:.2f}',
+                f'{re.throughput_per_node:.2f}', f'{re.mem_per_proc:.2f}',
+                f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}'
+            ])
 if __name__ == '__main__':
-    fire.Fire(main)
+    main()