Unverified commit 2ba90822, authored by Lyu Han and committed by GitHub

Fix missing arguments when benchmarking static inference performance (#787)

* minor fix in the profile scripts and docs

* missing arguments

* typo

* fix lint

* update
parent 12dc3e14
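In short: the per-session worker in profile_generation.py used to drop the sampling parameters, so every request ran with `stream_infer`'s built-in defaults regardless of what was passed on the command line. Below is a condensed sketch of the corrected flow, not the repository code verbatim; the progress bar, timing and per-token latency bookkeeping are omitted.

```python
from queue import Queue
from typing import List


def infer(model, session_id: int, input_ids: List[int], output_seqlen: int,
          top_k: int, top_p: float, temperature: float, test_round: int,
          que: Queue):
    chatbot = model.create_instance()
    stats = []
    for _ in range(test_round):
        n_token = 0
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True,
                                            stream_output=True,
                                            # the previously missing arguments
                                            top_k=top_k,
                                            top_p=top_p,
                                            temperature=temperature):
            _, n_token = outputs[0]
        stats.append(n_token)
    que.put((session_id, stats))
```

The warmup pass receives the same three keyword arguments, pinned to `top_k=1`, `top_p=1.0`, `temperature=1.0`.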
@@ -20,7 +20,10 @@ from lmdeploy.turbomind import TurboMind
def infer(model, session_id: int, input_ids: List, output_seqlen: int,
test_round: int, que: Queue):
top_k: int, top_p: float, temperature: float, test_round: int,
que: Queue):
if session_id == 1:
pbar = tqdm(total=test_round)
chatbot = model.create_instance()
stats = []
for _ in range(test_round):
@@ -45,13 +48,18 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
sequence_start=True,
sequence_end=True,
ignore_eos=True,
stream_output=True):
stream_output=True,
top_k=top_k,
top_p=top_p,
temperature=temperature):
_, n_token = outputs[0]
now = time.perf_counter()
if n_pre_token != n_token:
token_latency_stats[n_pre_token] = np.round(now - prev, 3)
n_pre_token = n_token
prev = now
if session_id == 1:
pbar.update(1)
assert output_seqlen <= n_token <= output_seqlen + 1, \
f'Error. session_id({session_id}) request {output_seqlen} ' \
@@ -60,11 +68,11 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
que.put((session_id, stats))
def warmup(model,
concurrency: int,
input_ids: List[int],
output_seqlen: int,
warmup_round: int = 2):
def warmup(model, concurrency: int, input_ids: List[int], output_seqlen: int,
warmup_round: int):
if not warmup_round:
return
print('start to warmup ...')
def _infer(model, session_id):
@@ -75,7 +83,10 @@ def warmup(model,
request_output_len=output_seqlen,
sequence_start=True,
sequence_end=True,
ignore_eos=True):
ignore_eos=True,
top_k=1,
top_p=1.0,
temperature=1.0):
continue
_start = time.perf_counter()
@@ -85,38 +96,33 @@ def warmup(model,
procs.append(proc)
proc.start()
try:
for proc in procs:
proc.join()
except Exception:
for proc in procs:
proc.stop()
exit(1)
_end = time.perf_counter()
print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')
def profile_throughput(model_path: str,
concurrency: int = 1,
input_seqlen: int = 1,
output_seqlen: int = 512,
test_round: int = 10,
tp: int = 1,
def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
output_seqlen: int, tp: int, top_k: int, top_p: float,
temperature: float, test_round: int, warmup_round: int,
**kwargs):
# avoid turbomind checking chat template name by setting
# `model_name='llama'`
print(f'profiling ... concurrency: {concurrency}, '
f'n_prompt_token: {input_seqlen}, '
f'n_completion_token: {output_seqlen}, '
f'test_round: {test_round}, warmup_round: {warmup_round}')
# avoid turbomind checking chat template name by setting `model_name='llama'` # noqa
tm_model = TurboMind(model_path=model_path,
tp=tp,
model_name='llama',
**kwargs)
tokenizer = tm_model.tokenizer
# make up a prompt that can be tokenized into {input_seqlen} tokens
# make up a dummy `input_ids` with the length of `input_seqlen` exactly
assert input_seqlen > 0, 'input_seqlen should > 0'
input_ids = tokenizer('hi').input_ids
input_ids = input_ids * input_seqlen
warmup(tm_model, concurrency, input_ids, output_seqlen)
input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
warmup(tm_model, concurrency, input_ids, output_seqlen, warmup_round)
que = Queue()
procs = []
@@ -124,18 +130,14 @@ def profile_throughput(model_path: str,
for i in range(concurrency):
proc = Thread(target=infer,
args=(tm_model, i + 1, input_ids, output_seqlen,
test_round, que))
args=(tm_model, i + 1, input_ids, output_seqlen, top_k,
top_p, temperature, test_round, que))
procs.append(proc)
proc.start()
try:
for proc in procs:
proc.join()
except Exception:
for proc in procs:
proc.stop()
exit(1)
_end = time.perf_counter()
elapsed_time = _end - _start
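A second behavioural change in the hunks above: the dummy prompt is no longer built by repeating `tokenizer('hi').input_ids`, whose length depends on how the tokenizer splits `'hi'` and on whether it prepends a BOS token, but by sampling exactly `input_seqlen` random token ids. A small illustration, assuming a Hugging Face style tokenizer for the commented-out old path:

```python
import numpy as np

input_seqlen = 128

# Old approach: repeat the ids of a short prompt. If `tokenizer('hi')`
# yields more than one id (e.g. a BOS token plus 'hi'), the prompt ends up
# being a multiple of input_seqlen rather than input_seqlen itself.
#   input_ids = tokenizer('hi').input_ids * input_seqlen

# New approach: draw input_seqlen random token ids, so the prompt length is
# exact and independent of the tokenizer or chat template.
input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
assert len(input_ids) == input_seqlen
```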
@@ -323,7 +325,11 @@ def parse_args():
parser.add_argument('--test-round',
type=int,
help='number of test rounds',
default=10)
default=6)
parser.add_argument('--warmup-round',
type=int,
help='number of warmup rounds',
default=1)
args = parser.parse_args()
return args
@@ -336,9 +342,9 @@ def main():
os.environ['TM_LOG_LEVEL'] = args.log_level
results: List[ProfileResult] = []
for batch in tqdm(args.concurrency):
for prompt_tokens, completion_tokens in tqdm(
zip(args.prompt_tokens, args.completion_tokens)):
for batch in args.concurrency:
for prompt_tokens, completion_tokens in zip(args.prompt_tokens,
args.completion_tokens):
MemoryMonitor.start()
from functools import partial
from multiprocessing import Pool
@@ -350,7 +356,8 @@ def main():
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
test_round=args.test_round)
test_round=args.test_round,
warmup_round=args.warmup_round)
output = Pool(1).map(profile_target, (args.model_path, ))
model_name, first_token_latency, percentiles, \
throughput_per_proc, tp = output[0]
@@ -370,14 +377,15 @@ def main():
mem_per_proc=memory,
mem_per_gpu=memory / tp,
mem_per_node=memory / tp * device_count))
if args.csv:
with open(args.csv, 'w') as csvfile:
writer = csv.writer(csvfile)
writer.writerow([
'batch', 'prompt_tokens', 'completion_tokens',
'1st_token_latency(min)(s)', '1st_token_latency(max)(s)',
'1st_token_latency(ave)(s)', 'percentile50(s)', 'percentile75(s)',
'percentile95(s)', 'percentile99(s)', 'throughput(token/s)',
'mem_per_proc(GB)', 'mem_per_gpu(GB)'
'1st_token_latency(ave)(s)', 'percentile50(s)',
'percentile75(s)', 'percentile95(s)', 'percentile99(s)',
'throughput(token/s)', 'mem_per_proc(GB)', 'mem_per_gpu(GB)'
])
for re in results:
writer.writerow([
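When `args.csv` is given, the collected results are written with the column names shown in the header row above. A tiny example of reading that file back; `'profile.csv'` is only a placeholder for whatever path was passed as `args.csv`:

```python
import csv

# 'profile.csv' is a placeholder; use the path passed via `args.csv`.
with open('profile.csv') as f:
    for row in csv.DictReader(f):
        print(row['batch'], row['prompt_tokens'], row['completion_tokens'],
              row['1st_token_latency(ave)(s)'], row['throughput(token/s)'])
```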
@@ -77,11 +77,13 @@ class Engine:
stats = []
for prompt, input_seqlen, output_seqlen in iter(
req_queue.get, [None, None, None]):
input_ids = self.tokenizer(prompt).input_ids
offset = 0
timestamps = []
tokens = []
timestamps.append(time.perf_counter())
input_ids = self.tokenizer(prompt).input_ids
for outputs in model_inst.stream_infer(
session_id,
input_ids=input_ids,
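In the Engine request loop above (the request-throughput benchmark), the visible change is that `input_ids = self.tokenizer(prompt).input_ids` moves relative to the first `timestamps.append(time.perf_counter())`. That position decides whether tokenization time is charged to the measured first-token latency. A standalone illustration of the trade-off; `first_token_latency` and `generate` are hypothetical helpers, not the Engine code:

```python
import time


def first_token_latency(tokenizer, generate, prompt, include_tokenization):
    """Seconds from the chosen reference point to the first streamed chunk.

    `generate(input_ids)` stands in for `model_inst.stream_infer(...)` and is
    expected to yield output incrementally.
    """
    if include_tokenization:
        start = time.perf_counter()               # reference taken first ...
        input_ids = tokenizer(prompt).input_ids   # ... tokenization is counted
    else:
        input_ids = tokenizer(prompt).input_ids   # tokenize up front ...
        start = time.perf_counter()               # ... only generation counted
    for _ in generate(input_ids):
        return time.perf_counter() - start
    return None
```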
@@ -51,7 +51,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# get internlm-7b from huggingface and convert it to turbomind format
lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b
python3 profile_throughput.py ./internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json
python3 profile_throughput.py ./ShareGPT_V3_unfiltered_cleaned_split.json ./internlm-7b
```
## Command details
@@ -51,7 +51,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b
# run the throughput benchmark script
python3 profile_throughput.py ./internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json
python3 profile_throughput.py ./ShareGPT_V3_unfiltered_cleaned_split.json ./internlm-7b
```
## Test method