"vscode:/vscode.git/clone" did not exist on "55b84eea8245f14c4f16d7f6b5532f93b872b393"
Unverified Commit ebe90bc9 authored by Lyu Han, committed by GitHub

Report the inference benchmark of models with different sizes (#794)

* update test scripts for models with different sizes

* update

* only test after tuning gemm

* chmod +x

* fix typo

* benchmark on a100

* fix typo

* fix typo

* per-token latency percentile in profile_throughput

* fix

* fix

* rename

* make the script accept parameters

* minor fix

* indent

* reformat table

* change to 3000

* minor fix
parent 5b9e454a
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-13b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 500
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of internlm-20b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert internlm-20b ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 700
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
cp gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-70b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 4000
crudini --set ${config_path} llama max_batch_size 256
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 64 128 256 \
        --csv ${output_path}/generation.csv
}
output_path="${workspace_dir}"/output/"${model_foldername}"-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-7b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 1000
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
@@ -29,14 +29,14 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
for _ in range(test_round):
token_latency_stats = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_pre_token = 0
n_prev_token = 0
"""
The iterator provided by `stream_infer` denotes the number of generated tokens so far,
which is represented by the variable `n_token`.
Please note that `n_token` is not a continuous value. In other words, during the iteration,
its value might be 5, 7, 8, 16, and so on, rather than 1, 2, 3, 4, etc.
So, it is quite difficult to get the latency of each generated token.
As a work-around, we set the latency `new-prev` of each iteration to the first token of
As a work-around, we set the latency `now-prev` of each iteration to the first token of
the new generated tokens, and leave the latency of the rest tokens being 0.
For example, in the first iteration, 5 tokens are generated.
The time elapsing in this iteration `now-prev` is set to the latency of first token of
@@ -54,9 +54,9 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
temperature=temperature):
_, n_token = outputs[0]
now = time.perf_counter()
if n_pre_token != n_token:
token_latency_stats[n_pre_token] = np.round(now - prev, 3)
n_pre_token = n_token
if n_prev_token != n_token:
token_latency_stats[n_prev_token] = np.round(now - prev, 3)
n_prev_token = n_token
prev = now
if session_id == 1:
pbar.update(1)
......
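To make the workaround in the `profile_generation.py` docstring above concrete, here is a minimal, self-contained sketch of the same bookkeeping. `fake_stream_infer` and its token counts are invented for illustration; only the latency attribution mirrors the diff.

```python
import time

import numpy as np


def fake_stream_infer():
    """Stand-in for `stream_infer`: yields the cumulative number of generated
    tokens, which may jump by several tokens per iteration (e.g. 5, 7, 8, 16)."""
    for n_token in (5, 7, 8, 16):
        time.sleep(0.01)  # pretend some decoding work happened
        yield n_token


output_seqlen = 16
token_latency_stats = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_prev_token = 0

for n_token in fake_stream_infer():
    now = time.perf_counter()
    if n_prev_token != n_token:
        # attribute this iteration's elapsed time to the first newly
        # generated token; the remaining tokens of the step keep latency 0
        token_latency_stats[n_prev_token] = np.round(now - prev, 3)
        n_prev_token = n_token
    prev = now

# only indices 0, 5, 7 and 8 hold non-zero latencies; the rest stay 0
print(token_latency_stats)
```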
@@ -158,8 +158,8 @@ class Engine:
prompt_tokens = total_tokens - completion_tokens
completion_token_throughput = completion_tokens / elapsed_time
total_token_throughput = total_tokens / elapsed_time
rqs = len(requests) / elapsed_time
rqm = rqs * 60
rps = len(requests) / elapsed_time
rpm = rps * 60
if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False:
print(f'Did not generate requested number of tokens. '
@@ -178,8 +178,8 @@ class Engine:
f'number of completion tokens: {completion_tokens:.0f}\n'
f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
f'RPS (request per second): {rqs:.3f} req/s\n'
f'RPM (request per minute): {rqm:.3f} req/min\n'
f'RPS (request per second): {rps:.3f} req/s\n'
f'RPM (request per minute): {rpm:.3f} req/min\n'
f'{"-" * 50}\n')
if self.csv:
@@ -190,7 +190,7 @@ class Engine:
'completion_tokens', '1st_token_latency(min)(s)',
'1st_token_latency(max)(s)', '1st_token_latency(ave)(s)',
'output token thr(tokens/s', 'total token thr(token/s)',
'RPM'
'RPS', 'RPM'
])
writer.writerow([
concurrency,
@@ -199,7 +199,7 @@ class Engine:
f'{first_token_latency_max:.3f}' if stream_output else '-',
f'{first_token_latency_ave:.3f}' if stream_output else '-',
f'{completion_token_throughput:.3f}',
f'{total_token_throughput:.3f}', f'{rqm:.3f}'
f'{total_token_throughput:.3f}', f'{rps:.3f}', f'{rpm:.3f}'
])
......
@@ -163,8 +163,8 @@ class Engine:
prompt_tokens = total_tokens - completion_tokens
completion_token_throughput = completion_tokens / elapsed_time
total_token_throughput = total_tokens / elapsed_time
rqs = len(requests) / elapsed_time
rqm = rqs * 60
rps = len(requests) / elapsed_time
rpm = rps * 60
if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False:
print(f'Did not generate requested number of tokens. '
@@ -183,8 +183,8 @@ class Engine:
f'number of completion tokens: {completion_tokens:.0f}\n'
f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
f'RPS (request per second): {rqs:.3f} req/s\n'
f'RPM (request per minute): {rqm:.3f} req/min\n'
f'RPS (request per second): {rps:.3f} req/s\n'
f'RPM (request per minute): {rpm:.3f} req/min\n'
f'{"-" * 50}\n')
if self.csv:
@@ -195,7 +195,7 @@ class Engine:
'completion_tokens', '1st_token_latency(min)(s)',
'1st_token_latency(max)(s)', '1st_token_latency(ave)(s)',
'output token thr(tokens/s', 'total token thr(token/s)',
'RPM'
'RPS', 'RPM'
])
writer.writerow([
concurrency,
@@ -204,7 +204,7 @@ class Engine:
f'{first_token_latency_max:.3f}' if stream_output else '-',
f'{first_token_latency_ave:.3f}' if stream_output else '-',
f'{completion_token_throughput:.3f}',
f'{total_token_throughput:.3f}', f'{rqm:.3f}'
f'{total_token_throughput:.3f}', f'{rps:.3f}', f'{rpm:.3f}'
])
......
@@ -75,13 +75,14 @@ class Engine:
stream_output: bool):
model_inst = self.tm_model.create_instance()
stats = []
# get each generated token's latency
per_token_latency_stats = []
for prompt, input_seqlen, output_seqlen in iter(
req_queue.get, [None, None, None]):
_per_token_latency_stats = [0] * (output_seqlen + 1)
offset = 0
timestamps = []
tokens = []
timestamps.append(time.perf_counter())
prev = time.perf_counter()
n_prev_token = 0
input_ids = self.tokenizer(prompt).input_ids
for outputs in model_inst.stream_infer(
@@ -94,25 +95,32 @@ class Engine:
sequence_end=True,
ignore_eos=True,
stream_output=stream_output):
res, token = outputs[0]
res, n_token = outputs[0]
self.tokenizer.decode(res, offset)
offset = token
timestamps.append(time.perf_counter())
tokens.append(token)
first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
token_latency = np.round(timestamps[-1] - timestamps[0], 3)
completion_tokens = tokens[-1]
assert output_seqlen <= completion_tokens <= output_seqlen + 1, \
offset = n_token
now = time.perf_counter()
if n_prev_token != n_token:
_per_token_latency_stats[n_prev_token] = np.round(
now - prev, 3)
n_prev_token = n_token
prev = now
assert output_seqlen <= n_token <= output_seqlen + 1, \
f'Error. session_id({session_id}) request {output_seqlen} ' \
f'tokens, but generate {completion_tokens} tokens.\n' \
f'tokens, but generate {n_token} tokens.\n' \
f'prompt: {prompt}'
total_tokens = tokens[-1] + input_seqlen
first_token_latency = _per_token_latency_stats[0]
completion_tokens = n_token
total_tokens = n_token + input_seqlen
stats.append([
first_token_latency, completion_tokens, output_seqlen,
total_tokens, token_latency
total_tokens
])
# skip the first token latency
per_token_latency_stats.append(_per_token_latency_stats[1:])
self.pbar.update(1)
res_queue.put((session_id, stats))
res_queue.put((session_id, stats, per_token_latency_stats))
def process_request(self,
requests,
@@ -146,13 +154,15 @@ class Engine:
elapsed_time = time.time() - start
stats = []
per_token_latency_stats = []
while not res_queue.empty():
session_id, _stats = res_queue.get()
# print(f'\n{"-" * 50}\n'
# f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
session_id, _stats, _per_token_latency_stats = res_queue.get()
stats.append(np.array(_stats))
stats = np.concatenate(stats).reshape(-1, 5)
per_token_latency_stats += [
item for sublist in _per_token_latency_stats
for item in sublist
]
stats = np.concatenate(stats).reshape(-1, 4)
first_token_latency_min = np.min(stats[:, 0], axis=0)
first_token_latency_max = np.max(stats[:, 0], axis=0)
@@ -162,23 +172,33 @@ class Engine:
prompt_tokens = total_tokens - completion_tokens
completion_token_throughput = completion_tokens / elapsed_time
total_token_throughput = total_tokens / elapsed_time
rqs = len(requests) / elapsed_time
rqm = rqs * 60
rps = len(requests) / elapsed_time
rpm = rps * 60
per_token_latency_stats.sort()
percentiles = [
np.round(
per_token_latency_stats[int(percent *
len(per_token_latency_stats))], 3)
for percent in [0.5, 0.75, 0.95, 0.99]
]
print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
f'elapsed_time: {elapsed_time:.3f}s\n')
if stream_output:
print(f'first_token latency(min, max, ave): '
f'{first_token_latency_min:.3f}s, '
f'{first_token_latency_max:.3f}s, '
f'{first_token_latency_ave:.3f}s\n')
print(f'first token latency(s)(min, max, ave): '
f'{first_token_latency_min:.3f}, '
f'{first_token_latency_max:.3f}, '
f'{first_token_latency_ave:.3f}')
print(f'per-token latency(s) percentile(50, 75, 95, 99): '
f'{percentiles}\n')
print(
f'number of prompt tokens: {prompt_tokens:.0f}\n'
f'number of completion tokens: {completion_tokens:.0f}\n'
f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
f'RPS (request per second): {rqs:.3f} req/s\n'
f'RPM (request per minute): {rqm:.3f} req/min\n'
f'RPS (request per second): {rps:.3f} req/s\n'
f'RPM (request per minute): {rpm:.3f} req/min\n'
f'{"-" * 50}\n')
if self.csv:
@@ -188,8 +208,9 @@ class Engine:
'batch', 'num_promts', 'prompt_tokens',
'completion_tokens', '1st_token_latency(min)(s)',
'1st_token_latency(max)(s)', '1st_token_latency(ave)(s)',
'output token thr(tokens/s', 'total token thr(token/s)',
'RPM'
'percentile50(s)', 'percentile75(s)', 'percentile95(s)',
'percentile99(s)', 'output token thr(tokens/s)',
'total token thr(token/s)', 'RPS', 'RPM'
])
writer.writerow([
concurrency,
@@ -197,8 +218,12 @@ class Engine:
f'{first_token_latency_min:.3f}' if stream_output else '-',
f'{first_token_latency_max:.3f}' if stream_output else '-',
f'{first_token_latency_ave:.3f}' if stream_output else '-',
f'{percentiles[0]:.3f}' if stream_output else '-',
f'{percentiles[1]:.3f}' if stream_output else '-',
f'{percentiles[2]:.3f}' if stream_output else '-',
f'{percentiles[3]:.3f}' if stream_output else '-',
f'{completion_token_throughput:.3f}',
f'{total_token_throughput:.3f}', f'{rqm:.3f}'
f'{total_token_throughput:.3f}', f'{rps:.3f}', f'{rpm:.3f}'
])
......
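For reference, a small sketch of how the per-token latency percentiles added to `profile_throughput.py` are computed: the per-request latency lists (with the first-token latency skipped) are flattened into one list, sorted, and indexed at the 50/75/95/99 percent positions. The latency values below are invented for illustration.

```python
import numpy as np

# per-request per-token latency lists (first-token latency already skipped);
# the values are invented for illustration only
per_request_latency_stats = [
    [0.011, 0.010, 0.012, 0.010],
    [0.013, 0.011, 0.010, 0.015],
]

# flatten across all requests, then sort, as process_request() does
per_token_latency_stats = [
    t for stats in per_request_latency_stats for t in stats
]
per_token_latency_stats.sort()

percentiles = [
    np.round(per_token_latency_stats[int(p * len(per_token_latency_stats))], 3)
    for p in (0.5, 0.75, 0.95, 0.99)
]
print(percentiles)  # 50th, 75th, 95th and 99th per-token latency percentiles
```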
# Benchmark on A100 (FP16)
All of the following results were measured on 8x A100-80G GPUs with CUDA 11.8.
The tested lmdeploy version is `v0.1.0a1`.
The commands below benchmark both [static inference performance](#static-inference-benchmark) and [request throughput](#request-throughput-benchmark) on A100-80G(x8) for models of various sizes.
```shell
bash benchmark/benchmark_7b.sh <the/path/of/llama2-7b/model>
bash benchmark/benchmark_13b.sh <the/path/of/llama2-13b/model>
bash benchmark/benchmark_20b.sh <the/path/of/internlm-20b/model>
bash benchmark/benchmark_70b.sh <the/path/of/llama2-70b/model>
```
## Static Inference Benchmark
FTL: **F**irst **T**oken **L**atency
The 50%(s), 75%(s), 95%(s) and 99%(s) columns report per-token latency percentiles.
### llama2-7b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 1 | 1 | 128 | 100.02 | 76.55 | 0.011 | 0.01 | 0.011 | 0.009 | 0.009 | 0.01 | 0.011 |
| 1 | 1 | 128 | 128 | 102.21 | 76.59 | 0.022 | 0.022 | 0.022 | 0.01 | 0.01 | 0.01 | 0.01 |
| 1 | 1 | 128 | 2048 | 98.92 | 76.59 | 0.022 | 0.022 | 0.022 | 0.01 | 0.01 | 0.01 | 0.01 |
| 1 | 1 | 2048 | 128 | 86.1 | 76.77 | 0.139 | 0.139 | 0.14 | 0.01 | 0.01 | 0.01 | 0.011 |
| 1 | 1 | 2048 | 2048 | 93.78 | 76.77 | 0.14 | 0.139 | 0.141 | 0.011 | 0.011 | 0.011 | 0.011 |
| 16 | 1 | 1 | 128 | 1504.72 | 76.59 | 0.021 | 0.011 | 0.031 | 0.01 | 0.011 | 0.011 | 0.013 |
| 16 | 1 | 128 | 128 | 1272.47 | 76.77 | 0.129 | 0.023 | 0.149 | 0.011 | 0.011 | 0.012 | 0.014 |
| 16 | 1 | 128 | 2048 | 1010.62 | 76.77 | 0.13 | 0.023 | 0.144 | 0.015 | 0.018 | 0.02 | 0.021 |
| 16 | 1 | 2048 | 128 | 348.87 | 78.3 | 2.897 | 0.143 | 3.576 | 0.02 | 0.021 | 0.022 | 0.025 |
| 16 | 1 | 2048 | 2048 | 601.63 | 78.3 | 2.678 | 0.142 | 3.084 | 0.025 | 0.028 | 0.03 | 0.031 |
| 32 | 1 | 1 | 128 | 2136.73 | 76.62 | 0.079 | 0.014 | 0.725 | 0.011 | 0.012 | 0.013 | 0.021 |
| 32 | 1 | 128 | 128 | 2125.47 | 76.99 | 0.214 | 0.022 | 0.359 | 0.012 | 0.013 | 0.014 | 0.035 |
| 32 | 1 | 128 | 2048 | 1462.12 | 76.99 | 0.2 | 0.026 | 0.269 | 0.021 | 0.026 | 0.031 | 0.033 |
| 32 | 1 | 2048 | 128 | 450.43 | 78.3 | 4.288 | 0.143 | 5.267 | 0.031 | 0.032 | 0.034 | 0.161 |
| 32 | 1 | 2048 | 2048 | 733.34 | 78.34 | 4.118 | 0.19 | 5.429 | 0.04 | 0.045 | 0.05 | 0.053 |
| 64 | 1 | 1 | 128 | 4154.81 | 76.71 | 0.042 | 0.013 | 0.21 | 0.012 | 0.018 | 0.028 | 0.041 |
| 64 | 1 | 128 | 128 | 3024.07 | 77.43 | 0.44 | 0.026 | 1.061 | 0.014 | 0.018 | 0.026 | 0.158 |
| 64 | 1 | 128 | 2048 | 1852.06 | 77.96 | 0.535 | 0.027 | 1.231 | 0.03 | 0.041 | 0.048 | 0.053 |
| 64 | 1 | 2048 | 128 | 493.46 | 78.4 | 6.59 | 0.142 | 16.235 | 0.046 | 0.049 | 0.055 | 0.767 |
| 64 | 1 | 2048 | 2048 | 755.65 | 78.4 | 39.105 | 0.142 | 116.285 | 0.047 | 0.049 | 0.051 | 0.207 |
### llama2-13b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 1 | 1 | 128 | 57.49 | 74.84 | 0.018 | 0.018 | 0.019 | 0.017 | 0.017 | 0.017 | 0.017 |
| 1 | 1 | 128 | 128 | 56.58 | 74.84 | 0.04 | 0.039 | 0.04 | 0.017 | 0.017 | 0.017 | 0.018 |
| 1 | 1 | 128 | 2048 | 55.29 | 74.84 | 0.04 | 0.04 | 0.04 | 0.018 | 0.018 | 0.018 | 0.019 |
| 1 | 1 | 2048 | 128 | 48.99 | 75.09 | 0.242 | 0.242 | 0.243 | 0.019 | 0.019 | 0.019 | 0.019 |
| 1 | 1 | 2048 | 2048 | 52.12 | 75.09 | 0.243 | 0.24 | 0.244 | 0.019 | 0.019 | 0.019 | 0.02 |
| 16 | 1 | 1 | 128 | 869.45 | 74.87 | 0.036 | 0.019 | 0.053 | 0.018 | 0.019 | 0.019 | 0.02 |
| 16 | 1 | 128 | 128 | 757.3 | 75.09 | 0.252 | 0.041 | 0.272 | 0.019 | 0.02 | 0.02 | 0.021 |
| 16 | 1 | 128 | 2048 | 605.88 | 75.09 | 0.253 | 0.041 | 0.275 | 0.026 | 0.03 | 0.033 | 0.034 |
| 16 | 1 | 2048 | 128 | 257.92 | 76.96 | 3.442 | 0.245 | 3.668 | 0.033 | 0.034 | 0.035 | 0.035 |
| 16 | 1 | 2048 | 2048 | 366.67 | 76.99 | 3.122 | 0.249 | 3.671 | 0.04 | 0.044 | 0.047 | 0.047 |
| 32 | 1 | 1 | 128 | 1667.5 | 74.9 | 0.034 | 0.021 | 0.057 | 0.019 | 0.02 | 0.021 | 0.023 |
| 32 | 1 | 128 | 128 | 1301.27 | 75.37 | 0.461 | 0.04 | 0.497 | 0.021 | 0.022 | 0.023 | 0.025 |
| 32 | 1 | 128 | 2048 | 860.14 | 75.84 | 0.833 | 0.041 | 1.151 | 0.034 | 0.042 | 0.047 | 0.048 |
| 32 | 1 | 2048 | 128 | 291.54 | 77.02 | 5.315 | 0.245 | 13.483 | 0.046 | 0.047 | 0.049 | 0.51 |
| 32 | 1 | 2048 | 2048 | 389.64 | 77.02 | 38.725 | 0.245 | 108.104 | 0.047 | 0.047 | 0.049 | 0.05 |
| 64 | 1 | 1 | 128 | 3049.16 | 74.96 | 0.044 | 0.025 | 0.073 | 0.02 | 0.022 | 0.026 | 0.029 |
| 64 | 1 | 128 | 128 | 2033.22 | 75.87 | 0.703 | 0.046 | 0.951 | 0.024 | 0.026 | 0.029 | 0.032 |
| 64 | 1 | 128 | 2048 | 998.86 | 76.9 | 7.805 | 0.042 | 60.1 | 0.045 | 0.047 | 0.05 | 0.063 |
| 64 | 1 | 2048 | 128 | 286.32 | 76.99 | 19.69 | 0.245 | 32.394 | 0.047 | 0.048 | 0.05 | 0.27 |
| 64 | 1 | 2048 | 2048 | 387.86 | 77.09 | 190.453 | 0.245 | 307.331 | 0.047 | 0.048 | 0.049 | 0.05 |
### internlm-20b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 2 | 1 | 128 | 61.14 | 73.55 | 0.018 | 0.017 | 0.019 | 0.016 | 0.016 | 0.016 | 0.018 |
| 1 | 2 | 128 | 128 | 60.03 | 73.55 | 0.042 | 0.041 | 0.043 | 0.016 | 0.016 | 0.016 | 0.017 |
| 1 | 2 | 128 | 2048 | 58.26 | 73.55 | 0.042 | 0.042 | 0.043 | 0.017 | 0.017 | 0.018 | 0.018 |
| 1 | 2 | 2048 | 128 | 51.93 | 73.68 | 0.217 | 0.216 | 0.217 | 0.018 | 0.018 | 0.018 | 0.018 |
| 1 | 2 | 2048 | 2048 | 56.36 | 73.68 | 0.217 | 0.217 | 0.217 | 0.018 | 0.018 | 0.018 | 0.018 |
| 16 | 2 | 1 | 128 | 903.01 | 73.65 | 0.034 | 0.018 | 0.051 | 0.017 | 0.018 | 0.019 | 0.02 |
| 16 | 2 | 128 | 128 | 794.13 | 73.74 | 0.227 | 0.043 | 0.248 | 0.018 | 0.019 | 0.02 | 0.021 |
| 16 | 2 | 128 | 2048 | 669.87 | 73.74 | 0.227 | 0.043 | 0.25 | 0.024 | 0.027 | 0.029 | 0.03 |
| 16 | 2 | 2048 | 128 | 288.60 | 75.60 | 3.09 | 0.247 | 4.485 | 0.029 | 0.03 | 0.031 | 0.032 |
| 16 | 2 | 2048 | 2048 | 441.46 | 75.61 | 3.172 | 0.219 | 4.442 | 0.035 | 0.037 | 0.04 | 0.041 |
| 32 | 2 | 1 | 128 | 1673.64 | 73.71 | 0.037 | 0.02 | 0.066 | 0.019 | 0.02 | 0.021 | 0.023 |
| 32 | 2 | 128 | 128 | 1347.57 | 73.90 | 0.351 | 0.043 | 0.436 | 0.02 | 0.021 | 0.023 | 0.025 |
| 32 | 2 | 128 | 2048 | 1025.62 | 73.90 | 0.391 | 0.042 | 0.441 | 0.031 | 0.037 | 0.041 | 0.043 |
| 32 | 2 | 2048 | 128 | 352.45 | 75.74 | 6.062 | 0.218 | 6.3 | 0.042 | 0.043 | 0.045 | 0.046 |
| 32 | 2 | 2048 | 2048 | 514.60 | 75.77 | 10.36 | 0.222 | 70.328 | 0.049 | 0.05 | 0.051 | 0.053 |
| 64 | 2 | 1 | 128 | 2954.34 | 73.82 | 0.05 | 0.029 | 0.074 | 0.021 | 0.023 | 0.026 | 0.03 |
| 64 | 2 | 128 | 128 | 2122.92 | 74.24 | 0.591 | 0.047 | 0.808 | 0.024 | 0.026 | 0.029 | 0.032 |
| 64 | 2 | 128 | 2048 | 1276.61 | 75.18 | 2.529 | 0.049 | 41.212 | 0.042 | 0.048 | 0.052 | 0.055 |
| 64 | 2 | 2048 | 128 | 350.82 | 75.88 | 12.382 | 0.219 | 20.986 | 0.05 | 0.051 | 0.054 | 0.249 |
| 64 | 2 | 2048 | 2048 | 512.37 | 76.26 | 111.149 | 0.221 | 211.531 | 0.05 | 0.051 | 0.052 | 0.055 |
### llama2-70b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 4 | 1 | 128 | 33.94 | 73.72 | 0.031 | 0.03 | 0.031 | 0.029 | 0.029 | 0.029 | 0.03 |
| 1 | 4 | 128 | 128 | 33.63 | 73.72 | 0.074 | 0.073 | 0.074 | 0.029 | 0.029 | 0.029 | 0.03 |
| 1 | 4 | 128 | 2048 | 32.38 | 73.72 | 0.074 | 0.074 | 0.075 | 0.031 | 0.031 | 0.031 | 0.031 |
| 1 | 4 | 2048 | 128 | 28.32 | 73.78 | 0.402 | 0.401 | 0.403 | 0.031 | 0.031 | 0.031 | 0.051 |
| 1 | 4 | 2048 | 2048 | 31.9 | 73.78 | 0.405 | 0.402 | 0.407 | 0.031 | 0.031 | 0.031 | 0.031 |
| 16 | 4 | 1 | 128 | 468.52 | 73.72 | 0.071 | 0.034 | 0.939 | 0.03 | 0.031 | 0.032 | 0.251 |
| 16 | 4 | 128 | 128 | 439.77 | 73.81 | 0.437 | 0.08 | 0.687 | 0.03 | 0.031 | 0.032 | 0.207 |
| 16 | 4 | 128 | 2048 | 482.99 | 73.81 | 0.403 | 0.079 | 0.44 | 0.033 | 0.033 | 0.035 | 0.036 |
| 16 | 4 | 2048 | 128 | 189.34 | 73.98 | 5.776 | 0.437 | 7.612 | 0.035 | 0.036 | 0.036 | 0.037 |
| 16 | 4 | 2048 | 2048 | 399.42 | 73.98 | 5.773 | 0.411 | 6.844 | 0.036 | 0.037 | 0.038 | 0.041 |
| 32 | 4 | 1 | 128 | 906.03 | 73.75 | 0.098 | 0.043 | 0.253 | 0.032 | 0.033 | 0.035 | 0.178 |
| 32 | 4 | 128 | 128 | 746.36 | 73.91 | 0.749 | 0.078 | 1.026 | 0.032 | 0.033 | 0.035 | 0.438 |
| 32 | 4 | 128 | 2048 | 853.56 | 73.91 | 0.732 | 0.076 | 1.129 | 0.036 | 0.038 | 0.041 | 0.158 |
| 32 | 4 | 2048 | 128 | 232.6 | 73.99 | 11.834 | 0.408 | 13.321 | 0.04 | 0.041 | 0.043 | 0.248 |
| 32 | 4 | 2048 | 2048 | 636.23 | 73.99 | 11.711 | 0.409 | 12.689 | 0.043 | 0.045 | 0.048 | 0.179 |
| 64 | 4 | 1 | 128 | 1425.79 | 73.81 | 0.213 | 0.046 | 1.264 | 0.037 | 0.039 | 0.044 | 0.329 |
| 64 | 4 | 128 | 128 | 1159.84 | 73.96 | 1.292 | 0.107 | 2.676 | 0.037 | 0.04 | 0.045 | 0.378 |
| 64 | 4 | 128 | 2048 | 1391.8 | 73.95 | 1.173 | 0.135 | 1.623 | 0.043 | 0.047 | 0.052 | 0.251 |
| 64 | 4 | 2048 | 128 | 270.47 | 74.02 | 17.402 | 0.452 | 24.164 | 0.05 | 0.052 | 0.057 | 0.345 |
| 64 | 4 | 2048 | 2048 | 930.46 | 74.01 | 21.29 | 0.423 | 24.498 | 0.055 | 0.059 | 0.065 | 0.299 |
## Request Throughput Benchmark
FTL: **F**irst **T**oken **L**atency
| model | batch | tp | num_prompts | RPS | RPM | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | throughput(out tok/s) | throughput(total tok/s) |
| ------------ | ----- | --- | ----------- | ------ | ------- | ----------- | ----------- | ----------- | --------------------- | ----------------------- |
| llama2-7b | 64 | 1 | 3000 | 10.275 | 616.477 | 0.092 | 0.036 | 1.145 | 2562.435 | 5283.547 |
| | 128 | 1 | 3000 | 12.611 | 756.677 | 0.205 | 0.056 | 2.241 | 3210.281 | 6619.357 |
| llama2-13b | 64 | 1 | 3000 | 6.337 | 380.244 | 0.159 | 0.051 | 2.048 | 1474.786 | 3039.398 |
| | 128 | 1 | 3000 | 7.588 | 455.273 | 0.412 | 0.085 | 4.445 | 1765.788 | 3639.128 |
| internlm-20b | 64 | 2 | 3000 | 7.842 | 470.516 | 0.166 | 0.059 | 2.461 | 1564.696 | 3311.16 |
| | 128 | 2 | 3000 | 9.776 | 586.568 | 0.34 | 0.079 | 5.808 | 1950.627 | 4127.855 |
| llama2-70b | 64 | 4 | 3000 | 4.285 | 257.08 | 0.301 | 0.083 | 4.689 | 1000.376 | 2062.7 |
| | 128 | 4 | 3000 | 5.833 | 349.996 | 0.633 | 0.107 | 8.431 | 1361.939 | 2808.216 |
| | 256 | 4 | 3000 | 6.568 | 394.108 | 1.49 | 0.171 | 19.52 | 1533.592 | 3162.15 |