"vscode:/vscode.git/clone" did not exist on "55b84eea8245f14c4f16d7f6b5532f93b872b393"
Unverified Commit ebe90bc9 authored by Lyu Han, committed by GitHub

Report the inference benchmark of models with different sizes (#794)

* update test scripts for models with different sizes

* update

* only test after tuning gemm

* chmod +x

* fix typo

* benchmark on a100

* fix typo

* fix typo

* per-token latency percentile in profile_throughput

* fix

* fix

* rename

* make the script accept parameters

* minor fix

* indent

* reformat table

* change to 3000

* minor fix
parent 5b9e454a
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-13b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 500
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of internlm-20b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=2
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert internlm-20b ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 700
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
cp gemm_config.in ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-70b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=4
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 4000
crudini --set ${config_path} llama max_batch_size 256
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128 256)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 64 128 256 \
        --csv ${output_path}/generation.csv
}
output_path="${workspace_dir}"/output/"${model_foldername}"-tp"${tp}"
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
#!/bin/bash
if [ -z "$1" ]
then
echo "Error. Please input the model path of llama2-7b model"
exit 1
fi
workspace_dir=$(dirname $(realpath "$0"))
tp=1
model_path="$1"
model_foldername=$(basename "$model_path")
turbomind_model_path="${workspace_dir}"/workspace/"${model_foldername}"
# convert
lmdeploy convert llama2 ${model_path} --dst-path ${turbomind_model_path} --tp ${tp}
if [ $? -ne 0 ]
then
    exit 1
fi
# update recommended config to config.ini
config_path=${turbomind_model_path}/triton_models/weights/config.ini
apt-get update
apt-get install crudini -y
crudini --set ${config_path} llama max_context_token_num 4
crudini --set ${config_path} llama cache_chunk_size -1
crudini --set ${config_path} llama cache_max_entry_count 1000
crudini --set ${config_path} llama max_batch_size 128
# end of update config
cd ${workspace_dir}
# download dataset
wget -O ShareGPT_V3_unfiltered_cleaned_split.json https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
benchmark_rpm () {
    output_path=$1
    mkdir -p "${output_path}"
    batches=(64 128)
    for batch in "${batches[@]}"
    do
        for i in {1..3}
        do
            python3 profile_throughput.py \
                ShareGPT_V3_unfiltered_cleaned_split.json \
                ${turbomind_model_path} \
                --concurrency "$batch" \
                --num_prompts 3000 \
                --csv ${output_path}/rpm_localhost_batch_"${batch}"_"${i}"th.csv
        done
    done
}

benchmark_generation () {
    output_path=$1
    mkdir -p "${output_path}"
    python3 profile_generation.py \
        ${turbomind_model_path} \
        --concurrency 1 16 32 64 \
        --csv ${output_path}/generation.csv
}
################################# BENCHMARK AFTER TUNING GEMM #################################
output_path="${workspace_dir}"/output/"${model_foldername}"-tunned-gemm-tp"${tp}"
# tune gemm
head_num=$(crudini --get "${config_path}" llama head_num)
size_per_head=$(crudini --get "${config_path}" llama size_per_head)
vocab_size=$(crudini --get "${config_path}" llama vocab_size)
inter_size=$(crudini --get "${config_path}" llama inter_size)
tensor_para_size=$(crudini --get "${config_path}" llama tensor_para_size)
max_batch_size=$(crudini --get "${config_path}" llama max_batch_size)
echo $head_num, $size_per_head, $vocab_size, $inter_size, $tensor_para_size, $max_batch_size
python3 -m lmdeploy.turbomind.generate_gemm_config \
--head_num ${head_num} \
--size_per_head ${size_per_head} \
--vocab_size ${vocab_size} \
--inter_size ${inter_size} \
--tensor_para_size ${tensor_para_size} \
--max_batch_size ${max_batch_size}
# benchmark request throughput and static inference
benchmark_rpm ${output_path}
benchmark_generation ${output_path}
mv gemm_config.in ${output_path}
@@ -29,14 +29,14 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
for _ in range(test_round):
token_latency_stats = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_pre_token = 0
n_prev_token = 0
"""
The iterator provided by `stream_infer` denotes the number of generated tokens so far,
which is represented by the variable `n_token`.
Please note that `n_token` is not a continuous value. In other words, during the iteration,
its value might be 5, 7, 8, 16, and so on, rather than 1, 2, 3, 4, etc.
So, it is quite difficult to get the latency of each generated token.
As a work-around, we set the latency `new-prev` of each iteration to the first token of
As a work-around, we set the latency `now-prev` of each iteration to the first token of
the new generated tokens, and leave the latency of the rest tokens being 0.
For example, in the first iteration, 5 tokens are generated.
The time elapsing in this iteration `now-prev` is set to the latency of first token of
@@ -54,9 +54,9 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
temperature=temperature):
_, n_token = outputs[0]
now = time.perf_counter()
if n_pre_token != n_token:
token_latency_stats[n_pre_token] = np.round(now - prev, 3)
n_pre_token = n_token
if n_prev_token != n_token:
token_latency_stats[n_prev_token] = np.round(now - prev, 3)
n_prev_token = n_token
prev = now
if session_id == 1:
pbar.update(1)
......
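To make the workaround in the `profile_generation.py` docstring above concrete, here is a minimal, self-contained sketch of the same bookkeeping. `fake_stream_infer` and its token counts are invented for illustration; only the latency attribution mirrors the diff.

```python
import time

import numpy as np


def fake_stream_infer():
    """Stand-in for `stream_infer`: yields the cumulative number of generated
    tokens, which may jump by several tokens per iteration (e.g. 5, 7, 8, 16)."""
    for n_token in (5, 7, 8, 16):
        time.sleep(0.01)  # pretend some decoding work happened
        yield n_token


output_seqlen = 16
token_latency_stats = [0] * (output_seqlen + 1)
prev = time.perf_counter()
n_prev_token = 0

for n_token in fake_stream_infer():
    now = time.perf_counter()
    if n_prev_token != n_token:
        # attribute this iteration's elapsed time to the first newly
        # generated token; the remaining tokens of the step keep latency 0
        token_latency_stats[n_prev_token] = np.round(now - prev, 3)
        n_prev_token = n_token
    prev = now

# only indices 0, 5, 7 and 8 hold non-zero latencies; the rest stay 0
print(token_latency_stats)
```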
@@ -158,8 +158,8 @@ class Engine:
prompt_tokens = total_tokens - completion_tokens
completion_token_throughput = completion_tokens / elapsed_time
total_token_throughput = total_tokens / elapsed_time
rqs = len(requests) / elapsed_time
rqm = rqs * 60
rps = len(requests) / elapsed_time
rpm = rps * 60
if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False:
print(f'Did not generate requested number of tokens. '
@@ -178,8 +178,8 @@ class Engine:
f'number of completion tokens: {completion_tokens:.0f}\n'
f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
f'RPS (request per second): {rqs:.3f} req/s\n'
f'RPM (request per minute): {rqm:.3f} req/min\n'
f'RPS (request per second): {rps:.3f} req/s\n'
f'RPM (request per minute): {rpm:.3f} req/min\n'
f'{"-" * 50}\n')
if self.csv:
@@ -190,7 +190,7 @@ class Engine:
'completion_tokens', '1st_token_latency(min)(s)',
'1st_token_latency(max)(s)', '1st_token_latency(ave)(s)',
'output token thr(tokens/s', 'total token thr(token/s)',
'RPM'
'RPS', 'RPM'
])
writer.writerow([
concurrency,
@@ -199,7 +199,7 @@ class Engine:
f'{first_token_latency_max:.3f}' if stream_output else '-',
f'{first_token_latency_ave:.3f}' if stream_output else '-',
f'{completion_token_throughput:.3f}',
f'{total_token_throughput:.3f}', f'{rqm:.3f}'
f'{total_token_throughput:.3f}', f'{rps:.3f}', f'{rpm:.3f}'
])
......
@@ -163,8 +163,8 @@ class Engine:
prompt_tokens = total_tokens - completion_tokens
completion_token_throughput = completion_tokens / elapsed_time
total_token_throughput = total_tokens / elapsed_time
rqs = len(requests) / elapsed_time
rqm = rqs * 60
rps = len(requests) / elapsed_time
rpm = rps * 60
if (np.abs(stats[:, 1] - stats[:, 2]) <= 1).min() is False:
print(f'Did not generate requested number of tokens. '
@@ -183,8 +183,8 @@ class Engine:
f'number of completion tokens: {completion_tokens:.0f}\n'
f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
f'RPS (request per second): {rqs:.3f} req/s\n'
f'RPM (request per minute): {rqm:.3f} req/min\n'
f'RPS (request per second): {rps:.3f} req/s\n'
f'RPM (request per minute): {rpm:.3f} req/min\n'
f'{"-" * 50}\n')
if self.csv:
@@ -195,7 +195,7 @@ class Engine:
'completion_tokens', '1st_token_latency(min)(s)',
'1st_token_latency(max)(s)', '1st_token_latency(ave)(s)',
'output token thr(tokens/s', 'total token thr(token/s)',
'RPM'
'RPS', 'RPM'
])
writer.writerow([
concurrency,
@@ -204,7 +204,7 @@ class Engine:
f'{first_token_latency_max:.3f}' if stream_output else '-',
f'{first_token_latency_ave:.3f}' if stream_output else '-',
f'{completion_token_throughput:.3f}',
f'{total_token_throughput:.3f}', f'{rqm:.3f}'
f'{total_token_throughput:.3f}', f'{rps:.3f}', f'{rpm:.3f}'
])
......
@@ -75,13 +75,14 @@ class Engine:
stream_output: bool):
model_inst = self.tm_model.create_instance()
stats = []
# get each generated token's latency
per_token_latency_stats = []
for prompt, input_seqlen, output_seqlen in iter(
req_queue.get, [None, None, None]):
_per_token_latency_stats = [0] * (output_seqlen + 1)
offset = 0
timestamps = []
tokens = []
timestamps.append(time.perf_counter())
prev = time.perf_counter()
n_prev_token = 0
input_ids = self.tokenizer(prompt).input_ids
for outputs in model_inst.stream_infer(
@@ -94,25 +95,32 @@ class Engine:
sequence_end=True,
ignore_eos=True,
stream_output=stream_output):
res, token = outputs[0]
res, n_token = outputs[0]
self.tokenizer.decode(res, offset)
offset = token
timestamps.append(time.perf_counter())
tokens.append(token)
first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
token_latency = np.round(timestamps[-1] - timestamps[0], 3)
completion_tokens = tokens[-1]
assert output_seqlen <= completion_tokens <= output_seqlen + 1, \
offset = n_token
now = time.perf_counter()
if n_prev_token != n_token:
_per_token_latency_stats[n_prev_token] = np.round(
now - prev, 3)
n_prev_token = n_token
prev = now
assert output_seqlen <= n_token <= output_seqlen + 1, \
f'Error. session_id({session_id}) request {output_seqlen} ' \
f'tokens, but generate {completion_tokens} tokens.\n' \
f'tokens, but generate {n_token} tokens.\n' \
f'prompt: {prompt}'
total_tokens = tokens[-1] + input_seqlen
first_token_latency = _per_token_latency_stats[0]
completion_tokens = n_token
total_tokens = n_token + input_seqlen
stats.append([
first_token_latency, completion_tokens, output_seqlen,
total_tokens, token_latency
total_tokens
])
# skip the first token latency
per_token_latency_stats.append(_per_token_latency_stats[1:])
self.pbar.update(1)
res_queue.put((session_id, stats))
res_queue.put((session_id, stats, per_token_latency_stats))
def process_request(self,
requests,
@@ -146,13 +154,15 @@ class Engine:
elapsed_time = time.time() - start
stats = []
per_token_latency_stats = []
while not res_queue.empty():
session_id, _stats = res_queue.get()
# print(f'\n{"-" * 50}\n'
# f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
session_id, _stats, _per_token_latency_stats = res_queue.get()
stats.append(np.array(_stats))
stats = np.concatenate(stats).reshape(-1, 5)
per_token_latency_stats += [
item for sublist in _per_token_latency_stats
for item in sublist
]
stats = np.concatenate(stats).reshape(-1, 4)
first_token_latency_min = np.min(stats[:, 0], axis=0)
first_token_latency_max = np.max(stats[:, 0], axis=0)
@@ -162,23 +172,33 @@ class Engine:
prompt_tokens = total_tokens - completion_tokens
completion_token_throughput = completion_tokens / elapsed_time
total_token_throughput = total_tokens / elapsed_time
rqs = len(requests) / elapsed_time
rqm = rqs * 60
rps = len(requests) / elapsed_time
rpm = rps * 60
per_token_latency_stats.sort()
percentiles = [
np.round(
per_token_latency_stats[int(percent *
len(per_token_latency_stats))], 3)
for percent in [0.5, 0.75, 0.95, 0.99]
]
print(f'\n{"-" * 50}\nconcurrency: {concurrency}\n'
f'elapsed_time: {elapsed_time:.3f}s\n')
if stream_output:
print(f'first_token latency(min, max, ave): '
f'{first_token_latency_min:.3f}s, '
f'{first_token_latency_max:.3f}s, '
f'{first_token_latency_ave:.3f}s\n')
print(f'first token latency(s)(min, max, ave): '
f'{first_token_latency_min:.3f}, '
f'{first_token_latency_max:.3f}, '
f'{first_token_latency_ave:.3f}')
print(f'per-token latency(s) percentile(50, 75, 95, 99): '
f'{percentiles}\n')
print(
f'number of prompt tokens: {prompt_tokens:.0f}\n'
f'number of completion tokens: {completion_tokens:.0f}\n'
f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n' # noqa
f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n' # noqa
f'RPS (request per second): {rqs:.3f} req/s\n'
f'RPM (request per minute): {rqm:.3f} req/min\n'
f'RPS (request per second): {rps:.3f} req/s\n'
f'RPM (request per minute): {rpm:.3f} req/min\n'
f'{"-" * 50}\n')
if self.csv:
@@ -188,8 +208,9 @@ class Engine:
'batch', 'num_promts', 'prompt_tokens',
'completion_tokens', '1st_token_latency(min)(s)',
'1st_token_latency(max)(s)', '1st_token_latency(ave)(s)',
'output token thr(tokens/s', 'total token thr(token/s)',
'RPM'
'percentile50(s)', 'percentile75(s)', 'percentile95(s)',
'percentile99(s)', 'output token thr(tokens/s)',
'total token thr(token/s)', 'RPS', 'RPM'
])
writer.writerow([
concurrency,
@@ -197,8 +218,12 @@ class Engine:
f'{first_token_latency_min:.3f}' if stream_output else '-',
f'{first_token_latency_max:.3f}' if stream_output else '-',
f'{first_token_latency_ave:.3f}' if stream_output else '-',
f'{percentiles[0]:.3f}' if stream_output else '-',
f'{percentiles[1]:.3f}' if stream_output else '-',
f'{percentiles[2]:.3f}' if stream_output else '-',
f'{percentiles[3]:.3f}' if stream_output else '-',
f'{completion_token_throughput:.3f}',
f'{total_token_throughput:.3f}', f'{rqm:.3f}'
f'{total_token_throughput:.3f}', f'{rps:.3f}', f'{rpm:.3f}'
])
......
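For reference, a small sketch of how the per-token latency percentiles added to `profile_throughput.py` are computed: the per-request latency lists (with the first-token latency skipped) are flattened into one list, sorted, and indexed at the 50/75/95/99 percent positions. The latency values below are invented for illustration.

```python
import numpy as np

# per-request per-token latency lists (first-token latency already skipped);
# the values are invented for illustration only
per_request_latency_stats = [
    [0.011, 0.010, 0.012, 0.010],
    [0.013, 0.011, 0.010, 0.015],
]

# flatten across all requests, then sort, as process_request() does
per_token_latency_stats = [
    t for stats in per_request_latency_stats for t in stats
]
per_token_latency_stats.sort()

percentiles = [
    np.round(per_token_latency_stats[int(p * len(per_token_latency_stats))], 3)
    for p in (0.5, 0.75, 0.95, 0.99)
]
print(percentiles)  # 50th, 75th, 95th and 99th per-token latency percentiles
```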
# Benchmark on A100 (FP16)
All of the following results were measured on 8x A100-80G GPUs with CUDA 11.8.
The tested lmdeploy version is `v0.1.0a1`.
The commands below benchmark both [static inference performance](#static-inference-benchmark) and [request throughput](#request-throughput-benchmark) on A100-80G(x8) for models of various sizes.
```shell
bash benchmark/benchmark_7b.sh <the/path/of/llama2-7b/model>
bash benchmark/benchmark_13b.sh <the/path/of/llama2-13b/model>
bash benchmark/benchmark_20b.sh <the/path/of/internlm-20b/model>
bash benchmark/benchmark_70b.sh <the/path/of/llama2-70b/model>
```
## Static Inference Benchmark
FTL: **F**irst **T**oken **L**atency
The 50%(s), 75%(s), 95%(s) and 99%(s) columns report per-token latency percentiles.
### llama2-7b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 1 | 1 | 128 | 100.02 | 76.55 | 0.011 | 0.01 | 0.011 | 0.009 | 0.009 | 0.01 | 0.011 |
| 1 | 1 | 128 | 128 | 102.21 | 76.59 | 0.022 | 0.022 | 0.022 | 0.01 | 0.01 | 0.01 | 0.01 |
| 1 | 1 | 128 | 2048 | 98.92 | 76.59 | 0.022 | 0.022 | 0.022 | 0.01 | 0.01 | 0.01 | 0.01 |
| 1 | 1 | 2048 | 128 | 86.1 | 76.77 | 0.139 | 0.139 | 0.14 | 0.01 | 0.01 | 0.01 | 0.011 |
| 1 | 1 | 2048 | 2048 | 93.78 | 76.77 | 0.14 | 0.139 | 0.141 | 0.011 | 0.011 | 0.011 | 0.011 |
| 16 | 1 | 1 | 128 | 1504.72 | 76.59 | 0.021 | 0.011 | 0.031 | 0.01 | 0.011 | 0.011 | 0.013 |
| 16 | 1 | 128 | 128 | 1272.47 | 76.77 | 0.129 | 0.023 | 0.149 | 0.011 | 0.011 | 0.012 | 0.014 |
| 16 | 1 | 128 | 2048 | 1010.62 | 76.77 | 0.13 | 0.023 | 0.144 | 0.015 | 0.018 | 0.02 | 0.021 |
| 16 | 1 | 2048 | 128 | 348.87 | 78.3 | 2.897 | 0.143 | 3.576 | 0.02 | 0.021 | 0.022 | 0.025 |
| 16 | 1 | 2048 | 2048 | 601.63 | 78.3 | 2.678 | 0.142 | 3.084 | 0.025 | 0.028 | 0.03 | 0.031 |
| 32 | 1 | 1 | 128 | 2136.73 | 76.62 | 0.079 | 0.014 | 0.725 | 0.011 | 0.012 | 0.013 | 0.021 |
| 32 | 1 | 128 | 128 | 2125.47 | 76.99 | 0.214 | 0.022 | 0.359 | 0.012 | 0.013 | 0.014 | 0.035 |
| 32 | 1 | 128 | 2048 | 1462.12 | 76.99 | 0.2 | 0.026 | 0.269 | 0.021 | 0.026 | 0.031 | 0.033 |
| 32 | 1 | 2048 | 128 | 450.43 | 78.3 | 4.288 | 0.143 | 5.267 | 0.031 | 0.032 | 0.034 | 0.161 |
| 32 | 1 | 2048 | 2048 | 733.34 | 78.34 | 4.118 | 0.19 | 5.429 | 0.04 | 0.045 | 0.05 | 0.053 |
| 64 | 1 | 1 | 128 | 4154.81 | 76.71 | 0.042 | 0.013 | 0.21 | 0.012 | 0.018 | 0.028 | 0.041 |
| 64 | 1 | 128 | 128 | 3024.07 | 77.43 | 0.44 | 0.026 | 1.061 | 0.014 | 0.018 | 0.026 | 0.158 |
| 64 | 1 | 128 | 2048 | 1852.06 | 77.96 | 0.535 | 0.027 | 1.231 | 0.03 | 0.041 | 0.048 | 0.053 |
| 64 | 1 | 2048 | 128 | 493.46 | 78.4 | 6.59 | 0.142 | 16.235 | 0.046 | 0.049 | 0.055 | 0.767 |
| 64 | 1 | 2048 | 2048 | 755.65 | 78.4 | 39.105 | 0.142 | 116.285 | 0.047 | 0.049 | 0.051 | 0.207 |
### llama2-13b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 1 | 1 | 128 | 57.49 | 74.84 | 0.018 | 0.018 | 0.019 | 0.017 | 0.017 | 0.017 | 0.017 |
| 1 | 1 | 128 | 128 | 56.58 | 74.84 | 0.04 | 0.039 | 0.04 | 0.017 | 0.017 | 0.017 | 0.018 |
| 1 | 1 | 128 | 2048 | 55.29 | 74.84 | 0.04 | 0.04 | 0.04 | 0.018 | 0.018 | 0.018 | 0.019 |
| 1 | 1 | 2048 | 128 | 48.99 | 75.09 | 0.242 | 0.242 | 0.243 | 0.019 | 0.019 | 0.019 | 0.019 |
| 1 | 1 | 2048 | 2048 | 52.12 | 75.09 | 0.243 | 0.24 | 0.244 | 0.019 | 0.019 | 0.019 | 0.02 |
| 16 | 1 | 1 | 128 | 869.45 | 74.87 | 0.036 | 0.019 | 0.053 | 0.018 | 0.019 | 0.019 | 0.02 |
| 16 | 1 | 128 | 128 | 757.3 | 75.09 | 0.252 | 0.041 | 0.272 | 0.019 | 0.02 | 0.02 | 0.021 |
| 16 | 1 | 128 | 2048 | 605.88 | 75.09 | 0.253 | 0.041 | 0.275 | 0.026 | 0.03 | 0.033 | 0.034 |
| 16 | 1 | 2048 | 128 | 257.92 | 76.96 | 3.442 | 0.245 | 3.668 | 0.033 | 0.034 | 0.035 | 0.035 |
| 16 | 1 | 2048 | 2048 | 366.67 | 76.99 | 3.122 | 0.249 | 3.671 | 0.04 | 0.044 | 0.047 | 0.047 |
| 32 | 1 | 1 | 128 | 1667.5 | 74.9 | 0.034 | 0.021 | 0.057 | 0.019 | 0.02 | 0.021 | 0.023 |
| 32 | 1 | 128 | 128 | 1301.27 | 75.37 | 0.461 | 0.04 | 0.497 | 0.021 | 0.022 | 0.023 | 0.025 |
| 32 | 1 | 128 | 2048 | 860.14 | 75.84 | 0.833 | 0.041 | 1.151 | 0.034 | 0.042 | 0.047 | 0.048 |
| 32 | 1 | 2048 | 128 | 291.54 | 77.02 | 5.315 | 0.245 | 13.483 | 0.046 | 0.047 | 0.049 | 0.51 |
| 32 | 1 | 2048 | 2048 | 389.64 | 77.02 | 38.725 | 0.245 | 108.104 | 0.047 | 0.047 | 0.049 | 0.05 |
| 64 | 1 | 1 | 128 | 3049.16 | 74.96 | 0.044 | 0.025 | 0.073 | 0.02 | 0.022 | 0.026 | 0.029 |
| 64 | 1 | 128 | 128 | 2033.22 | 75.87 | 0.703 | 0.046 | 0.951 | 0.024 | 0.026 | 0.029 | 0.032 |
| 64 | 1 | 128 | 2048 | 998.86 | 76.9 | 7.805 | 0.042 | 60.1 | 0.045 | 0.047 | 0.05 | 0.063 |
| 64 | 1 | 2048 | 128 | 286.32 | 76.99 | 19.69 | 0.245 | 32.394 | 0.047 | 0.048 | 0.05 | 0.27 |
| 64 | 1 | 2048 | 2048 | 387.86 | 77.09 | 190.453 | 0.245 | 307.331 | 0.047 | 0.048 | 0.049 | 0.05 |
### internlm-20b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 2 | 1 | 128 | 61.14 | 73.55 | 0.018 | 0.017 | 0.019 | 0.016 | 0.016 | 0.016 | 0.018 |
| 1 | 2 | 128 | 128 | 60.03 | 73.55 | 0.042 | 0.041 | 0.043 | 0.016 | 0.016 | 0.016 | 0.017 |
| 1 | 2 | 128 | 2048 | 58.26 | 73.55 | 0.042 | 0.042 | 0.043 | 0.017 | 0.017 | 0.018 | 0.018 |
| 1 | 2 | 2048 | 128 | 51.93 | 73.68 | 0.217 | 0.216 | 0.217 | 0.018 | 0.018 | 0.018 | 0.018 |
| 1 | 2 | 2048 | 2048 | 56.36 | 73.68 | 0.217 | 0.217 | 0.217 | 0.018 | 0.018 | 0.018 | 0.018 |
| 16 | 2 | 1 | 128 | 903.01 | 73.65 | 0.034 | 0.018 | 0.051 | 0.017 | 0.018 | 0.019 | 0.02 |
| 16 | 2 | 128 | 128 | 794.13 | 73.74 | 0.227 | 0.043 | 0.248 | 0.018 | 0.019 | 0.02 | 0.021 |
| 16 | 2 | 128 | 2048 | 669.87 | 73.74 | 0.227 | 0.043 | 0.25 | 0.024 | 0.027 | 0.029 | 0.03 |
| 16 | 2 | 2048 | 128 | 288.60 | 75.60 | 3.09 | 0.247 | 4.485 | 0.029 | 0.03 | 0.031 | 0.032 |
| 16 | 2 | 2048 | 2048 | 441.46 | 75.61 | 3.172 | 0.219 | 4.442 | 0.035 | 0.037 | 0.04 | 0.041 |
| 32 | 2 | 1 | 128 | 1673.64 | 73.71 | 0.037 | 0.02 | 0.066 | 0.019 | 0.02 | 0.021 | 0.023 |
| 32 | 2 | 128 | 128 | 1347.57 | 73.90 | 0.351 | 0.043 | 0.436 | 0.02 | 0.021 | 0.023 | 0.025 |
| 32 | 2 | 128 | 2048 | 1025.62 | 73.90 | 0.391 | 0.042 | 0.441 | 0.031 | 0.037 | 0.041 | 0.043 |
| 32 | 2 | 2048 | 128 | 352.45 | 75.74 | 6.062 | 0.218 | 6.3 | 0.042 | 0.043 | 0.045 | 0.046 |
| 32 | 2 | 2048 | 2048 | 514.60 | 75.77 | 10.36 | 0.222 | 70.328 | 0.049 | 0.05 | 0.051 | 0.053 |
| 64 | 2 | 1 | 128 | 2954.34 | 73.82 | 0.05 | 0.029 | 0.074 | 0.021 | 0.023 | 0.026 | 0.03 |
| 64 | 2 | 128 | 128 | 2122.92 | 74.24 | 0.591 | 0.047 | 0.808 | 0.024 | 0.026 | 0.029 | 0.032 |
| 64 | 2 | 128 | 2048 | 1276.61 | 75.18 | 2.529 | 0.049 | 41.212 | 0.042 | 0.048 | 0.052 | 0.055 |
| 64 | 2 | 2048 | 128 | 350.82 | 75.88 | 12.382 | 0.219 | 20.986 | 0.05 | 0.051 | 0.054 | 0.249 |
| 64 | 2 | 2048 | 2048 | 512.37 | 76.26 | 111.149 | 0.221 | 211.531 | 0.05 | 0.051 | 0.052 | 0.055 |
### llama2-70b
| batch | tp | prompt_tokens | output_tokens | throughput(out tok/s) | mem(GB) | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | 50%(s) | 75%(s) | 95%(s) | 99%(s) |
| ----- | --- | ------------- | ------------- | --------------------- | ------- | ----------- | ----------- | ----------- | ------ | ------ | ------ | ------ |
| 1 | 4 | 1 | 128 | 33.94 | 73.72 | 0.031 | 0.03 | 0.031 | 0.029 | 0.029 | 0.029 | 0.03 |
| 1 | 4 | 128 | 128 | 33.63 | 73.72 | 0.074 | 0.073 | 0.074 | 0.029 | 0.029 | 0.029 | 0.03 |
| 1 | 4 | 128 | 2048 | 32.38 | 73.72 | 0.074 | 0.074 | 0.075 | 0.031 | 0.031 | 0.031 | 0.031 |
| 1 | 4 | 2048 | 128 | 28.32 | 73.78 | 0.402 | 0.401 | 0.403 | 0.031 | 0.031 | 0.031 | 0.051 |
| 1 | 4 | 2048 | 2048 | 31.9 | 73.78 | 0.405 | 0.402 | 0.407 | 0.031 | 0.031 | 0.031 | 0.031 |
| 16 | 4 | 1 | 128 | 468.52 | 73.72 | 0.071 | 0.034 | 0.939 | 0.03 | 0.031 | 0.032 | 0.251 |
| 16 | 4 | 128 | 128 | 439.77 | 73.81 | 0.437 | 0.08 | 0.687 | 0.03 | 0.031 | 0.032 | 0.207 |
| 16 | 4 | 128 | 2048 | 482.99 | 73.81 | 0.403 | 0.079 | 0.44 | 0.033 | 0.033 | 0.035 | 0.036 |
| 16 | 4 | 2048 | 128 | 189.34 | 73.98 | 5.776 | 0.437 | 7.612 | 0.035 | 0.036 | 0.036 | 0.037 |
| 16 | 4 | 2048 | 2048 | 399.42 | 73.98 | 5.773 | 0.411 | 6.844 | 0.036 | 0.037 | 0.038 | 0.041 |
| 32 | 4 | 1 | 128 | 906.03 | 73.75 | 0.098 | 0.043 | 0.253 | 0.032 | 0.033 | 0.035 | 0.178 |
| 32 | 4 | 128 | 128 | 746.36 | 73.91 | 0.749 | 0.078 | 1.026 | 0.032 | 0.033 | 0.035 | 0.438 |
| 32 | 4 | 128 | 2048 | 853.56 | 73.91 | 0.732 | 0.076 | 1.129 | 0.036 | 0.038 | 0.041 | 0.158 |
| 32 | 4 | 2048 | 128 | 232.6 | 73.99 | 11.834 | 0.408 | 13.321 | 0.04 | 0.041 | 0.043 | 0.248 |
| 32 | 4 | 2048 | 2048 | 636.23 | 73.99 | 11.711 | 0.409 | 12.689 | 0.043 | 0.045 | 0.048 | 0.179 |
| 64 | 4 | 1 | 128 | 1425.79 | 73.81 | 0.213 | 0.046 | 1.264 | 0.037 | 0.039 | 0.044 | 0.329 |
| 64 | 4 | 128 | 128 | 1159.84 | 73.96 | 1.292 | 0.107 | 2.676 | 0.037 | 0.04 | 0.045 | 0.378 |
| 64 | 4 | 128 | 2048 | 1391.8 | 73.95 | 1.173 | 0.135 | 1.623 | 0.043 | 0.047 | 0.052 | 0.251 |
| 64 | 4 | 2048 | 128 | 270.47 | 74.02 | 17.402 | 0.452 | 24.164 | 0.05 | 0.052 | 0.057 | 0.345 |
| 64 | 4 | 2048 | 2048 | 930.46 | 74.01 | 21.29 | 0.423 | 24.498 | 0.055 | 0.059 | 0.065 | 0.299 |
## Request Throughput Benchmark
FTL: **F**irst **T**oken **L**atency
| model | batch | tp | num_prompts | RPS | RPM | FTL(ave)(s) | FTL(min)(s) | FTL(max)(s) | throughput(out tok/s) | throughput(total tok/s) |
| ------------ | ----- | --- | ----------- | ------ | ------- | ----------- | ----------- | ----------- | --------------------- | ----------------------- |
| llama2-7b | 64 | 1 | 3000 | 10.275 | 616.477 | 0.092 | 0.036 | 1.145 | 2562.435 | 5283.547 |
| | 128 | 1 | 3000 | 12.611 | 756.677 | 0.205 | 0.056 | 2.241 | 3210.281 | 6619.357 |
| llama2-13b | 64 | 1 | 3000 | 6.337 | 380.244 | 0.159 | 0.051 | 2.048 | 1474.786 | 3039.398 |
| | 128 | 1 | 3000 | 7.588 | 455.273 | 0.412 | 0.085 | 4.445 | 1765.788 | 3639.128 |
| internlm-20b | 64 | 2 | 3000 | 7.842 | 470.516 | 0.166 | 0.059 | 2.461 | 1564.696 | 3311.16 |
| | 128 | 2 | 3000 | 9.776 | 586.568 | 0.34 | 0.079 | 5.808 | 1950.627 | 4127.855 |
| llama2-70b | 64 | 4 | 3000 | 4.285 | 257.08 | 0.301 | 0.083 | 4.689 | 1000.376 | 2062.7 |
| | 128 | 4 | 3000 | 5.833 | 349.996 | 0.633 | 0.107 | 8.431 | 1361.939 | 2808.216 |
| | 256 | 4 | 3000 | 6.568 | 394.108 | 1.49 | 0.171 | 19.52 | 1533.592 | 3162.15 |