norm / vllm · Commits · a4211a4d

Unverified commit a4211a4d, authored Feb 12, 2024 by Roger Wang, committed via GitHub on Feb 12, 2024

Serving Benchmark Refactoring (#2433)

parent 56383649
Showing 4 changed files with 553 additions and 125 deletions (+553 / -125)
.buildkite/run-benchmarks.sh        +10   -4
benchmarks/backend_request_func.py  +284  -0
benchmarks/benchmark_serving.py     +258  -120
benchmarks/launch_tgi_server.sh     +1    -1
.buildkite/run-benchmarks.sh

@@ -6,15 +6,16 @@ set -o pipefail
 # cd into parent directory of this file
 cd "$(dirname "${BASH_SOURCE[0]}")/.."
 
-(wget && curl) || (apt-get update && apt-get install -y wget curl)
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
-# run benchmarks and upload the result to buildkite
+# run python-based benchmarks and upload the result to buildkite
 python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
 python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
+# run server-based benchmarks and upload the result to buildkite
 python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
@@ -22,11 +23,14 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 python3 benchmarks/benchmark_serving.py \
+    --backend openai \
     --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
     --model meta-llama/Llama-2-7b-chat-hf \
     --num-prompts 20 \
     --endpoint /v1/completions \
-    --tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt
+    --tokenizer meta-llama/Llama-2-7b-chat-hf \
+    --save-result \
+    2>&1 | tee benchmark_serving.txt
 bench_serving_exit_code=$?
 kill $server_pid
 
@@ -44,7 +48,7 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
 echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
-tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines
+tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
 
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -61,3 +65,5 @@ fi
 if [ $bench_serving_exit_code -ne 0 ]; then
     exit $bench_serving_exit_code
 fi
+
+/workspace/buildkite-agent artifact upload openai-*.json
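
With --save-result, each CI run now also writes a JSON result file (uploaded above as openai-*.json) containing the run configuration merged with the measured metrics. A minimal sketch, not part of this commit, of how such a file could be inspected; the file name in the comment is illustrative, and the key names are the ones written by benchmarks/benchmark_serving.py below (including "request_inthroughput", spelled exactly as in that file):

# inspect_result.py -- illustrative sketch, not part of this commit.
import json
import sys


def summarize(path: str) -> None:
    with open(path) as f:
        result = json.load(f)
    # Key names mirror the result dict built by benchmark_serving.py below.
    print(f"backend:            {result['backend']} ({result['version']})")
    print(f"completed requests: {result['completed']}")
    print(f"request throughput: {result['request_inthroughput']:.2f} req/s")
    print(f"output throughput:  {result['output_throughput']:.2f} tokens/s")
    print(f"mean TTFT:          {result['mean_ttft_ms']:.2f} ms")
    print(f"p99 TPOT:           {result['p99_tpot_ms']:.2f} ms")


if __name__ == "__main__":
    # e.g. python3 inspect_result.py openai-20qps-Llama-2-7b-chat-hf-20240212-000000.json
    summarize(sys.argv[1])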
benchmarks/backend_request_func.py  0 → 100644 (new file)

import json
import os
import time
from dataclasses import dataclass
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    best_of: int = 1
    use_beam_search: bool = False


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0
    ttft: float = 0
    prompt_len: int = 0


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    body = data.decode("utf-8").lstrip("data:")
                    output.generated_text = json.loads(body)["generated_text"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_vllm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "prompt": request_func_input.prompt,
            "n": 1,
            "best_of": request_func_input.best_of,
            "use_beam_search": request_func_input.use_beam_search,
            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    # When streaming, '\0' is appended to the end of the response.
                    body = data.decode("utf-8").strip("\0")
                    output.generated_text = json.loads(
                        body)["text"][0][len(request_func_input.prompt):]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
        ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as resp:
                if resp.status == 200:
                    async for data in resp.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    body = data.decode("utf-8").lstrip("data:")
                    output.generated_text = json.loads(body)["text_output"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        assert not request_func_input.use_beam_search

        payload = {
            "prompts": request_func_input.prompt,
            "max_new_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "do_sample": True,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temperature.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder.
        # https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as resp:
                if resp.status == 200:
                    parsed_resp = await resp.json()
                    output.latency = time.perf_counter() - st
                    output.generated_text = parsed_resp[0]["generated_text"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("v1/completions")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk in response.content:
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        chunk = chunk.strip()
                        if not chunk:
                            continue

                        chunk = chunk.decode("utf-8").lstrip("data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            body = json.loads(chunk)
                            generated_text += body["choices"][0]["text"]

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_vllm,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "tensorrt-llm": async_request_trt_llm,
}
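
The new module can also be exercised on its own. A minimal usage sketch, not part of this commit, assuming an OpenAI-compatible vLLM server is already listening on localhost:8000; the prompt, token counts, and model name below are illustrative:

# smoke_test_request_func.py -- illustrative sketch, not part of this commit.
import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def main() -> None:
    # "openai" selects async_request_openai_completions; the API URL must end
    # with "v1/completions" to satisfy the assertion in that function.
    request_func = ASYNC_REQUEST_FUNCS["openai"]
    request_func_input = RequestFuncInput(
        prompt="Hello, my name is",  # illustrative prompt
        api_url="http://localhost:8000/v1/completions",
        prompt_len=5,  # approximate token count, for illustration only
        output_len=16,
        model="meta-llama/Llama-2-7b-chat-hf",
    )
    output = await request_func(request_func_input=request_func_input)
    print(f"success={output.success} ttft={output.ttft:.3f}s "
          f"latency={output.latency:.3f}s text={output.generated_text!r}")


if __name__ == "__main__":
    asyncio.run(main())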
benchmarks/benchmark_serving.py

@@ -20,16 +20,36 @@ import asyncio
 import json
 import random
 import time
+from dataclasses import dataclass
+from datetime import datetime
 from typing import AsyncGenerator, List, Tuple
 
-import aiohttp
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-# (prompt len, output len, latency)
-REQUEST_LATENCY: List[Tuple[int, int, float]] = []
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    p99_ttft_ms: float
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    p99_tpot_ms: float
 
 
 def sample_requests(
@@ -46,6 +66,11 @@ def sample_requests(
     dataset = [(data["conversations"][0]["value"],
                 data["conversations"][1]["value"]) for data in dataset]
 
+    # some of these will be filtered out, so sample more than we need
+    sampled_indices = random.sample(range(len(dataset)),
+                                    int(num_requests * 1.2))
+    dataset = [dataset[i] for i in sampled_indices]
+
     # Tokenize the prompts and completions.
     prompts = [prompt for prompt, _ in dataset]
     prompt_token_ids = tokenizer(prompts).input_ids
@@ -92,80 +117,125 @@ async def get_request(
         await asyncio.sleep(interval)
 
 
-async def send_request(backend: str, model: str, api_url: str, prompt: str,
-                       prompt_len: int, output_len: int, best_of: int,
-                       use_beam_search: bool, pbar: tqdm) -> None:
-    request_start_time = time.perf_counter()
-
-    headers = {"User-Agent": "Benchmark Client"}
-    if backend == "vllm":
-        pload = {
-            "prompt": prompt,
-            "n": 1,
-            "best_of": best_of,
-            "use_beam_search": use_beam_search,
-            "temperature": 0.0 if use_beam_search else 1.0,
-            "top_p": 1.0,
-            "max_tokens": output_len,
-            "ignore_eos": True,
-            "stream": False,
-        }
-        if model is not None:
-            pload["model"] = model
-    elif backend == "tgi":
-        assert not use_beam_search
-        params = {
-            "best_of": best_of,
-            "max_new_tokens": output_len,
-            "do_sample": True,
-        }
-        pload = {
-            "inputs": prompt,
-            "parameters": params,
-        }
-    else:
-        raise ValueError(f"Unknown backend: {backend}")
-
-    timeout = aiohttp.ClientTimeout(total=3 * 3600)
-    async with aiohttp.ClientSession(timeout=timeout) as session:
-        while True:
-            async with session.post(api_url, headers=headers,
-                                    json=pload) as response:
-                chunks = []
-                async for chunk, _ in response.content.iter_chunks():
-                    chunks.append(chunk)
-            output = b"".join(chunks).decode("utf-8")
-            output = json.loads(output)
-
-            # Re-send the request if it failed.
-            if "error" not in output:
-                break
-
-    request_end_time = time.perf_counter()
-    request_latency = request_end_time - request_start_time
-    REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
-    pbar.update(1)
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> BenchmarkMetrics:
+    total_output = 0
+    total_input = 0
+    completed = 0
+    per_token_latencies = []
+    ttfts = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_len = len(tokenizer.encode(outputs[i].generated_text))
+            total_output += output_len
+            total_input += input_requests[i][1]
+            per_token_latencies.append(outputs[i].latency / output_len)
+            ttfts.append(outputs[i].ttft)
+            completed += 1
+
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=total_output,
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=total_output / dur_s,
+        mean_ttft_ms=np.mean(ttfts) * 1000,
+        median_ttft_ms=np.median(ttfts) * 1000,
+        p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
+        mean_tpot_ms=np.mean(per_token_latencies) * 1000,
+        median_tpot_ms=np.median(per_token_latencies) * 1000,
+        p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
+    )
+
+    return metrics
 
 
 async def benchmark(
     backend: str,
-    model: str,
     api_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     best_of: int,
     use_beam_search: bool,
     request_rate: float,
-) -> None:
-    tasks: List[asyncio.Task] = []
-    pbar = tqdm(total=len(input_requests))
+    disable_tqdm: bool,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    print(f"Traffic request rate: {request_rate}")
+
+    benchmark_start_time = time.perf_counter()
+    tasks = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(
-            send_request(backend, model, api_url, prompt, prompt_len,
-                         output_len, best_of, use_beam_search, pbar))
-        tasks.append(task)
-    await asyncio.gather(*tasks)
-    pbar.close()
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            best_of=best_of,
+            use_beam_search=use_beam_search,
+        )
+        tasks.append(
+            asyncio.create_task(
+                request_func(request_func_input=request_func_input,
+                             pbar=pbar)))
+    outputs = await asyncio.gather(*tasks)
+
+    if not disable_tqdm:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+    )
+
+    print(f"Successful requests: {metrics.completed}")
+    print(f"Benchmark duration: {benchmark_duration:2f} s")
+    print(f"Total input tokens: {metrics.total_input}")
+    print(f"Total generated tokens: {metrics.total_output}")
+    print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
+    print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
+    print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
+    print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
+    print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
+    print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
+    print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
+    print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
+    print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_inthroughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms
+    }
+    return result
 
 
 def main(args: argparse.Namespace):
@@ -173,77 +243,145 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)
     np.random.seed(args.seed)
 
-    api_url = f"{args.protocol}://{args.host}:{args.port}{args.endpoint}"
-    tokenizer = get_tokenizer(args.tokenizer,
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+
+    tokenizer = get_tokenizer(tokenizer_id,
                               trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
 
-    benchmark_start_time = time.perf_counter()
-    asyncio.run(
-        benchmark(args.backend, args.model, api_url, input_requests,
-                  args.best_of, args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.perf_counter()
-    benchmark_time = benchmark_end_time - benchmark_start_time
-    print(f"Total time: {benchmark_time:.2f} s")
-    print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
-
-    # Compute the latency statistics.
-    avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
-    print(f"Average latency: {avg_latency:.2f} s")
-    avg_per_token_latency = np.mean([
-        latency / (prompt_len + output_len)
-        for prompt_len, output_len, latency in REQUEST_LATENCY
-    ])
-    print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-    avg_per_output_token_latency = np.mean(
-        [latency / output_len for _, output_len, latency in REQUEST_LATENCY])
-    print("Average latency per output token: "
-          f"{avg_per_output_token_latency:.2f} s")
+    benchmark_result = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            best_of=args.best_of,
+            use_beam_search=args.use_beam_search,
+            request_rate=args.request_rate,
+            disable_tqdm=args.disable_tqdm,
+        ))
+
+    # Save config and results to json
+    if args.save_result:
+        result_json = {}
+
+        # Setup
+        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json["date"] = current_dt
+        result_json["backend"] = backend
+        result_json["version"] = args.version
+        result_json["model_id"] = model_id
+        result_json["tokenizer_id"] = tokenizer_id
+        result_json["best_of"] = args.best_of
+        result_json["use_beam_search"] = args.use_beam_search
+        result_json["num_prompts"] = args.num_prompts
+
+        # Traffic
+        result_json["request_rate"] = (args.request_rate if args.request_rate
+                                       < float("inf") else "inf")
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
+        # Save to file
+        base_model_id = model_id.split("/")[-1]
+        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        with open(file_name, "w") as outfile:
+            json.dump(result_json, outfile)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Benchmark the online serving throughput.")
-    parser.add_argument("--backend",
-                        type=str,
-                        default="vllm",
-                        choices=["vllm", "tgi"])
-    parser.add_argument("--protocol",
-                        type=str,
-                        default="http",
-                        choices=["http", "https"])
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        default="N/A",
+        help="Version of the serving backend/engine.",
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--endpoint", type=str, default="/generate")
-    parser.add_argument("--model", type=str, default=None)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/generate",
+        help="API endpoint.",
+    )
     parser.add_argument("--dataset",
                         type=str,
                         required=True,
                         help="Path to the dataset.")
-    parser.add_argument("--tokenizer",
-                        type=str,
-                        required=True,
-                        help="Name or path of the tokenizer.")
-    parser.add_argument("--best-of",
-                        type=int,
-                        default=1,
-                        help="Generates `best_of` sequences per prompt and "
-                        "returns the best one.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer, if not using the default model tokenizer.",
+    )
+    parser.add_argument(
+        "--best-of",
+        type=int,
+        default=1,
+        help="Generates `best_of` sequences per prompt and "
+        "returns the best one.",
+    )
     parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--num-prompts",
-                        type=int,
-                        default=1000,
-                        help="Number of prompts to process.")
-    parser.add_argument("--request-rate",
-                        type=float,
-                        default=float("inf"),
-                        help="Number of requests per second. If this is inf, "
-                        "then all the requests are sent at time 0. "
-                        "Otherwise, we use Poisson process to synthesize "
-                        "the request arrival times.")
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize "
+        "the request arrival times.",
+    )
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disbale tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-result",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+
     args = parser.parse_args()
     main(args)
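
To make the new metric definitions concrete: TTFT is read directly from each RequestFuncOutput, and TPOT is a request's latency divided by its tokenized output length, aggregated over successful requests. Below is a minimal sketch, not part of this commit, that feeds calculate_metrics() two synthetic outputs; it assumes vllm is installed and that it is run from the benchmarks/ directory, and the whitespace "tokenizer" is an illustrative stand-in for a PreTrainedTokenizerBase (only .encode() is used):

# metrics_example.py -- illustrative sketch, not part of this commit.
from backend_request_func import RequestFuncOutput
from benchmark_serving import calculate_metrics


class _WhitespaceTokenizer:
    """Illustrative stand-in for PreTrainedTokenizerBase (only .encode is used)."""

    def encode(self, text: str):
        return text.split()


# Two synthetic completed requests: (prompt, prompt_len, output_len).
input_requests = [("hello world", 2, 4), ("another prompt", 2, 4)]
outputs = [
    RequestFuncOutput(generated_text="a b c d", success=True, latency=2.0, ttft=0.5),
    RequestFuncOutput(generated_text="e f g h", success=True, latency=4.0, ttft=1.0),
]

metrics = calculate_metrics(input_requests, outputs, dur_s=10.0,
                            tokenizer=_WhitespaceTokenizer())
# 8 output tokens over 10 s -> 0.8 tokens/s output throughput;
# mean TTFT = 750 ms; mean TPOT = mean(2.0/4, 4.0/4) * 1000 = 750 ms.
print(metrics.output_throughput, metrics.mean_ttft_ms, metrics.mean_tpot_ms)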
benchmarks/launch_tgi_server.sh

@@ -6,7 +6,7 @@ TOKENS=$2
 
 docker run --gpus all --shm-size 1g -p $PORT:80 \
            -v $PWD/data:/data \
-           ghcr.io/huggingface/text-generation-inference:0.8 \
+           ghcr.io/huggingface/text-generation-inference:1.4.0 \
            --model-id $MODEL \
            --sharded false  \
            --max-input-length 1024 \