xdb4_94051 / vllm · Commits · a4211a4d

Unverified commit a4211a4d, authored Feb 12, 2024 by Roger Wang; committed by GitHub on Feb 12, 2024.

Serving Benchmark Refactoring (#2433)

Parent: 56383649

Showing 4 changed files with 553 additions and 125 deletions (+553 −125).
Files changed:

  .buildkite/run-benchmarks.sh        +10   −4
  benchmarks/backend_request_func.py  +284  −0
  benchmarks/benchmark_serving.py     +258  −120
  benchmarks/launch_tgi_server.sh     +1    −1
.buildkite/run-benchmarks.sh

@@ -6,15 +6,16 @@ set -o pipefail

 # cd into parent directory of this file
 cd "$(dirname "${BASH_SOURCE[0]}")/.."

-(wget && curl) || (apt-get update && apt-get install -y wget curl)
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

-# run benchmarks and upload the result to buildkite
+# run python-based benchmarks and upload the result to buildkite
 python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?

 python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?

+# run server-based benchmarks and upload the result to buildkite
 python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

@@ -22,11 +23,14 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1

 python3 benchmarks/benchmark_serving.py \
+    --backend openai \
     --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
     --model meta-llama/Llama-2-7b-chat-hf \
     --num-prompts 20 \
     --endpoint /v1/completions \
-    --tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt
+    --tokenizer meta-llama/Llama-2-7b-chat-hf \
+    --save-result \
+    2>&1 | tee benchmark_serving.txt

 bench_serving_exit_code=$?
 kill $server_pid

@@ -44,7 +48,7 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
 echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
-tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines
+tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines

 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

@@ -61,3 +65,5 @@ fi
 if [ $bench_serving_exit_code -ne 0 ]; then
     exit $bench_serving_exit_code
 fi
+
+/workspace/buildkite-agent artifact upload openai-*.json
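Note (not part of the diff): with the new --save-result flag, benchmark_serving.py writes a JSON result file, and the last added line above uploads it as a Buildkite artifact. A minimal sketch of reading such a file back, assuming the key names from the result_json dict added in benchmarks/benchmark_serving.py below and an example file name following its {backend}-{request_rate}qps-{model}-{timestamp}.json pattern:

import json

# Example file name only; the real name depends on request rate, model, and timestamp.
with open("openai-infqps-Llama-2-7b-chat-hf-20240212-120000.json") as f:
    result = json.load(f)

# Keys mirror result_json in benchmark_serving.py.
print(result["backend"], result["model_id"], result["num_prompts"])
print(f"Mean TTFT: {result['mean_ttft_ms']:.2f} ms")
print(f"Output token throughput: {result['output_throughput']:.2f} tokens/s")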
benchmarks/backend_request_func.py (new file, 0 → 100644)

import json
import os
import time
from dataclasses import dataclass
from typing import Optional

import aiohttp
from tqdm.asyncio import tqdm

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    best_of: int = 1
    use_beam_search: bool = False


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0
    ttft: float = 0
    prompt_len: int = 0


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    body = data.decode("utf-8").lstrip("data:")
                    output.generated_text = json.loads(body)["generated_text"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_vllm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "prompt": request_func_input.prompt,
            "n": 1,
            "best_of": request_func_input.best_of,
            "use_beam_search": request_func_input.use_beam_search,
            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    # When streaming, '\0' is appended to the end of the response.
                    body = data.decode("utf-8").strip("\0")
                    output.generated_text = json.loads(
                        body)["text"][0][len(request_func_input.prompt):]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload) as resp:
                if resp.status == 200:
                    async for data in resp.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
                    output.latency = time.perf_counter() - st

                    body = data.decode("utf-8").lstrip("data:")
                    output.generated_text = json.loads(body)["text_output"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        assert not request_func_input.use_beam_search

        payload = {
            "prompts": request_func_input.prompt,
            "max_new_tokens": request_func_input.output_len,
            "ignore_eos": True,
            "do_sample": True,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temperature.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder.
        # https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as resp:
                if resp.status == 200:
                    parsed_resp = await resp.json()
                    output.latency = time.perf_counter() - st
                    output.generated_text = parsed_resp[0]["generated_text"]
                    output.success = True
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("v1/completions")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk in response.content:
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        chunk = chunk.strip()
                        if not chunk:
                            continue

                        chunk = chunk.decode("utf-8").lstrip("data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            body = json.loads(chunk)
                            generated_text += body["choices"][0]["text"]

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.success = False
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            output.success = False

    if pbar:
        pbar.update(1)
    return output


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_vllm,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "tensorrt-llm": async_request_trt_llm,
}
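Note (not part of the diff): the ASYNC_REQUEST_FUNCS registry above is what benchmark_serving.py dispatches on, but each entry can also be awaited directly. A minimal sketch, assuming a vLLM server is already listening on localhost:8000 and that a rough prompt_len token count is acceptable:

import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def demo() -> None:
    # Pick the same coroutine benchmark_serving.py would pick for --backend vllm.
    request_func = ASYNC_REQUEST_FUNCS["vllm"]
    request_func_input = RequestFuncInput(
        prompt="Hello, my name is",
        api_url="http://localhost:8000/generate",  # async_request_vllm asserts this suffix
        prompt_len=5,  # rough token count, for illustration only
        output_len=32,
        model="meta-llama/Llama-2-7b-chat-hf",
    )
    output = await request_func(request_func_input=request_func_input)
    print(output.success, output.ttft, output.latency)
    print(output.generated_text)


if __name__ == "__main__":
    asyncio.run(demo())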
benchmarks/benchmark_serving.py

@@ -20,16 +20,36 @@ import asyncio
 import json
 import random
 import time
 from dataclasses import dataclass
+from datetime import datetime
 from typing import AsyncGenerator, List, Tuple

-import aiohttp
 import numpy as np
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

 from vllm.transformers_utils.tokenizer import get_tokenizer

-# (prompt len, output len, latency)
-REQUEST_LATENCY: List[Tuple[int, int, float]] = []
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    p99_ttft_ms: float
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    p99_tpot_ms: float


 def sample_requests(

@@ -46,6 +66,11 @@ def sample_requests(
     dataset = [(data["conversations"][0]["value"],
                 data["conversations"][1]["value"]) for data in dataset]

+    # some of these will be filtered out, so sample more than we need
+    sampled_indices = random.sample(range(len(dataset)),
+                                    int(num_requests * 1.2))
+    dataset = [dataset[i] for i in sampled_indices]
+
     # Tokenize the prompts and completions.
     prompts = [prompt for prompt, _ in dataset]
     prompt_token_ids = tokenizer(prompts).input_ids

@@ -92,158 +117,271 @@ async def get_request(
         await asyncio.sleep(interval)


-async def send_request(backend: str, model: str, api_url: str, prompt: str,
-                       prompt_len: int, output_len: int, best_of: int,
-                       use_beam_search: bool, pbar: tqdm) -> None:
-    request_start_time = time.perf_counter()
-
-    headers = {"User-Agent": "Benchmark Client"}
-    if backend == "vllm":
-        pload = {
-            "prompt": prompt,
-            "n": 1,
-            "best_of": best_of,
-            "use_beam_search": use_beam_search,
-            "temperature": 0.0 if use_beam_search else 1.0,
-            "top_p": 1.0,
-            "max_tokens": output_len,
-            "ignore_eos": True,
-            "stream": False,
-        }
-        if model is not None:
-            pload["model"] = model
-    elif backend == "tgi":
-        assert not use_beam_search
-        params = {
-            "best_of": best_of,
-            "max_new_tokens": output_len,
-            "do_sample": True,
-        }
-        pload = {
-            "inputs": prompt,
-            "parameters": params,
-        }
-    else:
-        raise ValueError(f"Unknown backend: {backend}")
-
-    timeout = aiohttp.ClientTimeout(total=3 * 3600)
-    async with aiohttp.ClientSession(timeout=timeout) as session:
-        while True:
-            async with session.post(api_url, headers=headers,
-                                    json=pload) as response:
-                chunks = []
-                async for chunk, _ in response.content.iter_chunks():
-                    chunks.append(chunk)
-            output = b"".join(chunks).decode("utf-8")
-            output = json.loads(output)
-
-            # Re-send the request if it failed.
-            if "error" not in output:
-                break
-
-    request_end_time = time.perf_counter()
-    request_latency = request_end_time - request_start_time
-    REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
-    pbar.update(1)
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> BenchmarkMetrics:
+    total_output = 0
+    total_input = 0
+    completed = 0
+    per_token_latencies = []
+    ttfts = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_len = len(tokenizer.encode(outputs[i].generated_text))
+            total_output += output_len
+            total_input += input_requests[i][1]
+            per_token_latencies.append(outputs[i].latency / output_len)
+            ttfts.append(outputs[i].ttft)
+            completed += 1
+
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=total_output,
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=total_output / dur_s,
+        mean_ttft_ms=np.mean(ttfts) * 1000,
+        median_ttft_ms=np.median(ttfts) * 1000,
+        p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
+        mean_tpot_ms=np.mean(per_token_latencies) * 1000,
+        median_tpot_ms=np.median(per_token_latencies) * 1000,
+        p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
+    )
+    return metrics


 async def benchmark(
     backend: str,
-    model: str,
     api_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     best_of: int,
     use_beam_search: bool,
     request_rate: float,
-) -> None:
-    tasks: List[asyncio.Task] = []
-    pbar = tqdm(total=len(input_requests))
+    disable_tqdm: bool,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    print(f"Traffic request rate: {request_rate}")
+
+    benchmark_start_time = time.perf_counter()
+    tasks = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(
-            send_request(backend, model, api_url, prompt, prompt_len,
-                         output_len, best_of, use_beam_search, pbar))
-        tasks.append(task)
-    await asyncio.gather(*tasks)
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            best_of=best_of,
+            use_beam_search=use_beam_search,
+        )
+        tasks.append(
+            asyncio.create_task(
+                request_func(request_func_input=request_func_input,
+                             pbar=pbar)))
+    outputs = await asyncio.gather(*tasks)
+
+    if not disable_tqdm:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+    )
+
+    print(f"Successful requests: {metrics.completed}")
+    print(f"Benchmark duration: {benchmark_duration:2f} s")
+    print(f"Total input tokens: {metrics.total_input}")
+    print(f"Total generated tokens: {metrics.total_output}")
+    print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
+    print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
+    print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
+    print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
+    print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
+    print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
+    print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
+    print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
+    print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_inthroughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms
+    }
+    return result


 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
     np.random.seed(args.seed)

-    api_url = f"{args.protocol}://{args.host}:{args.port}{args.endpoint}"
-    tokenizer = get_tokenizer(args.tokenizer,
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+
+    tokenizer = get_tokenizer(tokenizer_id,
                               trust_remote_code=args.trust_remote_code)
+
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

-    benchmark_start_time = time.perf_counter()
-    asyncio.run(
-        benchmark(args.backend, args.model, api_url, input_requests,
-                  args.best_of, args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.perf_counter()
-    benchmark_time = benchmark_end_time - benchmark_start_time
-    print(f"Total time: {benchmark_time:.2f} s")
-    print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
-
-    # Compute the latency statistics.
-    avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
-    print(f"Average latency: {avg_latency:.2f} s")
-    avg_per_token_latency = np.mean([
-        latency / (prompt_len + output_len)
-        for prompt_len, output_len, latency in REQUEST_LATENCY
-    ])
-    print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-    avg_per_output_token_latency = np.mean(
-        [latency / output_len for _, output_len, latency in REQUEST_LATENCY])
-    print("Average latency per output token: "
-          f"{avg_per_output_token_latency:.2f} s")
+    benchmark_result = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            best_of=args.best_of,
+            use_beam_search=args.use_beam_search,
+            request_rate=args.request_rate,
+            disable_tqdm=args.disable_tqdm,
+        ))
+
+    # Save config and results to json
+    if args.save_result:
+        result_json = {}
+
+        # Setup
+        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json["date"] = current_dt
+        result_json["backend"] = backend
+        result_json["version"] = args.version
+        result_json["model_id"] = model_id
+        result_json["tokenizer_id"] = tokenizer_id
+        result_json["best_of"] = args.best_of
+        result_json["use_beam_search"] = args.use_beam_search
+        result_json["num_prompts"] = args.num_prompts
+
+        # Traffic
+        result_json["request_rate"] = (args.request_rate if args.request_rate
+                                       < float("inf") else "inf")
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
+        # Save to file
+        base_model_id = model_id.split("/")[-1]
+        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        with open(file_name, "w") as outfile:
+            json.dump(result_json, outfile)


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Benchmark the online serving throughput.")
-    parser.add_argument("--backend",
-                        type=str,
-                        default="vllm",
-                        choices=["vllm", "tgi"])
-    parser.add_argument("--protocol",
-                        type=str,
-                        default="http",
-                        choices=["http", "https"])
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--version",
+        type=str,
+        default="N/A",
+        help="Version of the serving backend/engine.",
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--endpoint", type=str, default="/generate")
-    parser.add_argument("--model", type=str, default=None)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/generate",
+        help="API endpoint.",
+    )
     parser.add_argument("--dataset",
                         type=str,
                         required=True,
                         help="Path to the dataset.")
-    parser.add_argument("--tokenizer",
-                        type=str,
-                        required=True,
-                        help="Name or path of the tokenizer.")
-    parser.add_argument("--best-of",
-                        type=int,
-                        default=1,
-                        help="Generates `best_of` sequences per prompt and "
-                        "returns the best one.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer, if not using the default model tokenizer.",
+    )
+    parser.add_argument(
+        "--best-of",
+        type=int,
+        default=1,
+        help="Generates `best_of` sequences per prompt and "
+        "returns the best one.",
+    )
     parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--num-prompts",
-                        type=int,
-                        default=1000,
-                        help="Number of prompts to process.")
-    parser.add_argument("--request-rate",
-                        type=float,
-                        default=float("inf"),
-                        help="Number of requests per second. If this is inf, "
-                        "then all the requests are sent at time 0. "
-                        "Otherwise, we use Poisson process to synthesize "
-                        "the request arrival times.")
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize "
+        "the request arrival times.",
+    )
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disbale tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-result",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )

     args = parser.parse_args()
     main(args)
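Note (not part of the diff): the body of get_request is collapsed in the hunk above; only its trailing await asyncio.sleep(interval) is visible. Per the --request-rate help text, arrival times are synthesized from a Poisson process, i.e. exponentially distributed inter-arrival gaps. An illustrative sketch of that pacing pattern (not the exact code from the file):

import asyncio
import random
from typing import AsyncGenerator, List, Tuple


async def paced_requests(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
    # With rate r requests/s, Poisson arrivals have exponential gaps of mean 1/r.
    for request in input_requests:
        yield request
        if request_rate == float("inf"):
            continue  # inf rate: issue every request immediately
        await asyncio.sleep(random.expovariate(request_rate))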
benchmarks/launch_tgi_server.sh

@@ -6,7 +6,7 @@ TOKENS=$2

 docker run --gpus all --shm-size 1g -p $PORT:80 \
            -v $PWD/data:/data \
-           ghcr.io/huggingface/text-generation-inference:0.8 \
+           ghcr.io/huggingface/text-generation-inference:1.4.0 \
            --model-id $MODEL \
            --sharded false \
            --max-input-length 1024 \