Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
45b6ef65
Unverified
Commit
45b6ef65
authored
Mar 27, 2024
by
Roger Wang
Committed by
GitHub
Mar 27, 2024
Browse files
feat(benchmarks): Add Prefix Caching Benchmark to Serving Benchmark (#3277)
parent
19569314
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
897 additions
and
155 deletions
+897
-155
.buildkite/run-benchmarks.sh
.buildkite/run-benchmarks.sh
+3
-2
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+121
-99
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+252
-51
benchmarks/sonnet.txt
benchmarks/sonnet.txt
+518
-0
pyproject.toml
pyproject.toml
+1
-1
tests/kernels/test_prefix_prefill.py
tests/kernels/test_prefix_prefill.py
+2
-2
No files found.
.buildkite/run-benchmarks.sh
View file @
45b6ef65
...
...
@@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout
600 bash
-c
'until curl localhost:8000/v1/models; do sleep 1; done'
||
exit
1
python3 benchmarks/benchmark_serving.py
\
--backend
openai
\
--dataset
./ShareGPT_V3_unfiltered_cleaned_split.json
\
--backend
vllm
\
--dataset-name
sharegpt
\
--dataset-path
./ShareGPT_V3_unfiltered_cleaned_split.json
\
--model
meta-llama/Llama-2-7b-chat-hf
\
--num-prompts
20
\
--endpoint
/v1/completions
\
...
...
benchmarks/backend_request_func.py
View file @
45b6ef65
import
json
import
os
import
sys
import
time
from
dataclasses
import
dataclass
from
typing
import
Optional
import
traceback
from
dataclasses
import
dataclass
,
field
from
typing
import
List
,
Optional
import
aiohttp
from
tqdm.asyncio
import
tqdm
...
...
@@ -26,8 +28,11 @@ class RequestFuncOutput:
generated_text
:
str
=
""
success
:
bool
=
False
latency
:
float
=
0
ttft
:
float
=
0
ttft
:
float
=
0
# Time to first token
itl
:
List
[
float
]
=
field
(
default_factory
=
list
)
# List of inter-token latencies
prompt_len
:
int
=
0
error
:
str
=
""
async
def
async_request_tgi
(
...
...
@@ -55,71 +60,38 @@ async def async_request_tgi(
ttft
=
0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
response
:
if
response
.
status
==
200
:
async
for
data
in
response
.
content
.
iter_any
():
if
ttft
==
0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
output
.
latency
=
time
.
perf_counter
()
-
st
body
=
remove_prefix
(
data
.
decode
(
"utf-8"
),
"data:"
)
output
.
generated_text
=
json
.
loads
(
body
)[
"generated_text"
]
output
.
success
=
True
else
:
output
.
success
=
False
except
(
aiohttp
.
ClientOSError
,
aiohttp
.
ServerDisconnectedError
):
output
.
success
=
False
if
pbar
:
pbar
.
update
(
1
)
return
output
async
def
async_request_vllm
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
"generate"
)
async
for
chunk
in
response
.
content
:
chunk
=
chunk
.
strip
()
if
not
chunk
:
continue
async
with
aiohttp
.
ClientSession
(
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
payload
=
{
"prompt"
:
request_func_input
.
prompt
,
"n"
:
1
,
"best_of"
:
request_func_input
.
best_of
,
"use_beam_search"
:
request_func_input
.
use_beam_search
,
"temperature"
:
0.0
if
request_func_input
.
use_beam_search
else
1.0
,
"top_p"
:
1.0
,
"max_tokens"
:
request_func_input
.
output_len
,
"ignore_eos"
:
True
,
"stream"
:
True
,
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
chunk
=
remove_prefix
(
chunk
.
decode
(
"utf-8"
),
"data:"
)
ttft
=
0
st
=
time
.
perf_counter
()
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
response
:
if
response
.
status
==
200
:
async
for
data
in
response
.
content
.
iter_any
():
data
=
json
.
loads
(
chunk
)
timestamp
=
time
.
perf_counter
()
# First token
if
ttft
==
0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
output
.
latency
=
time
.
perf_counter
()
-
st
# When streaming, '\0' is appended to the end of response.
body
=
data
.
decode
(
"utf-8"
).
strip
(
"
\0
"
)
output
.
generated_text
=
json
.
loads
(
body
)[
"text"
][
0
][
len
(
request_func_input
.
prompt
):]
output
.
success
=
True
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
else
:
output
.
success
=
False
except
(
aiohttp
.
ClientOSError
,
aiohttp
.
ServerDisconnectedError
):
most_recent_timestamp
=
timestamp
output
.
latency
=
most_recent_timestamp
-
st
output
.
success
=
True
output
.
generated_text
=
data
[
"generated_text"
]
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
...
...
@@ -146,26 +118,45 @@ async def async_request_trt_llm(
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
ttft
=
0
ttft
=
0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
resp
:
if
resp
.
status
==
200
:
async
for
data
in
resp
.
content
.
iter_any
():
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk
in
response
.
content
:
chunk
=
chunk
.
strip
()
if
not
chunk
:
continue
chunk
=
remove_prefix
(
chunk
.
decode
(
"utf-8"
),
"data:"
)
data
=
json
.
loads
(
chunk
)
timestamp
=
time
.
perf_counter
()
# First token
if
ttft
==
0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
output
.
latency
=
time
.
perf_counter
()
-
st
body
=
remove_prefix
(
data
.
decode
(
"utf-8"
),
"data:"
)
output
.
generated_text
=
json
.
loads
(
body
)[
"text_output"
]
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
most_recent_timestamp
=
timestamp
output
.
latency
=
most_recent_timestamp
-
st
output
.
generated_text
=
json
.
loads
(
data
)[
"text_output"
]
output
.
success
=
True
else
:
output
.
error
=
response
.
reason
output
.
success
=
False
except
(
aiohttp
.
ClientOSError
,
aiohttp
.
ServerDisconnectedError
)
:
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
...
...
@@ -181,35 +172,35 @@ async def async_request_deepspeed_mii(
assert
not
request_func_input
.
use_beam_search
payload
=
{
"prompts"
:
request_func_input
.
prompt
,
"max_new_tokens"
:
request_func_input
.
output_len
,
"ignore_eos"
:
True
,
"do_sample"
:
True
,
"temperature"
:
0.01
,
# deepspeed-mii does not accept 0.0 temperature.
"prompt"
:
request_func_input
.
prompt
,
"max_tokens"
:
request_func_input
.
output_len
,
"temperature"
:
0.01
,
# deepspeed-mii does not accept 0.0 temp.
"top_p"
:
1.0
,
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
# DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
#
NOTE:
DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# https://github.com/microsoft/DeepSpeed-MII/pull/311
#
See
https://github.com/microsoft/DeepSpeed-MII/pull/311
output
.
ttft
=
0
st
=
time
.
perf_counter
()
try
:
async
with
session
.
post
(
url
=
request_func_input
.
api_url
,
json
=
payload
)
as
resp
:
if
resp
.
status
==
200
:
parsed_resp
=
await
resp
.
json
()
json
=
payload
)
as
resp
onse
:
if
resp
onse
.
status
==
200
:
parsed_resp
=
await
resp
onse
.
json
()
output
.
latency
=
time
.
perf_counter
()
-
st
output
.
generated_text
=
parsed_resp
[
0
][
"generated_
text"
]
output
.
generated_text
=
parsed_resp
[
"
text"
]
[
0
]
output
.
success
=
True
else
:
output
.
error
=
response
.
reason
output
.
success
=
False
except
(
aiohttp
.
ClientOSError
,
aiohttp
.
ServerDisconnectedError
)
:
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
...
...
@@ -221,7 +212,9 @@ async def async_request_openai_completions(
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
"v1/completions"
)
assert
api_url
.
endswith
(
"v1/completions"
),
"OpenAI Completions API URL must end with 'v1/completions'."
async
with
aiohttp
.
ClientSession
(
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
assert
not
request_func_input
.
use_beam_search
...
...
@@ -243,15 +236,12 @@ async def async_request_openai_completions(
generated_text
=
""
ttft
=
0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk
in
response
.
content
:
if
ttft
==
0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
chunk
=
chunk
.
strip
()
if
not
chunk
:
continue
...
...
@@ -260,16 +250,33 @@ async def async_request_openai_completions(
if
chunk
==
"[DONE]"
:
latency
=
time
.
perf_counter
()
-
st
else
:
body
=
json
.
loads
(
chunk
)
generated_text
+=
body
[
"choices"
][
0
][
"text"
]
data
=
json
.
loads
(
chunk
)
if
data
[
"choices"
][
0
][
"text"
]:
timestamp
=
time
.
perf_counter
()
# First token
if
ttft
==
0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
# Decoding phase
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# do not want to include as inter-token-latency
elif
data
.
get
(
"usage"
,
None
)
is
None
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
most_recent_timestamp
=
timestamp
generated_text
+=
data
[
"choices"
][
0
][
"text"
]
output
.
generated_text
=
generated_text
output
.
success
=
True
output
.
latency
=
latency
else
:
output
.
success
=
False
except
(
aiohttp
.
ClientOSError
,
aiohttp
.
ServerDisconnectedError
):
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
...
...
@@ -283,7 +290,7 @@ async def async_request_openai_chat_completions(
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
"v1/chat/completions"
),
"OpenAI Chat API URL must end with 'v1/chat/completions'."
),
"OpenAI Chat
Completions
API URL must end with 'v1/chat/completions'."
async
with
aiohttp
.
ClientSession
(
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
assert
not
request_func_input
.
use_beam_search
...
...
@@ -301,7 +308,7 @@ async def async_request_openai_chat_completions(
}
headers
=
{
"Content-Type"
:
"application/json"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
output
=
RequestFuncOutput
()
...
...
@@ -310,15 +317,12 @@ async def async_request_openai_chat_completions(
generated_text
=
""
ttft
=
0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk
in
response
.
content
:
if
ttft
==
0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
chunk
=
chunk
.
strip
()
if
not
chunk
:
continue
...
...
@@ -327,18 +331,35 @@ async def async_request_openai_chat_completions(
if
chunk
==
"[DONE]"
:
latency
=
time
.
perf_counter
()
-
st
else
:
body
=
json
.
loads
(
chunk
)
if
"content"
in
body
[
"choices"
][
0
][
"delta"
]:
generated_text
+=
body
[
"choices"
][
0
][
"delta"
][
timestamp
=
time
.
perf_counter
()
data
=
json
.
loads
(
chunk
)
if
"content"
in
data
[
"choices"
][
0
][
"delta"
]:
# First token
if
ttft
==
0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
generated_text
+=
data
[
"choices"
][
0
][
"delta"
][
"content"
]
most_recent_timestamp
=
timestamp
output
.
generated_text
=
generated_text
output
.
success
=
True
output
.
latency
=
latency
else
:
output
.
error
=
response
.
reason
output
.
success
=
False
except
(
aiohttp
.
ClientOSError
,
aiohttp
.
ServerDisconnectedError
)
:
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
...
...
@@ -355,7 +376,8 @@ def remove_prefix(text: str, prefix: str) -> str:
ASYNC_REQUEST_FUNCS
=
{
"tgi"
:
async_request_tgi
,
"vllm"
:
async_request_vllm
,
"vllm"
:
async_request_openai_completions
,
"lmdeploy"
:
async_request_openai_completions
,
"deepspeed-mii"
:
async_request_deepspeed_mii
,
"openai"
:
async_request_openai_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
...
...
benchmarks/benchmark_serving.py
View file @
45b6ef65
"""Benchmark online serving throughput.
On the server side, run one of the following commands:
(
vLLM
backend)
python -m vllm.entrypoints.api_server
\
vLLM
OpenAI API server
python -m vllm.entrypoints.
openai.
api_server
\
--model <your_model> --swap-space 16
\
--disable-log-requests
...
...
@@ -12,14 +12,19 @@ On the server side, run one of the following commands:
On the client side, run:
python benchmarks/benchmark_serving.py
\
--backend <backend>
\
--model <your_model> --dataset <target_dataset>
\
--request-rate <request_rate>
--model <your_model>
\
--dataset-name sharegpt
\
--dataset-path <path to dataset>
\
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
"""
import
argparse
import
asyncio
import
json
import
os
import
random
import
time
import
warnings
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
typing
import
AsyncGenerator
,
List
,
Tuple
...
...
@@ -49,7 +54,7 @@ class BenchmarkMetrics:
p99_tpot_ms
:
float
def
sample_requests
(
def
sample_
sharegpt_
requests
(
dataset_path
:
str
,
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
...
...
@@ -97,6 +102,73 @@ def sample_requests(
return
sampled_requests
def sample_sonnet_requests(
    dataset_path: str,
    num_requests: int,
    input_len: int,
    output_len: int,
    prefix_len: int,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int]]:
    """Build prompts from a poem file that all share a common line prefix.

    Every prompt is the fixed instruction sentence followed by the first
    ``num_prefix_lines`` lines of the file (identical for every request)
    and then lines drawn at random from the file.  The line counts are
    derived from the requested token budgets using the average number of
    tokens per line, so the resulting prompt lengths are approximate.

    Args:
        dataset_path: Path to a text file read line by line.
        num_requests: Number of requests to generate.
        input_len: Target number of prompt tokens per request.
        output_len: Output length recorded unchanged in each returned tuple.
        prefix_len: Target number of prompt tokens shared by all requests.
        tokenizer: Callable returning an object with ``input_ids`` and
            providing ``apply_chat_template``.

    Returns:
        One ``(prompt, prompt_formatted, prompt_len, output_len)`` tuple per
        request, where ``prompt_formatted`` is the chat-templated prompt and
        ``prompt_len`` is the token count of ``prompt_formatted``.

    Raises:
        AssertionError: If ``input_len <= prefix_len`` or if either length
            does not exceed the token count of the templated instruction.
        ValueError: If the dataset file has no lines, or (from
            ``random.sample``) if more random lines are requested than the
            file contains.
    """
    assert input_len > prefix_len, "input_len must be greater than prefix_len."

    # Load the dataset.  The encoding is pinned so the result does not depend
    # on the platform's locale.
    with open(dataset_path, encoding="utf-8") as f:
        poem_lines = f.readlines()
    if not poem_lines:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError(f"Dataset file '{dataset_path}' contains no lines.")

    # Tokenize the poem lines.
    poem_token_ids = tokenizer(poem_lines).input_ids
    average_poem_len = sum(
        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)

    # Base prefix for all requests.
    base_prompt = "Pick as many lines as you can from these poem lines:\n"
    base_message = [{
        "role": "user",
        "content": base_prompt,
    }]
    base_prompt_formatted = tokenizer.apply_chat_template(
        base_message, add_generation_prompt=True, tokenize=False)
    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)

    assert (
        input_len > base_prompt_offset
    ), f"Please set 'args.input-len' higher than {base_prompt_offset}."
    num_input_lines = round(
        (input_len - base_prompt_offset) / average_poem_len)

    # First approximately `prefix_len` number of tokens in the
    # prompt are fixed poem lines.
    assert (
        prefix_len > base_prompt_offset
    ), f"Please set 'args.prefix-len' higher than {base_prompt_offset}."
    num_prefix_lines = round(
        (prefix_len - base_prompt_offset) / average_poem_len)
    prefix_lines = poem_lines[:num_prefix_lines]

    # Loop-invariant: how many lines are drawn at random for each request.
    num_random_lines = num_input_lines - num_prefix_lines

    # Sample the rest of lines per request.
    sampled_requests: List[Tuple[str, str, int, int]] = []
    for _ in range(num_requests):
        sampled_lines = "".join(
            prefix_lines + random.sample(poem_lines, num_random_lines))

        prompt = f"{base_prompt}{sampled_lines}"
        message = [
            {
                "role": "user",
                "content": prompt,
            },
        ]
        prompt_formatted = tokenizer.apply_chat_template(
            message, add_generation_prompt=True, tokenize=False)
        prompt_len = len(tokenizer(prompt_formatted).input_ids)
        sampled_requests.append(
            (prompt, prompt_formatted, prompt_len, output_len))

    return sampled_requests
async
def
get_request
(
input_requests
:
List
[
Tuple
[
str
,
int
,
int
]],
request_rate
:
float
,
...
...
@@ -119,37 +191,42 @@ def calculate_metrics(
outputs
:
List
[
RequestFuncOutput
],
dur_s
:
float
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
BenchmarkMetrics
:
tot
al_output
=
0
)
->
Tuple
[
BenchmarkMetrics
,
List
[
int
]]
:
actu
al_output
_lens
=
[]
total_input
=
0
completed
=
0
per_token_latencie
s
=
[]
tpot
s
=
[]
ttfts
=
[]
for
i
in
range
(
len
(
outputs
)):
if
outputs
[
i
].
success
:
output_len
=
len
(
tokenizer
.
encode
(
outputs
[
i
].
generated_text
))
tot
al_output
+=
output_len
output_len
=
len
(
tokenizer
(
outputs
[
i
].
generated_text
)
.
input_ids
)
actu
al_output
_lens
.
append
(
output_len
)
total_input
+=
input_requests
[
i
][
1
]
per_token_latencies
.
append
(
outputs
[
i
].
latency
/
output_len
)
if
output_len
>
1
:
tpots
.
append
(
(
outputs
[
i
].
latency
-
outputs
[
i
].
ttft
)
/
(
output_len
-
1
))
ttfts
.
append
(
outputs
[
i
].
ttft
)
completed
+=
1
else
:
actual_output_lens
.
append
(
0
)
metrics
=
BenchmarkMetrics
(
completed
=
completed
,
total_input
=
total_input
,
total_output
=
tot
al_output
,
total_output
=
sum
(
actu
al_output
_lens
)
,
request_throughput
=
completed
/
dur_s
,
input_throughput
=
total_input
/
dur_s
,
output_throughput
=
total_output
/
dur_s
,
mean_ttft_ms
=
np
.
mean
(
ttfts
)
*
1000
,
median_ttft_ms
=
np
.
median
(
ttfts
)
*
1000
,
p99_ttft_ms
=
np
.
percentile
(
ttfts
,
99
)
*
1000
,
mean_tpot_ms
=
np
.
mean
(
per_token_latencies
)
*
1000
,
median_tpot_ms
=
np
.
median
(
per_token_latencies
)
*
1000
,
p99_tpot_ms
=
np
.
percentile
(
per_token_latencies
,
99
)
*
1000
,
output_throughput
=
sum
(
actual_output_lens
)
/
dur_s
,
mean_ttft_ms
=
np
.
mean
(
ttfts
or
0
)
*
1000
,
# ttfts is empty if streaming is not supported by backend
median_ttft_ms
=
np
.
median
(
ttfts
or
0
)
*
1000
,
p99_ttft_ms
=
np
.
percentile
(
ttfts
or
0
,
99
)
*
1000
,
mean_tpot_ms
=
np
.
mean
(
tpots
)
*
1000
,
median_tpot_ms
=
np
.
median
(
tpots
)
*
1000
,
p99_tpot_ms
=
np
.
percentile
(
tpots
,
99
)
*
1000
,
)
return
metrics
return
metrics
,
actual_output_lens
async
def
benchmark
(
...
...
@@ -189,40 +266,53 @@ async def benchmark(
asyncio
.
create_task
(
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)))
outputs
=
await
asyncio
.
gather
(
*
tasks
)
outputs
:
List
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
if
not
disable_tqdm
:
pbar
.
close
()
benchmark_duration
=
time
.
perf_counter
()
-
benchmark_start_time
metrics
=
calculate_metrics
(
metrics
,
actual_output_lens
=
calculate_metrics
(
input_requests
=
input_requests
,
outputs
=
outputs
,
dur_s
=
benchmark_duration
,
tokenizer
=
tokenizer
,
)
print
(
f
"Successful requests:
{
metrics
.
completed
}
"
)
print
(
f
"Benchmark duration:
{
benchmark_duration
:
2
f
}
s"
)
print
(
f
"Total input tokens:
{
metrics
.
total_input
}
"
)
print
(
f
"Total generated tokens:
{
metrics
.
total_output
}
"
)
print
(
f
"Request throughput:
{
metrics
.
request_throughput
:.
2
f
}
requests/s"
)
print
(
f
"Input token throughput:
{
metrics
.
input_throughput
:.
2
f
}
tokens/s"
)
print
(
f
"Output token throughput:
{
metrics
.
output_throughput
:.
2
f
}
tokens/s"
)
print
(
f
"Mean TTFT:
{
metrics
.
mean_ttft_ms
:.
2
f
}
ms"
)
print
(
f
"Median TTFT:
{
metrics
.
median_ttft_ms
:.
2
f
}
ms"
)
print
(
f
"P99 TTFT:
{
metrics
.
p99_ttft_ms
:.
2
f
}
ms"
)
print
(
f
"Mean TPOT:
{
metrics
.
mean_tpot_ms
:.
2
f
}
ms"
)
print
(
f
"Median TPOT:
{
metrics
.
median_tpot_ms
:.
2
f
}
ms"
)
print
(
f
"P99 TPOT:
{
metrics
.
p99_tpot_ms
:.
2
f
}
ms"
)
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
print
(
"{:<40} {:<10}"
.
format
(
"Successful requests:"
,
metrics
.
completed
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
benchmark_duration
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
metrics
.
request_throughput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Input token throughput (tok/s):"
,
metrics
.
input_throughput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
))
print
(
"{s:{c}^{n}}"
.
format
(
s
=
'Time to First Token'
,
n
=
50
,
c
=
'-'
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Mean TTFT (ms):"
,
metrics
.
mean_ttft_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Median TTFT (ms):"
,
metrics
.
median_ttft_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"P99 TTFT (ms):"
,
metrics
.
p99_ttft_ms
))
print
(
"{s:{c}^{n}}"
.
format
(
s
=
'Time per Output Token (excl. 1st token)'
,
n
=
50
,
c
=
'-'
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Mean TPOT (ms):"
,
metrics
.
mean_tpot_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Median TPOT (ms):"
,
metrics
.
median_tpot_ms
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"P99 TPOT (ms):"
,
metrics
.
p99_tpot_ms
))
print
(
"="
*
50
)
result
=
{
"duration"
:
benchmark_duration
,
"completed"
:
metrics
.
completed
,
"total_input_tokens"
:
metrics
.
total_input
,
"total_output_tokens"
:
metrics
.
total_output
,
"request_
in
throughput"
:
metrics
.
request_throughput
,
"request_throughput"
:
metrics
.
request_throughput
,
"input_throughput"
:
metrics
.
input_throughput
,
"output_throughput"
:
metrics
.
output_throughput
,
"mean_ttft_ms"
:
metrics
.
mean_ttft_ms
,
...
...
@@ -230,7 +320,13 @@ async def benchmark(
"p99_ttft_ms"
:
metrics
.
p99_ttft_ms
,
"mean_tpot_ms"
:
metrics
.
mean_tpot_ms
,
"median_tpot_ms"
:
metrics
.
median_tpot_ms
,
"p99_tpot_ms"
:
metrics
.
p99_tpot_ms
"p99_tpot_ms"
:
metrics
.
p99_tpot_ms
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"output_lens"
:
actual_output_lens
,
"ttfts"
:
[
output
.
ttft
for
output
in
outputs
],
"itls"
:
[
output
.
itl
for
output
in
outputs
],
"generated_texts"
:
[
output
.
generated_text
for
output
in
outputs
],
"errors"
:
[
output
.
error
for
output
in
outputs
],
}
return
result
...
...
@@ -251,7 +347,58 @@ def main(args: argparse.Namespace):
tokenizer
=
get_tokenizer
(
tokenizer_id
,
trust_remote_code
=
args
.
trust_remote_code
)
input_requests
=
sample_requests
(
args
.
dataset
,
args
.
num_prompts
,
tokenizer
)
if
args
.
dataset
is
not
None
:
warnings
.
warn
(
"The '--dataset' argument will be deprecated in the next "
"release. Please use '--dataset-name' and "
"'--dataset-path' in the future runs."
,
stacklevel
=
2
)
input_requests
=
sample_sharegpt_requests
(
dataset_path
=
args
.
dataset
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
)
elif
args
.
dataset_name
==
"sharegpt"
:
input_requests
=
sample_sharegpt_requests
(
dataset_path
=
args
.
dataset_path
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
)
elif
args
.
dataset_name
==
"sonnet"
:
# Do not format the prompt, pass to message directly
if
args
.
backend
==
"openai-chat"
:
input_requests
=
sample_sonnet_requests
(
dataset_path
=
args
.
dataset_path
,
num_requests
=
args
.
num_prompts
,
input_len
=
args
.
input_len
,
output_len
=
args
.
output_len
,
prefix_len
=
args
.
prefix_len
,
tokenizer
=
tokenizer
,
)
input_requests
=
[(
prompt
,
prompt_len
,
output_len
)
for
prompt
,
prompt_formatted
,
prompt_len
,
output_len
in
input_requests
]
else
:
assert
(
tokenizer
.
chat_template
or
tokenizer
.
default_chat_template
),
"Tokenizer/model must have chat template for sonnet dataset."
input_requests
=
sample_sonnet_requests
(
dataset_path
=
args
.
dataset_path
,
num_requests
=
args
.
num_prompts
,
input_len
=
args
.
input_len
,
output_len
=
args
.
output_len
,
prefix_len
=
args
.
prefix_len
,
tokenizer
=
tokenizer
,
)
input_requests
=
[(
prompt_formatted
,
prompt_len
,
output_len
)
for
prompt
,
prompt_formatted
,
prompt_len
,
output_len
in
input_requests
]
else
:
raise
ValueError
(
f
"Unknown dataset:
{
args
.
dataset_name
}
"
)
benchmark_result
=
asyncio
.
run
(
benchmark
(
...
...
@@ -274,13 +421,23 @@ def main(args: argparse.Namespace):
current_dt
=
datetime
.
now
().
strftime
(
"%Y%m%d-%H%M%S"
)
result_json
[
"date"
]
=
current_dt
result_json
[
"backend"
]
=
backend
result_json
[
"version"
]
=
args
.
version
result_json
[
"model_id"
]
=
model_id
result_json
[
"tokenizer_id"
]
=
tokenizer_id
result_json
[
"best_of"
]
=
args
.
best_of
result_json
[
"use_beam_search"
]
=
args
.
use_beam_search
result_json
[
"num_prompts"
]
=
args
.
num_prompts
# Metadata
if
args
.
metadata
:
for
item
in
args
.
metadata
:
if
"="
in
item
:
kvstring
=
item
.
split
(
"="
)
result_json
[
kvstring
[
0
].
strip
()]
=
kvstring
[
1
].
strip
()
else
:
raise
ValueError
(
"Invalid metadata format. Please use KEY=VALUE format."
)
# Traffic
result_json
[
"request_rate"
]
=
(
args
.
request_rate
if
args
.
request_rate
<
float
(
"inf"
)
else
"inf"
)
...
...
@@ -290,9 +447,9 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id
=
model_id
.
split
(
"/"
)[
-
1
]
file_name
=
(
f
"
{
backend
}
-
{
args
.
request_rate
}
qps-
{
base_model_id
}
-
{
current_dt
}
.json"
)
file_name
=
f
"
{
backend
}
-
{
args
.
request_rate
}
qps-
{
base_model_id
}
-
{
current_dt
}
.json"
#noqa
if
args
.
result_dir
:
file_name
=
os
.
path
.
join
(
args
.
result_dir
,
file_name
)
with
open
(
file_name
,
"w"
)
as
outfile
:
json
.
dump
(
result_json
,
outfile
)
...
...
@@ -306,12 +463,6 @@ if __name__ == "__main__":
default
=
"vllm"
,
choices
=
list
(
ASYNC_REQUEST_FUNCS
.
keys
()),
)
parser
.
add_argument
(
"--version"
,
type
=
str
,
default
=
"N/A"
,
help
=
"Version of the serving backend/engine."
,
)
parser
.
add_argument
(
"--base-url"
,
type
=
str
,
...
...
@@ -323,12 +474,26 @@ if __name__ == "__main__":
parser
.
add_argument
(
"--endpoint"
,
type
=
str
,
default
=
"/
generate
"
,
default
=
"/
v1/completions
"
,
help
=
"API endpoint."
,
)
parser
.
add_argument
(
"--dataset"
,
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
default
=
None
,
help
=
"Path to the ShareGPT dataset, will be deprecated in the "
"next release."
,
)
parser
.
add_argument
(
"--dataset-name"
,
type
=
str
,
default
=
"sharegpt"
,
choices
=
[
"sharegpt"
,
"sonnet"
],
help
=
"Name of the dataset to benchmark on."
,
)
parser
.
add_argument
(
"--dataset-path"
,
type
=
str
,
required
=
Tru
e
,
default
=
Non
e
,
help
=
"Path to the dataset."
)
parser
.
add_argument
(
"--model"
,
...
...
@@ -356,6 +521,27 @@ if __name__ == "__main__":
default
=
1000
,
help
=
"Number of prompts to process."
,
)
parser
.
add_argument
(
"--sonnet-input-len"
,
type
=
int
,
default
=
550
,
help
=
"Number of input tokens per request, used only for sonnet dataset."
,
)
parser
.
add_argument
(
"--sonnet-output-len"
,
type
=
int
,
default
=
150
,
help
=
"Number of output tokens per request, used only for sonnet dataset."
,
)
parser
.
add_argument
(
"--sonnet-prefix-len"
,
type
=
int
,
default
=
200
,
help
=
"Number of prefix tokens per request, used only for sonnet dataset."
,
)
parser
.
add_argument
(
"--request-rate"
,
type
=
float
,
...
...
@@ -381,6 +567,21 @@ if __name__ == "__main__":
action
=
"store_true"
,
help
=
"Specify to save benchmark results to a json file"
,
)
parser
.
add_argument
(
"--metadata"
,
metavar
=
"KEY=VALUE"
,
nargs
=
"*"
,
help
=
"Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
"for metadata of this run to be saved in the result JSON file "
"for record keeping purposes."
,
)
parser
.
add_argument
(
"--result-dir"
,
type
=
str
,
default
=
None
,
help
=
"Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory."
,
)
args
=
parser
.
parse_args
()
main
(
args
)
benchmarks/sonnet.txt
0 → 100644
View file @
45b6ef65
This diff is collapsed.
Click to expand it.
pyproject.toml
View file @
45b6ef65
...
...
@@ -50,7 +50,7 @@ exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
[tool.codespell]
ignore-words-list
=
"dout, te, indicies"
skip
=
"./tests/prompts"
skip
=
"./tests/prompts
,./benchmarks/sonnet.txt
"
[tool.isort]
use_parentheses
=
true
...
...
tests/kernels/test_prefix_prefill.py
View file @
45b6ef65
...
...
@@ -36,8 +36,8 @@ def test_contexted_kv_attention(
torch
.
cuda
.
manual_seed
(
0
)
torch
.
set_default_device
(
device
)
# Need this, otherwise when we capture the graph the process
for GPU 1 would
# run on both GPU0 and GPU1 and things would hang
# Need this, otherwise when we capture the graph the process
#
for GPU 1 would
run on both GPU0 and GPU1 and things would hang
#
# see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523
torch
.
cuda
.
set_device
(
device
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment