jerrrrry / vllm_test_tools / Commits / 49c10c0c

Commit 49c10c0c, authored Apr 16, 2025 by jerrrrry
Initial commit

Showing 6 changed files with 4544 additions and 0 deletions (+4544, -0)
Changed files:
  062/backend_request_func.py           +440   -0
  062/benchmark_servein_0.6.2.py        +976   -0
  062/benchmark_throughput_0.6.2.py     +732   -0
  072/backend_request_func.py           +485   -0
  072/benchmark_servein_0.7.2.py        +1242  -0
  072/benchmark_throughput_0.7.2.py     +669   -0
062/backend_request_func.py (new file, 0 → 100644)
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import List, Optional, Union

import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    best_of: int = 1
    use_beam_search: bool = False
    logprobs: Optional[int] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    ttft: float = 0.0  # Time to first token
    itl: List[float] = field(
        default_factory=list)  # List of inter-token latencies
    prompt_len: int = 0
    error: str = ""


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
                        chunk = remove_prefix(chunk_bytes, "data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        if request_func_input.ignore_eos:
            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data:")

                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        assert not request_func_input.use_beam_search

        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
        }
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += data["choices"][0]["text"]

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        "chat/completions"
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": request_func_input.model,
            "messages": [
                {
                    "role": "user",
                    "content": content
                },
            ],
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }
        if request_func_input.ignore_eos:
            payload["ignore_eos"] = request_func_input.ignore_eos

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            delta = data["choices"][0]["delta"]
                            if delta.get("content", None):
                                # First token
                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                generated_text += delta["content"]

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

        model_path = snapshot_download(
            model_id=pretrained_model_name_or_path,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

        return model_path
    return pretrained_model_name_or_path


def get_tokenizer(
    pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                         trust_remote_code=trust_remote_code)


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
}
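For readers skimming the diff, here is a minimal sketch of how the request functions above can be driven on their own, outside the serving benchmark. It assumes an OpenAI-compatible server is already running; the URL, model name, prompt, and token counts are placeholders, not values taken from this repository.

# Illustrative sketch only: calls the "vllm" (OpenAI completions) client
# from backend_request_func directly. URL/model/prompt are placeholders.
import asyncio
from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput

async def _demo():
    request_func = ASYNC_REQUEST_FUNCS["vllm"]
    out = await request_func(
        RequestFuncInput(
            prompt="Hello, world!",
            api_url="http://localhost:8000/v1/completions",  # placeholder
            prompt_len=4,       # placeholder prompt token count
            output_len=16,      # requested completion length
            model="your-model-name",  # placeholder
        ))
    # RequestFuncOutput carries success flag, TTFT, total latency and text.
    print(out.success, out.ttft, out.latency, out.generated_text)

if __name__ == "__main__":
    asyncio.run(_demo())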
062/benchmark_servein_0.6.2.py (new file, 0 → 100644)
"""Benchmark online serving throughput.

On the server side, run one of the following commands:
    vLLM OpenAI API server
    vllm serve <your_model> \
        --swap-space 16 \
        --disable-log-requests

    (TGI backend)
    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>

On the client side, run:
    python benchmarks/benchmark_serving.py \
        --backend <backend> \
        --model <your_model> \
        --dataset-name sharegpt \
        --dataset-path <path to dataset> \
        --request-rate <request_rate> \ # By default <request_rate> is inf
        --num-prompts <num_prompts> # By default <num_prompts> is 1000

    when using tgi backend, add
        --endpoint /generate_stream
    to the end of the command above.
"""
import argparse
import asyncio
import base64
import io
import json
import os
import random
import time
import warnings
from dataclasses import dataclass
from datetime import datetime
from typing import (Any, AsyncGenerator, Collection, Dict, List, Optional,
                    Tuple)

import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                  RequestFuncOutput)
from datasets import load_dataset
from PIL.Image import Image
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase

try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
    from backend_request_func import get_tokenizer

try:
    from vllm.utils import FlexibleArgumentParser
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser


@dataclass
class BenchmarkMetrics:
    completed: int
    total_input: int
    total_output: int
    request_throughput: float
    output_throughput: float
    total_token_throughput: float
    mean_ttft_ms: float
    median_ttft_ms: float
    std_ttft_ms: float
    percentiles_ttft_ms: List[Tuple[float, float]]
    mean_tpot_ms: float
    median_tpot_ms: float
    std_tpot_ms: float
    percentiles_tpot_ms: List[Tuple[float, float]]
    mean_itl_ms: float
    median_itl_ms: float
    std_itl_ms: float
    percentiles_itl_ms: List[Tuple[float, float]]
    # E2EL stands for end-to-end latency per request.
    # It is the time taken on the client side from sending
    # a request to receiving a complete response.
    mean_e2el_ms: float
    median_e2el_ms: float
    std_e2el_ms: float
    percentiles_e2el_ms: List[Tuple[float, float]]


def sample_sharegpt_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int, None]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]

    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short
    filtered_dataset: List[Tuple[str, int, int]] = []
    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break

        # Tokenize the prompts and completions.
        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len, None))

    return filtered_dataset


def sample_sonnet_requests(
    dataset_path: str,
    num_requests: int,
    input_len: int,
    output_len: int,
    prefix_len: int,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int, None]]:
    assert (
        input_len > prefix_len
    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."

    # Load the dataset.
    with open(dataset_path) as f:
        poem_lines = f.readlines()

    # Tokenize the poem lines.
    poem_token_ids = tokenizer(poem_lines).input_ids
    average_poem_len = sum(
        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)

    # Base prefix for all requests.
    base_prompt = "Pick as many lines as you can from these poem lines:\n"
    base_message = [{
        "role": "user",
        "content": base_prompt,
    }]
    base_prompt_formatted = tokenizer.apply_chat_template(
        base_message, add_generation_prompt=True, tokenize=False)
    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)

    assert (
        input_len > base_prompt_offset
    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
    num_input_lines = round(
        (input_len - base_prompt_offset) / average_poem_len)

    # First approximately `prefix_len` number of tokens in the
    # prompt are fixed poem lines.
    assert (
        prefix_len > base_prompt_offset
    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."

    num_prefix_lines = round(
        (prefix_len - base_prompt_offset) / average_poem_len)
    prefix_lines = poem_lines[:num_prefix_lines]

    # Sample the rest of lines per request.
    sampled_requests: List[Tuple[str, int, int]] = []
    for _ in range(num_requests):
        sampled_lines = "".join(
            prefix_lines +
            random.sample(poem_lines, num_input_lines - num_prefix_lines))

        prompt = f"{base_prompt}{sampled_lines}"
        message = [
            {
                "role": "user",
                "content": prompt,
            },
        ]
        prompt_formatted = tokenizer.apply_chat_template(
            message, add_generation_prompt=True, tokenize=False)
        prompt_len = len(tokenizer(prompt_formatted).input_ids)
        sampled_requests.append(
            (prompt, prompt_formatted, prompt_len, output_len, None))

    return sampled_requests


def sample_hf_requests(
    dataset_path: str,
    dataset_subset: str,
    dataset_split: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
    dataset = load_dataset(dataset_path,
                           name=dataset_subset,
                           split=dataset_split,
                           streaming=True)
    assert "conversations" in dataset.features, (
        "HF Dataset must have 'conversations' column.")
    filtered_dataset = dataset.shuffle().filter(
        lambda x: len(x["conversations"]) >= 2)
    sampled_requests: List[Tuple[str, int, int,
                                 Dict[str, Collection[str]]]] = []
    for data in filtered_dataset:
        if len(sampled_requests) == num_requests:
            break

        # Tokenize the prompts and completions.
        prompt = data["conversations"][0]["value"]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = data["conversations"][1]["value"]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue

        if "image" in data and isinstance(data["image"], Image):
            image: Image = data["image"]
            image = image.convert("RGB")
            image_data = io.BytesIO()
            image.save(image_data, format='JPEG')
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
            mm_content = {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image_base64}"
                },
            }
        else:
            mm_content = None

        sampled_requests.append((prompt, prompt_len, output_len, mm_content))

    return sampled_requests


def sample_random_requests(
    prefix_len: int,
    input_len: int,
    output_len: int,
    num_prompts: int,
    range_ratio: float,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
    prefix_token_ids = np.random.randint(0,
                                         tokenizer.vocab_size,
                                         size=prefix_len).tolist()

    input_lens = np.random.randint(
        int(input_len * range_ratio),
        input_len + 1,
        size=num_prompts,
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio),
        output_len + 1,
        size=num_prompts,
    )
    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
    input_requests = []
    for i in range(num_prompts):
        prompt = tokenizer.decode(prefix_token_ids +
                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])
        input_requests.append((prompt, int(prefix_len + input_lens[i]),
                               int(output_lens[i]), None))

    return input_requests


async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
    input_requests = iter(input_requests)
    for request in input_requests:
        yield request

        if request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue

        # Sample the request interval from the exponential distribution.
        interval = np.random.exponential(1.0 / request_rate)
        # The next request will be sent after the interval.
        await asyncio.sleep(interval)


def calculate_metrics(
    input_requests: List[Tuple[str, int, int]],
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: List[str],
    selected_percentiles: List[float],
) -> Tuple[BenchmarkMetrics, List[int]]:
    actual_output_lens: List[int] = []
    total_input = 0
    completed = 0
    itls: List[float] = []
    tpots: List[float] = []
    ttfts: List[float] = []
    e2els: List[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            # We use the tokenizer to count the number of output tokens for all
            # serving backends instead of looking at len(outputs[i].itl) since
            # multiple output tokens may be bundled together
            # Note : this may inflate the output token count slightly
            output_len = len(
                tokenizer(outputs[i].generated_text,
                          add_special_tokens=False).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
            if output_len > 1:
                tpots.append(
                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            e2els.append(outputs[i].latency)
            completed += 1
        else:
            actual_output_lens.append(0)

    if completed == 0:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration "
            "on the benchmark arguments.",
            stacklevel=2)
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
        mean_ttft_ms=np.mean(ttfts or 0) *
        1000,  # ttfts is empty if streaming is not supported by backend
        std_ttft_ms=np.std(ttfts or 0) * 1000,
        median_ttft_ms=np.median(ttfts or 0) * 1000,
        percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
                             for p in selected_percentiles],
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        std_tpot_ms=np.std(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
        percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
                             for p in selected_percentiles],
        mean_itl_ms=np.mean(itls or 0) * 1000,
        std_itl_ms=np.std(itls or 0) * 1000,
        median_itl_ms=np.median(itls or 0) * 1000,
        percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
                            for p in selected_percentiles],
        mean_e2el_ms=np.mean(e2els or 0) * 1000,
        std_e2el_ms=np.std(e2els or 0) * 1000,
        median_e2el_ms=np.median(e2els or 0) * 1000,
        percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                             for p in selected_percentiles],
    )

    return metrics, actual_output_lens


async def benchmark(
    backend: str,
    api_url: str,
    base_url: str,
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    logprobs: Optional[int],
    best_of: int,
    use_beam_search: bool,
    request_rate: float,
    disable_tqdm: bool,
    profile: bool,
    selected_percentile_metrics: List[str],
    selected_percentiles: List[str],
    ignore_eos: bool,
):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
    else:
        raise ValueError(f"Unknown backend: {backend}")

    print("Starting initial single prompt test run...")
    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
        input_requests[0])
    if backend != "openai-chat" and test_mm_content is not None:
        # multi-modal benchmark is only available on OpenAI Chat backend.
        raise ValueError(
            "Multi-modal content is only supported on 'openai-chat' backend.")
    test_input = RequestFuncInput(
        model=model_id,
        prompt=test_prompt,
        api_url=api_url,
        prompt_len=test_prompt_len,
        output_len=test_output_len,
        logprobs=logprobs,
        best_of=best_of,
        ignore_eos=ignore_eos,
        use_beam_search=use_beam_search,
        multi_modal_content=test_mm_content,
    )
    test_output = await request_func(request_func_input=test_input)
    if not test_output.success:
        raise ValueError(
            "Initial test run failed - Please make sure benchmark arguments "
            f"are correctly specified. Error: {test_output.error}")
    else:
        print("Initial test run completed. Starting main benchmark run...")

    if profile:
        print("Starting profiler...")
        profile_input = RequestFuncInput(
            model=model_id,
            prompt=test_prompt,
            api_url=base_url + "/start_profile",
            prompt_len=test_prompt_len,
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
            multi_modal_content=test_mm_content,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")

    print(f"Traffic request rate: {request_rate}")

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    benchmark_start_time = time.perf_counter()
    tasks: List[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate):
        prompt, prompt_len, output_len, mm_content = request
        request_func_input = RequestFuncInput(model=model_id,
                                              prompt=prompt,
                                              api_url=api_url,
                                              prompt_len=prompt_len,
                                              output_len=output_len,
                                              logprobs=logprobs,
                                              best_of=best_of,
                                              use_beam_search=use_beam_search,
                                              multi_modal_content=mm_content,
                                              ignore_eos=ignore_eos)
        tasks.append(
            asyncio.create_task(
                request_func(request_func_input=request_func_input,
                             pbar=pbar)))
    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

    if profile:
        print("Stopping profiler...")
        profile_input = RequestFuncInput(
            model=model_id,
            prompt=test_prompt,
            api_url=base_url + "/stop_profile",
            prompt_len=test_prompt_len,
            output_len=test_output_len,
            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler stopped")

    if pbar is not None:
        pbar.close()

    benchmark_duration = time.perf_counter() - benchmark_start_time

    metrics, actual_output_lens = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
        dur_s=benchmark_duration,
        tokenizer=tokenizer,
        selected_percentile_metrics=selected_percentile_metrics,
        selected_percentiles=selected_percentiles,
    )

    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
                                    benchmark_duration))
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
    print("{:<40} {:<10}".format("Total generated tokens:",
                                 metrics.total_output))
    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                    metrics.request_throughput))
    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
                                    metrics.output_throughput))
    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
                                    metrics.total_token_throughput))

    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
        "generated_texts": [output.generated_text for output in outputs],
        "errors": [output.error for output in outputs],
    }

    def process_one_metric(
        # E.g., "ttft"
        metric_attribute_name: str,
        # E.g., "TTFT"
        metric_name: str,
        # E.g., "Time to First Token"
        metric_header: str,
    ):
        # This function prints and adds statistics of the specified
        # metric.
        if metric_attribute_name not in selected_percentile_metrics:
            return
        print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
        print("{:<40} {:<10.2f}".format(
            f"Mean {metric_name} (ms):",
            getattr(metrics, f"mean_{metric_attribute_name}_ms")))
        print("{:<40} {:<10.2f}".format(
            f"Median {metric_name} (ms):",
            getattr(metrics, f"median_{metric_attribute_name}_ms")))
        result[f"mean_{metric_attribute_name}_ms"] = getattr(
            metrics, f"mean_{metric_attribute_name}_ms")
        result[f"median_{metric_attribute_name}_ms"] = getattr(
            metrics, f"median_{metric_attribute_name}_ms")
        result[f"std_{metric_attribute_name}_ms"] = getattr(
            metrics, f"std_{metric_attribute_name}_ms")
        for p, value in getattr(metrics,
                                f"percentiles_{metric_attribute_name}_ms"):
            p_word = str(int(p)) if int(p) == p else str(p)
            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
                                            value))
            result[f"p{p_word}_{metric_attribute_name}_ms"] = value

    process_one_metric("ttft", "TTFT", "Time to First Token")
    process_one_metric("tpot", "TPOT",
                       "Time per Output Token (excl. 1st token)")
    process_one_metric("itl", "ITL", "Inter-token Latency")
    process_one_metric("e2el", "E2EL", "End-to-end Latency")

    print("=" * 50)

    return result


def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    backend = args.backend
    model_id = args.model
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model

    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
        base_url = f"{args.base_url}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

    tokenizer = get_tokenizer(tokenizer_id,
                              trust_remote_code=args.trust_remote_code)

    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next "
            "release. Please use '--dataset-name' and "
            "'--dataset-path' in the future runs.",
            stacklevel=2)
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )

    elif args.dataset_name == "sharegpt":
        input_requests = sample_sharegpt_requests(
            dataset_path=args.dataset_path,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.sharegpt_output_len,
        )

    elif args.dataset_name == "sonnet":
        # Do not format the prompt, pass to message directly
        if args.backend == "openai-chat":
            input_requests = sample_sonnet_requests(
                dataset_path=args.dataset_path,
                num_requests=args.num_prompts,
                input_len=args.sonnet_input_len,
                output_len=args.sonnet_output_len,
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
            )
            input_requests = [(prompt, prompt_len, output_len, None)
                              for prompt, prompt_formatted, prompt_len,
                              output_len, _ in input_requests]
        else:
            assert (
                tokenizer.chat_template or tokenizer.default_chat_template
            ), "Tokenizer/model must have chat template for sonnet dataset."
            input_requests = sample_sonnet_requests(
                dataset_path=args.dataset_path,
                num_requests=args.num_prompts,
                input_len=args.sonnet_input_len,
                output_len=args.sonnet_output_len,
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
            )
            input_requests = [(prompt_formatted, prompt_len, output_len, None)
                              for prompt, prompt_formatted, prompt_len,
                              output_len, _ in input_requests]

    elif args.dataset_name == "hf":
        input_requests = sample_hf_requests(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
            fixed_output_len=args.hf_output_len,
        )

    elif args.dataset_name == "random":
        input_requests = sample_random_requests(
            prefix_len=args.random_prefix_len,
            input_len=args.random_input_len,
            output_len=args.random_output_len,
            num_prompts=args.num_prompts,
            range_ratio=args.random_range_ratio,
            tokenizer=tokenizer,
        )

    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")

    benchmark_result = asyncio.run(
        benchmark(
            backend=backend,
            api_url=api_url,
            base_url=base_url,
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
            logprobs=args.logprobs,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
            disable_tqdm=args.disable_tqdm,
            profile=args.profile,
            ignore_eos=args.ignore_eos,
            selected_percentile_metrics=args.percentile_metrics.split(","),
            selected_percentiles=[
                float(p) for p in args.metric_percentiles.split(",")
            ],
        ))

    # Save config and results to json
    if args.save_result:
        result_json: Dict[str, Any] = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts

        # Metadata
        if args.metadata:
            for item in args.metadata:
                if "=" in item:
                    kvstring = item.split("=")
                    result_json[kvstring[0].strip()] = kvstring[1].strip()
                else:
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )

        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]
        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  # noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w") as outfile:
            json.dump(result_json, outfile)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default=None,
        help="Server or API base url if not using http host and port.",
    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--endpoint",
        type=str,
        default="/v1/completions",
        help="API endpoint.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in the "
        "next release.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="sharegpt",
        choices=["sharegpt", "sonnet", "random", "hf"],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
                        help="Path to the sharegpt/sonnet dataset. "
                        "Or the huggingface dataset ID if using HF dataset.")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Name of the model.",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        help=
        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--best-of",
        type=int,
        default=1,
        help="Generates `best_of` sequences per prompt and "
        "returns the best one.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--logprobs",
        type=int,
        default=None,
        help=("Number of logprobs-per-token to compute & return as part of "
              "the request. If unspecified, then either (1) if beam search "
              "is disabled, no logprobs are computed & a single dummy "
              "logprob is returned for each token; or (2) if beam search "
              "is enabled 1 logprob per token is computed"),
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use Poisson process to synthesize "
        "the request arrival times.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Use Torch Profiler. The endpoint must be launched with "
        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
    )
    parser.add_argument(
        "--save-result",
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
        nargs="*",
        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
        "for metadata of this run to be saved in the result JSON file "
        "for record keeping purposes.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=None,
        help="Specify directory to save benchmark json results."
        "If not specified, results are saved in the current directory.",
    )
    parser.add_argument(
        "--result-filename",
        type=str,
        default=None,
        help="Specify the filename to save benchmark json results."
        "If not specified, results will be saved in "
        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
        " format.",
    )
    parser.add_argument(
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request."
        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
    parser.add_argument(
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
        help="Comma-separated list of selected metrics to report percentiles. "
        "This argument specifies the metrics to report percentiles. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )

    # group for dataset specific arguments
    sonnet_group = parser.add_argument_group("sonnet dataset options")
    sonnet_group.add_argument(
        "--sonnet-input-len",
        type=int,
        default=550,
        help="Number of input tokens per request, used only for sonnet "
        "dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
        help="Number of output tokens per request, used only for sonnet "
        "dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
        help="Number of prefix tokens per request, used only for sonnet "
        "dataset.",
    )

    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
    sharegpt_group.add_argument(
        "--sharegpt-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the ShareGPT dataset.")

    random_group = parser.add_argument_group("random dataset options")
    random_group.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
        help="Number of input tokens per request, used only for random "
        "sampling.",
    )
    random_group.add_argument(
        "--random-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for random "
        "sampling.",
    )
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
        default=1.0,
        help="Range of sampled ratio of input/output length, "
        "used only for random sampling.",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
        help="Number of fixed prefix tokens before random "
        " context. The length range of context in a random "
        " request is [random-prefix-len, "
        " random-prefix-len + random-prefix-len * random-range-ratio).")

    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument("--hf-subset",
                          type=str,
                          default=None,
                          help="Subset of the HF dataset.")
    hf_group.add_argument("--hf-split",
                          type=str,
                          default=None,
                          help="Split of the HF dataset.")
    hf_group.add_argument(
        "--hf-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output lengths "
        "from the sampled HF dataset.",
    )

    args = parser.parse_args()
    main(args)
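Two details of the serving benchmark above are easy to miss in the diff: requests are paced by sampling inter-arrival gaps from an exponential distribution (a Poisson process at the --request-rate), and TPOT is computed per request as (latency - TTFT) / (output_len - 1). The short sketch below reproduces just those two calculations in isolation; the numeric values are made up for illustration and are not part of the scripts.

# Standalone illustration of the pacing and TPOT math used above
# (values are made up; not part of the benchmark scripts).
import numpy as np

request_rate = 4.0  # requests per second, as passed via --request-rate
np.random.seed(0)
# Poisson arrivals: exponential inter-arrival gaps with mean 1/request_rate.
gaps = np.random.exponential(1.0 / request_rate, size=5)
print("inter-arrival gaps (s):", np.round(gaps, 3))

# TPOT for one request: total latency minus TTFT, spread over the
# remaining (output_len - 1) decoded tokens.
latency_s, ttft_s, output_len = 2.4, 0.3, 64
tpot_ms = (latency_s - ttft_s) / (output_len - 1) * 1000
print(f"TPOT: {tpot_ms:.2f} ms/token")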
062/benchmark_throughput_0.6.2.py (new file, 0 → 100644)
"""Benchmark offline inference throughput."""
import argparse
import json
import random
import time
from typing import List, Optional, Tuple

import numpy as np
import torch
import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
from vllm.inputs import PromptInputs
from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser, merge_async_iterators


def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]

    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short
    filtered_dataset: List[Tuple[str, int, int]] = []
    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break

        # Tokenize the prompts and completions.
        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    return filtered_dataset


def run_vllm(
    warmup_requests: List[Tuple[str, int, int]],
    requests_json: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
    quantization: Optional[str],
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    max_model_len: Optional[int],
    enforce_eager: bool,
    kv_cache_dtype: str,
    quantization_param_path: Optional[str],
    device: str,
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    num_scheduler_steps: int = 1,
    use_v2_block_manager: bool = False,
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
    use_new_beam_search_impl: bool = False,
) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
        model=model,
        tokenizer=tokenizer,
        quantization=quantization,
        tensor_parallel_size=tensor_parallel_size,
        seed=seed,
        trust_remote_code=trust_remote_code,
        dtype=dtype,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=enforce_eager,
        kv_cache_dtype=kv_cache_dtype,
        quantization_param_path=quantization_param_path,
        device=device,
        enable_prefix_caching=enable_prefix_caching,
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
        load_format=load_format,
        num_scheduler_steps=num_scheduler_steps,
        use_v2_block_manager=use_v2_block_manager,
        disable_async_output_proc=disable_async_output_proc,
    )

    # warmup
    warmup_prompts = []
    warmup_sampling_params = []
    for prompt, _, output_len in warmup_requests:
        warmup_prompts.append(prompt)
        warmup_sampling_params.append(
            SamplingParams(
                n=n,
                temperature=0.0 if use_beam_search else 1.0,
                top_p=1.0,
                use_beam_search=use_beam_search,
                ignore_eos=True,
                max_tokens=output_len,
            ))
    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)

    info_json = {}
    for ELEprompt in args.num_prompts:
        for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
            info = {}
            requests = requests_json["{}_{}_{}".format(
                ELEprompt, ELEinput, ELEoutput)]
            # Add the requests to the engine.
            prompts: List[str] = []
            sampling_params: List[SamplingParams] = []
            for prompt, _, output_len in requests:
                prompts.append(prompt)
                sampling_params.append(
                    SamplingParams(
                        n=n,
                        temperature=0.0 if use_beam_search else 1.0,
                        top_p=1.0,
                        use_beam_search=use_beam_search,
                        ignore_eos=True,
                        max_tokens=output_len,
                    ))

            if not use_new_beam_search_impl:
                start = time.perf_counter()
                real_output = llm.generate(prompts,
                                           sampling_params,
                                           use_tqdm=True)
                end = time.perf_counter()
            else:
                assert use_beam_search
                prompts = [prompt for prompt, _, _ in requests]
                # output_len should be the same for all requests.
                output_len = requests[0][2]
                for prompt, input_len, _output_len in requests:
                    assert _output_len == output_len
                start = time.perf_counter()
                llm.beam_search(prompts,
                                beam_width=n,
                                max_tokens=output_len,
                                ignore_eos=True)
                end = time.perf_counter()

            total_ttfts = []
            total_tpops = []
            total_output_token_throughput = []
            total_inout_token_throughput = []
            for output in real_output:
                ttft_ = (output.metrics.first_token_time -
                         output.metrics.arrival_time)
                tpop_ = (output.metrics.finished_time -
                         output.metrics.arrival_time - ttft_) / (ELEoutput - 1)
                output_token_throughput = (ELEoutput) / (
                    output.metrics.finished_time -
                    output.metrics.arrival_time)
                inout_token_throughput = (ELEoutput + ELEinput) / (
                    output.metrics.finished_time -
                    output.metrics.arrival_time)
                total_ttfts.append(ttft_)
                total_tpops.append(tpop_)
                total_output_token_throughput.append(output_token_throughput)
                total_inout_token_throughput.append(inout_token_throughput)

            # total_num_tokens = sum(request.prompt_len + request.expected_output_len
            #                        for request in requests)
            # total_output_tokens = sum(request.expected_output_len
            #                           for request in requests)
            total_num_tokens = sum(prompt_len + output_len
                                   for _, prompt_len, output_len in requests)
            total_output_tokens = sum(
                output_len for _, prompt_len, output_len in requests)

            info["elapsed_time"] = np.around(end - start, 2)
            info["Throughput"] = np.around(
                len(requests) / info['elapsed_time'], 2)
            info["total_tokens"] = np.around(
                total_num_tokens / info['elapsed_time'], 2)
            info["output_tokens"] = np.around(
                total_output_tokens / info['elapsed_time'], 2)
            info["ttft_mean"] = np.around(np.mean(total_ttfts), 5)
            info["ttft_median"] = np.around(np.median(total_ttfts or 0), 5)
            info["ttft_p99"] = np.around(
                np.percentile(total_ttfts or 0, 99), 5)
            info["tpop_mean"] = np.around(np.mean(total_tpops), 4)
            info["tpop_median"] = np.around(np.median(total_tpops or 0), 5)
            info["tpop_p99"] = np.around(
                np.percentile(total_tpops or 0, 99), 5)
            info["output_token_throughput_mean"] = np.around(
                np.mean(total_output_token_throughput), 2)
            info["output_token_throughput_median"] = np.around(
                np.median(total_output_token_throughput or 0), 2)
            info["output_token_throughput_p99"] = np.around(
                np.percentile(total_output_token_throughput or 0, 99), 2)
            info["inout_token_throughput_mean"] = np.around(
                np.mean(total_inout_token_throughput), 2)
            info["inout_token_throughput_median"] = np.around(
                np.median(total_inout_token_throughput or 0), 2)
            info["inout_token_throughput_p99"] = np.around(
                np.percentile(total_inout_token_throughput or 0, 99), 2)
            info_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                        ELEoutput)] = info

            print("prompt:{},input:{},output:{}".format(
                ELEprompt, ELEinput, ELEoutput))
            print(f"Latency: {info['elapsed_time']:.2f} s")
            print(f"Throughput: "
                  f"{len(requests) / info['elapsed_time']:.2f} requests/s, "
                  f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
                  f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
            print("==============================================")
            print(f"total_out_tokens: {total_output_tokens:.2f} tokens")
            print(f"elapsed_time: {info['elapsed_time']:.2f} s")  # total elapsed time
            print(f"TTFT_mean: {info['ttft_mean']:.5f} s")  # first-token latency
            print(f"ttft_p99: {info['ttft_p99']:.5f} s")
            print(f"ttft_median: {info['ttft_median']:.5f} s")
            print(f"TPOP_mean: {info['tpop_mean']:.5f} s")  # per-token decode time
            print(f"tpop_median: {info['tpop_median']:.5f} s")
            print(f"tpop_p99: {info['tpop_p99']:.5f} s")
            # per-stream generation throughput
            print(f"output_token_throughput_mean: "
                  f"{info['output_token_throughput_mean']:.2f} tokens/s")
            print(f"output_token_throughput_median: "
                  f"{info['output_token_throughput_median']:.2f} tokens/s")
            print(f"output_token_throughput_p99: "
                  f"{info['output_token_throughput_p99']:.2f} tokens/s")
            # per-stream total (input + output) throughput
            print(f"inout_token_throughput_mean: "
                  f"{info['inout_token_throughput_mean']:.2f} tokens/s")
            print(f"inout_token_throughput_median: "
                  f"{info['inout_token_throughput_median']:.2f} tokens/s")
            print(f"inout_token_throughput_p99: "
                  f"{info['inout_token_throughput_p99']:.2f} tokens/s")
            print("==============================================")
            print("\n")
    return info_json


async def run_vllm_async(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
    quantization: Optional[str],
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    max_model_len: Optional[int],
    enforce_eager: bool,
    kv_cache_dtype: str,
    quantization_param_path: Optional[str],
    device: str,
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    num_scheduler_steps: int = 1,
    use_v2_block_manager: bool = False,
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
    disable_frontend_multiprocessing: bool = False,
) -> float:
    from vllm import SamplingParams
    engine_args = AsyncEngineArgs(
        model=model,
        tokenizer=tokenizer,
        quantization=quantization,
        tensor_parallel_size=tensor_parallel_size,
        seed=seed,
        trust_remote_code=trust_remote_code,
        dtype=dtype,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=enforce_eager,
        kv_cache_dtype=kv_cache_dtype,
        quantization_param_path=quantization_param_path,
        device=device,
        enable_prefix_caching=enable_prefix_caching,
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
        load_format=load_format,
        num_scheduler_steps=num_scheduler_steps,
        use_v2_block_manager=use_v2_block_manager,
        disable_async_output_proc=disable_async_output_proc,
        worker_use_ray=False,
        disable_log_requests=True,
    )

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:

        # Add the requests to the engine.
        prompts: List[str] = []
        sampling_params: List[SamplingParams] = []
        for prompt, _, output_len in requests:
            prompts.append(prompt)
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=0.0 if use_beam_search else 1.0,
                    top_p=1.0,
                    use_beam_search=use_beam_search,
                    ignore_eos=True,
                    max_tokens=output_len,
                ))

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
            generator = llm.generate(prompt, sp, request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start


def run_hf(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    use_beam_search: bool,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    assert not use_beam_search
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: List[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt, prompt_len, output_len = requests[i]
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            _, next_prompt_len, next_output_len = requests[i + 1]
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=not use_beam_search,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        # Include the decoding time.
        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start


def run_mii(
    requests: List[Tuple[str, int, int]],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
) -> float:
    from mii import client, serve
    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [prompt for prompt, _, _ in requests]

    start = time.perf_counter()
    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
    client = client(model)
    client.terminate_server()
    return end - start


def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    warmup_prompt = "hi" * 10
    warmup_requests = [(warmup_prompt, 10, 10) for _ in range(1)]
    if args.dataset is None:
        requests_json = {}
        for ELEprompt in args.num_prompts:
            for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
                # Synthesize a prompt with the given input length.
                prompt = "hi" * (ELEinput - 1)
                requests = [(prompt, ELEinput, ELEoutput)
                            for _ in range(ELEprompt)]
                print("type(requests):", type(requests))
                requests_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                                ELEoutput)] = requests
    else:
        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                   args.output_len)

    if args.backend == "vllm":
        if args.async_engine:
            run_args = [
                requests, args.model, args.tokenizer, args.quantization,
                args.tensor_parallel_size, args.seed, args.n,
                args.use_beam_search, args.trust_remote_code, args.dtype,
                args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
                args.quantization_param_path, args.device,
                args.enable_prefix_caching, args.enable_chunked_prefill,
                args.max_num_batched_tokens,
                args.distributed_executor_backend,
                args.gpu_memory_utilization, args.num_scheduler_steps,
                args.use_v2_block_manager, args.download_dir,
                args.load_format, args.disable_async_output_proc
            ]
        else:
            run_args = [
                warmup_requests, requests_json, args.model, args.tokenizer,
                args.quantization, args.tensor_parallel_size, args.seed,
                args.n, args.use_beam_search, args.trust_remote_code,
                args.dtype, args.max_model_len, args.enforce_eager,
                args.kv_cache_dtype, args.quantization_param_path,
                args.device, args.enable_prefix_caching,
                args.enable_chunked_prefill, args.max_num_batched_tokens,
                args.distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
num_scheduler_steps
,
args
.
use_v2_block_manager
,
args
.
download_dir
,
args
.
load_format
,
args
.
disable_async_output_proc
]
if
args
.
async_engine
:
run_args
.
append
(
args
.
disable_frontend_multiprocessing
)
elapsed_time
=
uvloop
.
run
(
run_vllm_async
(
*
run_args
))
else
:
info_json
=
run_vllm
(
*
run_args
,
args
.
use_new_beam_search_impl
)
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
args
.
use_beam_search
,
args
.
hf_max_batch_size
,
args
.
trust_remote_code
)
elif
args
.
backend
==
"mii"
:
elapsed_time
=
run_mii
(
requests
,
args
.
model
,
args
.
tensor_parallel_size
,
args
.
output_len
)
else
:
raise
ValueError
(
f
"Unknown backend:
{
args
.
backend
}
"
)
# total_num_tokens = sum(prompt_len + output_len
# for _, prompt_len, output_len in requests)
# if args.dataset is None:
# total_out_tokens = args.output_len * args.num_prompts
# else:
# total_out_tokens = sum(output_len for _, _, output_len in requests)
# print(f"Latency: {elapsed_time:.2f} s")
# print(f"All Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
# f"{total_num_tokens / elapsed_time:.2f} tokens/s")
# print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
with
open
(
args
.
output_json
,
"w"
)
as
f
:
title
=
"bs_in_out"
data_keys
=
info_json
[
list
(
info_json
.
keys
())[
0
]].
keys
()
keys_string
=
','
.
join
(
data_keys
)
title
=
title
+
","
+
keys_string
f
.
write
(
title
)
f
.
write
(
"
\n
"
)
for
key
,
value
in
info_json
.
items
():
values_as_strings
=
[
str
(
value
)
for
value
in
info_json
[
key
].
values
()]
values_string
=
','
.
join
(
values_as_strings
)
key
=
key
+
","
+
values_string
f
.
writelines
(
key
)
f
.
write
(
"
\n
"
)
# Output JSON results if specified
# if args.output_json:
# results = {
# "elapsed_time": elapsed_time,
# "num_requests": len(requests),
# "total_num_tokens": total_num_tokens,
# "requests_per_second": len(requests) / elapsed_time,
# "tokens_per_second": total_num_tokens / elapsed_time,
# }
# with open(args.output_json, "w") as f:
# json.dump(results, f, indent=4)
if
__name__
==
"__main__"
:
parser
=
FlexibleArgumentParser
(
description
=
"Benchmark the throughput."
)
parser
.
add_argument
(
"--backend"
,
type
=
str
,
choices
=
[
"vllm"
,
"hf"
,
"mii"
],
default
=
"vllm"
)
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
default
=
None
,
help
=
"Path to the dataset."
)
parser
.
add_argument
(
"--input-len"
,
type
=
int
,
nargs
=
"*"
,
default
=
None
,
help
=
"Input prompt length for each request"
)
parser
.
add_argument
(
"--output-len"
,
type
=
int
,
nargs
=
"*"
,
default
=
None
,
help
=
"Output length for each request. Overrides the "
"output length from the dataset."
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"facebook/opt-125m"
)
parser
.
add_argument
(
"--tokenizer"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
'--quantization'
,
'-q'
,
choices
=
[
*
QUANTIZATION_METHODS
,
None
],
default
=
None
)
parser
.
add_argument
(
"--tensor-parallel-size"
,
"-tp"
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
"--n"
,
type
=
int
,
default
=
1
,
help
=
"Number of generated sequences per prompt."
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
'--num-iters-warmup'
,
type
=
int
,
default
=
1
,
help
=
'Number of iterations to run for warmup.'
)
parser
.
add_argument
(
"--use-new-beam-search-impl"
,
action
=
"store_true"
)
parser
.
add_argument
(
"--num-prompts"
,
nargs
=
"*"
,
type
=
int
,
default
=
1000
,
help
=
"Number of prompts to process."
)
parser
.
add_argument
(
"--seed"
,
type
=
int
,
default
=
0
)
parser
.
add_argument
(
"--hf-max-batch-size"
,
type
=
int
,
default
=
None
,
help
=
"Maximum batch size for HF backend."
)
parser
.
add_argument
(
'--trust-remote-code'
,
action
=
'store_true'
,
help
=
'trust remote code from huggingface'
)
parser
.
add_argument
(
'--max-model-len'
,
type
=
int
,
default
=
None
,
help
=
'Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.'
)
parser
.
add_argument
(
'--dtype'
,
type
=
str
,
default
=
'auto'
,
choices
=
[
'auto'
,
'half'
,
'float16'
,
'bfloat16'
,
'float'
,
'float32'
],
help
=
'data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.'
)
parser
.
add_argument
(
'--gpu-memory-utilization'
,
type
=
float
,
default
=
0.9
,
help
=
'the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.'
)
parser
.
add_argument
(
"--enforce-eager"
,
action
=
"store_true"
,
help
=
"enforce eager execution"
)
parser
.
add_argument
(
'--kv-cache-dtype'
,
type
=
str
,
choices
=
[
'auto'
,
'fp8'
,
'fp8_e5m2'
,
'fp8_e4m3'
],
default
=
"auto"
,
help
=
'Data type for kv cache storage. If "auto", will use model '
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
'ROCm (hcu) supports fp8 (=fp8_e4m3)'
)
parser
.
add_argument
(
'--quantization-param-path'
,
type
=
str
,
default
=
None
,
help
=
'Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (hcu), FP8_E4M3 is '
'instead supported for common inference criteria.'
)
parser
.
add_argument
(
"--device"
,
type
=
str
,
default
=
"auto"
,
choices
=
DEVICE_OPTIONS
,
help
=
'device type for vLLM execution'
)
parser
.
add_argument
(
"--num-scheduler-steps"
,
type
=
int
,
default
=
1
,
help
=
"Maximum number of forward steps per scheduler call."
)
parser
.
add_argument
(
"--use-v2-block-manager"
,
action
=
'store_true'
,
help
=
"Enable block manager v2."
)
parser
.
add_argument
(
"--enable-prefix-caching"
,
action
=
'store_true'
,
help
=
"Enable automatic prefix caching for vLLM backend."
)
parser
.
add_argument
(
"--enable-chunked-prefill"
,
action
=
'store_true'
,
help
=
"enable chunked prefill for vLLM backend."
)
parser
.
add_argument
(
'--max-num-batched-tokens'
,
type
=
int
,
default
=
None
,
help
=
'maximum number of batched tokens per '
'iteration'
)
parser
.
add_argument
(
'--download-dir'
,
type
=
str
,
default
=
None
,
help
=
'directory to download and load the weights, '
'default to the default cache dir of huggingface'
)
parser
.
add_argument
(
'--output-json'
,
type
=
str
,
default
=
None
,
help
=
'Path to save the throughput results in JSON format.'
)
parser
.
add_argument
(
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
],
default
=
None
,
help
=
'Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.'
)
parser
.
add_argument
(
'--load-format'
,
type
=
str
,
default
=
EngineArgs
.
load_format
,
choices
=
[
'auto'
,
'pt'
,
'safetensors'
,
'npcache'
,
'dummy'
,
'tensorizer'
,
'bitsandbytes'
],
help
=
'The format of the model weights to load.
\n\n
'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.
\n
'
'* "pt" will load the weights in the pytorch bin format.
\n
'
'* "safetensors" will load the weights in the safetensors format.
\n
'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.
\n
'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.
\n
'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.
\n
'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.
\n
'
)
parser
.
add_argument
(
"--disable-async-output-proc"
,
action
=
'store_true'
,
default
=
False
,
help
=
"Disable async output processor for vLLM backend."
)
parser
.
add_argument
(
"--async-engine"
,
action
=
'store_true'
,
default
=
False
,
help
=
"Use vLLM async engine rather than LLM class."
)
parser
.
add_argument
(
"--disable-frontend-multiprocessing"
,
action
=
'store_true'
,
default
=
False
,
help
=
"Disable decoupled async engine frontend."
)
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
args
.
tokenizer
=
args
.
model
if
args
.
dataset
is
None
:
assert
args
.
input_len
is
not
None
assert
args
.
output_len
is
not
None
else
:
assert
args
.
input_len
is
None
if
args
.
backend
==
"vllm"
:
if
args
.
hf_max_batch_size
is
not
None
:
raise
ValueError
(
"HF max batch size is only for HF backend."
)
elif
args
.
backend
==
"hf"
:
if
args
.
hf_max_batch_size
is
None
:
raise
ValueError
(
"HF max batch size is required for HF backend."
)
if
args
.
quantization
is
not
None
:
raise
ValueError
(
"Quantization is only for vLLM backend."
)
elif
args
.
backend
==
"mii"
:
if
args
.
dtype
!=
"auto"
:
raise
ValueError
(
"dtype must be auto for MII backend."
)
if
args
.
n
!=
1
:
raise
ValueError
(
"n must be 1 for MII backend."
)
if
args
.
use_beam_search
:
raise
ValueError
(
"Beam search is not supported for MII backend."
)
if
args
.
quantization
is
not
None
:
raise
ValueError
(
"Quantization is only for vLLM backend."
)
if
args
.
hf_max_batch_size
is
not
None
:
raise
ValueError
(
"HF max batch size is only for HF backend."
)
if
args
.
tokenizer
!=
args
.
model
:
raise
ValueError
(
"Tokenizer must be the same as the model for MII "
"backend."
)
main
(
args
)
\ No newline at end of file
072/backend_request_func.py
0 → 100644
View file @
49c10c0c
# SPDX-License-Identifier: Apache-2.0
import
json
import
os
import
sys
import
time
import
traceback
from
dataclasses
import
dataclass
,
field
from
typing
import
List
,
Optional
,
Union
import
aiohttp
import
huggingface_hub.constants
from
tqdm.asyncio
import
tqdm
from
transformers
import
(
AutoTokenizer
,
PreTrainedTokenizer
,
PreTrainedTokenizerFast
)
AIOHTTP_TIMEOUT
=
aiohttp
.
ClientTimeout
(
total
=
6
*
60
*
60
)
@
dataclass
class
RequestFuncInput
:
prompt
:
str
api_url
:
str
prompt_len
:
int
output_len
:
int
model
:
str
model_name
:
Optional
[
str
]
=
None
best_of
:
int
=
1
logprobs
:
Optional
[
int
]
=
None
extra_body
:
Optional
[
dict
]
=
None
multi_modal_content
:
Optional
[
dict
]
=
None
ignore_eos
:
bool
=
False
@
dataclass
class
RequestFuncOutput
:
generated_text
:
str
=
""
success
:
bool
=
False
latency
:
float
=
0.0
output_tokens
:
int
=
0
ttft
:
float
=
0.0
# Time to first token
itl
:
List
[
float
]
=
field
(
default_factory
=
list
)
# List of inter-token latencies
tpot
:
float
=
0.0
# avg next-token latencies
prompt_len
:
int
=
0
error
:
str
=
""
async
def
async_request_tgi
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
"generate_stream"
)
async
with
aiohttp
.
ClientSession
(
trust_env
=
True
,
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
params
=
{
"best_of"
:
request_func_input
.
best_of
,
"max_new_tokens"
:
request_func_input
.
output_len
,
"do_sample"
:
True
,
"temperature"
:
0.01
,
# TGI does not accept 0.0 temperature.
"top_p"
:
0.99
,
# TGI does not accept 1.0 top_p.
"truncate"
:
request_func_input
.
prompt_len
,
# TGI does not accept ignore_eos flag.
}
payload
=
{
"inputs"
:
request_func_input
.
prompt
,
"parameters"
:
params
,
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
ttft
=
0.0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk_bytes
=
chunk_bytes
.
decode
(
"utf-8"
)
# NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if
chunk_bytes
.
startswith
(
":"
):
continue
chunk
=
chunk_bytes
.
removeprefix
(
"data:"
)
data
=
json
.
loads
(
chunk
)
timestamp
=
time
.
perf_counter
()
# First token
if
ttft
==
0.0
:
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
most_recent_timestamp
=
timestamp
output
.
latency
=
most_recent_timestamp
-
st
output
.
success
=
True
output
.
generated_text
=
data
[
"generated_text"
]
else
:
output
.
error
=
response
.
reason
or
""
output
.
success
=
False
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
return
output
async
def
async_request_trt_llm
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
"generate_stream"
)
async
with
aiohttp
.
ClientSession
(
trust_env
=
True
,
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
assert
request_func_input
.
best_of
==
1
payload
=
{
"accumulate_tokens"
:
True
,
"text_input"
:
request_func_input
.
prompt
,
"temperature"
:
0.0
,
"top_p"
:
1.0
,
"max_tokens"
:
request_func_input
.
output_len
,
"stream"
:
True
,
}
if
request_func_input
.
ignore_eos
:
payload
[
"min_length"
]
=
request_func_input
.
output_len
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
ttft
=
0.0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk
=
chunk_bytes
.
decode
(
"utf-8"
).
removeprefix
(
"data:"
)
data
=
json
.
loads
(
chunk
)
output
.
generated_text
+=
data
[
"text_output"
]
timestamp
=
time
.
perf_counter
()
# First token
if
ttft
==
0.0
:
ttft
=
timestamp
-
st
output
.
ttft
=
ttft
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
most_recent_timestamp
=
timestamp
output
.
latency
=
most_recent_timestamp
-
st
output
.
success
=
True
else
:
output
.
error
=
response
.
reason
or
""
output
.
success
=
False
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
return
output
async
def
async_request_deepspeed_mii
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
async
with
aiohttp
.
ClientSession
(
trust_env
=
True
,
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
assert
request_func_input
.
best_of
==
1
payload
=
{
"prompt"
:
request_func_input
.
prompt
,
"max_tokens"
:
request_func_input
.
output_len
,
"temperature"
:
0.01
,
# deepspeed-mii does not accept 0.0 temp.
"top_p"
:
1.0
,
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output
.
ttft
=
0
st
=
time
.
perf_counter
()
try
:
async
with
session
.
post
(
url
=
request_func_input
.
api_url
,
json
=
payload
)
as
response
:
if
response
.
status
==
200
:
parsed_resp
=
await
response
.
json
()
output
.
latency
=
time
.
perf_counter
()
-
st
output
.
generated_text
=
parsed_resp
[
"text"
][
0
]
output
.
success
=
True
else
:
output
.
error
=
response
.
reason
or
""
output
.
success
=
False
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
return
output
async
def
async_request_openai_completions
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
(
"completions"
,
"profile"
)
),
"OpenAI Completions API URL must end with 'completions' or 'profile'."
async
with
aiohttp
.
ClientSession
(
trust_env
=
True
,
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
payload
=
{
"model"
:
request_func_input
.
model_name
\
if
request_func_input
.
model_name
else
request_func_input
.
model
,
"prompt"
:
request_func_input
.
prompt
,
"temperature"
:
0.0
,
"best_of"
:
request_func_input
.
best_of
,
"max_tokens"
:
request_func_input
.
output_len
,
"logprobs"
:
request_func_input
.
logprobs
,
"stream"
:
True
,
"stream_options"
:
{
"include_usage"
:
True
,
},
}
if
request_func_input
.
ignore_eos
:
payload
[
"ignore_eos"
]
=
request_func_input
.
ignore_eos
if
request_func_input
.
extra_body
:
payload
.
update
(
request_func_input
.
extra_body
)
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
generated_text
=
""
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
first_chunk_received
=
False
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk
=
chunk_bytes
.
decode
(
"utf-8"
).
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
data
=
json
.
loads
(
chunk
)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if
choices
:
=
data
.
get
(
"choices"
):
# Note that text could be empty here
# e.g. for special tokens
text
=
choices
[
0
].
get
(
"text"
)
timestamp
=
time
.
perf_counter
()
# First token
if
not
first_chunk_received
:
first_chunk_received
=
True
ttft
=
time
.
perf_counter
()
-
st
output
.
ttft
=
ttft
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
most_recent_timestamp
=
timestamp
generated_text
+=
text
or
""
elif
usage
:
=
data
.
get
(
"usage"
):
output
.
output_tokens
=
usage
.
get
(
"completion_tokens"
)
if
first_chunk_received
:
output
.
success
=
True
else
:
output
.
success
=
False
output
.
error
=
(
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!"
)
output
.
generated_text
=
generated_text
output
.
latency
=
most_recent_timestamp
-
st
else
:
output
.
error
=
response
.
reason
or
""
output
.
success
=
False
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
return
output
async
def
async_request_openai_chat_completions
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
"chat/completions"
),
"OpenAI Chat Completions API URL must end with 'chat/completions'."
async
with
aiohttp
.
ClientSession
(
trust_env
=
True
,
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
content
=
[{
"type"
:
"text"
,
"text"
:
request_func_input
.
prompt
}]
if
request_func_input
.
multi_modal_content
:
content
.
append
(
request_func_input
.
multi_modal_content
)
payload
=
{
"model"
:
request_func_input
.
model_name
\
if
request_func_input
.
model_name
else
request_func_input
.
model
,
"messages"
:
[
{
"role"
:
"user"
,
"content"
:
content
},
],
"temperature"
:
0.0
,
"max_completion_tokens"
:
request_func_input
.
output_len
,
"stream"
:
True
,
"stream_options"
:
{
"include_usage"
:
True
,
},
}
if
request_func_input
.
ignore_eos
:
payload
[
"ignore_eos"
]
=
request_func_input
.
ignore_eos
if
request_func_input
.
extra_body
:
payload
.
update
(
request_func_input
.
extra_body
)
headers
=
{
"Content-Type"
:
"application/json"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
generated_text
=
""
ttft
=
0.0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk
=
chunk_bytes
.
decode
(
"utf-8"
).
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
timestamp
=
time
.
perf_counter
()
data
=
json
.
loads
(
chunk
)
if
choices
:
=
data
.
get
(
"choices"
):
content
=
choices
[
0
][
"delta"
].
get
(
"content"
)
# First token
if
ttft
==
0.0
:
ttft
=
timestamp
-
st
output
.
ttft
=
ttft
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
generated_text
+=
content
or
""
elif
usage
:
=
data
.
get
(
"usage"
):
output
.
output_tokens
=
usage
.
get
(
"completion_tokens"
)
most_recent_timestamp
=
timestamp
output
.
generated_text
=
generated_text
output
.
success
=
True
output
.
latency
=
most_recent_timestamp
-
st
else
:
output
.
error
=
response
.
reason
or
""
output
.
success
=
False
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
return
output
def
get_model
(
pretrained_model_name_or_path
:
str
)
->
str
:
if
os
.
getenv
(
'VLLM_USE_MODELSCOPE'
,
'False'
).
lower
()
==
'true'
:
from
modelscope
import
snapshot_download
model_path
=
snapshot_download
(
model_id
=
pretrained_model_name_or_path
,
local_files_only
=
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
,
ignore_file_pattern
=
[
".*.pt"
,
".*.safetensors"
,
".*.bin"
])
return
model_path
return
pretrained_model_name_or_path
def
get_tokenizer
(
pretrained_model_name_or_path
:
str
,
tokenizer_mode
:
str
=
"auto"
,
trust_remote_code
:
bool
=
False
,
**
kwargs
,
)
->
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
]:
if
pretrained_model_name_or_path
is
not
None
and
not
os
.
path
.
exists
(
pretrained_model_name_or_path
):
pretrained_model_name_or_path
=
get_model
(
pretrained_model_name_or_path
)
if
tokenizer_mode
==
"slow"
:
if
kwargs
.
get
(
"use_fast"
,
False
):
raise
ValueError
(
"Cannot use the fast tokenizer in slow tokenizer mode."
)
kwargs
[
"use_fast"
]
=
False
if
tokenizer_mode
==
"mistral"
:
try
:
from
vllm.transformers_utils.tokenizer
import
MistralTokenizer
except
ImportError
as
e
:
raise
ImportError
(
"MistralTokenizer requires vllm package.
\n
"
"Please install it with `pip install vllm` "
"to use mistral tokenizer mode."
)
from
e
return
MistralTokenizer
.
from_pretrained
(
str
(
pretrained_model_name_or_path
))
else
:
return
AutoTokenizer
.
from_pretrained
(
pretrained_model_name_or_path
,
trust_remote_code
=
trust_remote_code
,
**
kwargs
,
)
ASYNC_REQUEST_FUNCS
=
{
"tgi"
:
async_request_tgi
,
"vllm"
:
async_request_openai_completions
,
"lmdeploy"
:
async_request_openai_completions
,
"deepspeed-mii"
:
async_request_deepspeed_mii
,
"openai"
:
async_request_openai_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
"tensorrt-llm"
:
async_request_trt_llm
,
"scalellm"
:
async_request_openai_completions
,
"sglang"
:
async_request_openai_completions
,
}
072/benchmark_servein_0.7.2.py
0 → 100644
View file @
49c10c0c
# SPDX-License-Identifier: Apache-2.0
r
"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend <backend> \
--model <your_model> \
--dataset-name sharegpt \
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
when using tgi backend, add
--endpoint /generate_stream
to the end of the command above.
"""
import
argparse
import
asyncio
import
base64
import
gc
import
io
import
json
import
os
import
random
import
time
import
warnings
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
typing
import
Any
,
AsyncGenerator
,
Collection
,
Dict
,
List
,
Optional
,
Tuple
import
numpy
as
np
from
backend_request_func
import
(
ASYNC_REQUEST_FUNCS
,
RequestFuncInput
,
RequestFuncOutput
)
from
datasets
import
load_dataset
from
PIL.Image
import
Image
from
tqdm.asyncio
import
tqdm
from
transformers
import
PreTrainedTokenizerBase
try
:
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
except
ImportError
:
from
backend_request_func
import
get_tokenizer
try
:
from
vllm.utils
import
FlexibleArgumentParser
except
ImportError
:
from
argparse
import
ArgumentParser
as
FlexibleArgumentParser
MILLISECONDS_TO_SECONDS_CONVERSION
=
1000
@
dataclass
class
BenchmarkMetrics
:
completed
:
int
total_input
:
int
total_output
:
int
request_throughput
:
float
request_goodput
:
float
output_throughput
:
float
total_token_throughput
:
float
mean_ttft_ms
:
float
median_ttft_ms
:
float
std_ttft_ms
:
float
percentiles_ttft_ms
:
List
[
Tuple
[
float
,
float
]]
mean_tpot_ms
:
float
median_tpot_ms
:
float
std_tpot_ms
:
float
percentiles_tpot_ms
:
List
[
Tuple
[
float
,
float
]]
mean_itl_ms
:
float
median_itl_ms
:
float
std_itl_ms
:
float
percentiles_itl_ms
:
List
[
Tuple
[
float
,
float
]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms
:
float
median_e2el_ms
:
float
std_e2el_ms
:
float
percentiles_e2el_ms
:
List
[
Tuple
[
float
,
float
]]
def
sample_sharegpt_requests
(
dataset_path
:
str
,
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
List
[
Tuple
[
str
,
int
,
int
,
None
]]:
# Load the dataset.
with
open
(
dataset_path
,
encoding
=
'utf-8'
)
as
f
:
dataset
=
json
.
load
(
f
)
# Filter out the conversations with less than 2 turns.
dataset
=
[
data
for
data
in
dataset
if
len
(
data
[
"conversations"
])
>=
2
]
# Only keep the first two turns of each conversation.
dataset
=
[(
data
[
"conversations"
][
0
][
"value"
],
data
[
"conversations"
][
1
][
"value"
])
for
data
in
dataset
]
# Shuffle the dataset.
random
.
shuffle
(
dataset
)
# Filter out sequences that are too long or too short
filtered_dataset
:
List
[
Tuple
[
str
,
int
,
int
]]
=
[]
for
i
in
range
(
len
(
dataset
)):
if
len
(
filtered_dataset
)
==
num_requests
:
break
# Tokenize the prompts and completions.
prompt
=
dataset
[
i
][
0
]
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
completion
=
dataset
[
i
][
1
]
completion_token_ids
=
tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
if
prompt_len
<
4
or
(
fixed_output_len
is
None
and
output_len
<
4
):
# Prune too short sequences.
continue
if
prompt_len
>
1024
or
prompt_len
+
output_len
>
2048
:
# Prune too long sequences.
continue
filtered_dataset
.
append
((
prompt
,
prompt_len
,
output_len
,
None
))
return
filtered_dataset
def
sample_sonnet_requests
(
dataset_path
:
str
,
num_requests
:
int
,
input_len
:
int
,
output_len
:
int
,
prefix_len
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
List
[
Tuple
[
str
,
str
,
int
,
int
,
None
]]:
assert
(
input_len
>
prefix_len
),
"'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
# Load the dataset.
with
open
(
dataset_path
,
encoding
=
'utf-8'
)
as
f
:
poem_lines
=
f
.
readlines
()
# Tokenize the poem lines.
poem_token_ids
=
tokenizer
(
poem_lines
).
input_ids
average_poem_len
=
sum
(
len
(
token_ids
)
for
token_ids
in
poem_token_ids
)
/
len
(
poem_token_ids
)
# Base prefix for all requests.
base_prompt
=
"Pick as many lines as you can from these poem lines:
\n
"
base_message
=
[{
"role"
:
"user"
,
"content"
:
base_prompt
,
}]
base_prompt_formatted
=
tokenizer
.
apply_chat_template
(
base_message
,
add_generation_prompt
=
True
,
tokenize
=
False
)
base_prompt_offset
=
len
(
tokenizer
(
base_prompt_formatted
).
input_ids
)
assert
(
input_len
>
base_prompt_offset
),
f
"Please set 'args.sonnet-input-len' higher than
{
base_prompt_offset
}
."
num_input_lines
=
round
(
(
input_len
-
base_prompt_offset
)
/
average_poem_len
)
# First approximately `prefix_len` number of tokens in the
# prompt are fixed poem lines.
assert
(
prefix_len
>
base_prompt_offset
),
f
"Please set 'args.sonnet-prefix-len' higher than
{
base_prompt_offset
}
."
num_prefix_lines
=
round
(
(
prefix_len
-
base_prompt_offset
)
/
average_poem_len
)
prefix_lines
=
poem_lines
[:
num_prefix_lines
]
# Sample the rest of lines per request.
sampled_requests
:
List
[
Tuple
[
str
,
int
,
int
]]
=
[]
for
_
in
range
(
num_requests
):
num_lines_needed
=
num_input_lines
-
num_prefix_lines
sampled_lines
=
""
.
join
(
prefix_lines
+
random
.
choices
(
poem_lines
,
k
=
num_lines_needed
))
prompt
=
f
"
{
base_prompt
}{
sampled_lines
}
"
message
=
[
{
"role"
:
"user"
,
"content"
:
prompt
,
},
]
prompt_formatted
=
tokenizer
.
apply_chat_template
(
message
,
add_generation_prompt
=
True
,
tokenize
=
False
)
prompt_len
=
len
(
tokenizer
(
prompt_formatted
).
input_ids
)
sampled_requests
.
append
(
(
prompt
,
prompt_formatted
,
prompt_len
,
output_len
,
None
))
return
sampled_requests
def
sample_vision_arena_requests
(
dataset
,
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
List
[
Tuple
[
str
,
str
,
int
,
Optional
[
Dict
[
str
,
Collection
[
str
]]]]]:
sampled_requests
:
List
[
Tuple
[
str
,
int
,
int
,
Dict
[
str
,
Collection
[
str
]]]]
=
[]
for
data
in
dataset
:
if
len
(
sampled_requests
)
==
num_requests
:
break
prompt
=
data
[
"turns"
][
0
][
0
][
'content'
]
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
if
fixed_output_len
is
None
:
# Default max output len is set to 128
print
(
"--hf-output-len is not provided. Using default value 128."
)
fixed_output_len
=
128
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
fixed_output_len
assert
isinstance
(
data
[
"images"
][
0
],
Image
),
(
"Input image format must be `PIL.Image.Image`, "
f
"given
{
type
(
data
[
'image'
])
}
."
)
image
:
Image
=
data
[
"images"
][
0
]
image
=
image
.
convert
(
"RGB"
)
image_data
=
io
.
BytesIO
()
image
.
save
(
image_data
,
format
=
'JPEG'
)
image_base64
=
base64
.
b64encode
(
image_data
.
getvalue
()).
decode
(
"utf-8"
)
mm_content
=
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
f
"data:image/jpeg;base64,
{
image_base64
}
"
},
}
sampled_requests
.
append
((
prompt
,
prompt_len
,
output_len
,
mm_content
))
return
sampled_requests
def
sample_hf_requests
(
dataset_path
:
str
,
dataset_subset
:
Optional
[
str
],
dataset_split
:
str
,
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
random_seed
:
int
,
fixed_output_len
:
Optional
[
int
]
=
None
,
)
->
List
[
Tuple
[
str
,
str
,
int
,
Optional
[
Dict
[
str
,
Collection
[
str
]]]]]:
# Special case for vision_arena dataset
if
dataset_path
==
'lmarena-ai/vision-arena-bench-v0.1'
\
and
dataset_subset
is
None
:
assert
dataset_split
==
"train"
dataset
=
load_dataset
(
dataset_path
,
name
=
dataset_subset
,
split
=
dataset_split
,
streaming
=
True
)
dataset
=
dataset
.
shuffle
(
seed
=
random_seed
)
return
sample_vision_arena_requests
(
dataset
,
num_requests
,
tokenizer
,
fixed_output_len
)
dataset
=
load_dataset
(
dataset_path
,
name
=
dataset_subset
,
split
=
dataset_split
,
streaming
=
True
)
assert
"conversations"
in
dataset
.
features
,
(
"HF Dataset must have 'conversations' column."
)
filter_func
=
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
filtered_dataset
=
dataset
.
shuffle
(
seed
=
random_seed
).
filter
(
filter_func
)
sampled_requests
:
List
[
Tuple
[
str
,
int
,
int
,
Dict
[
str
,
Collection
[
str
]]]]
=
[]
for
data
in
filtered_dataset
:
if
len
(
sampled_requests
)
==
num_requests
:
break
# Tokenize the prompts and completions.
prompt
=
data
[
"conversations"
][
0
][
"value"
]
prompt_token_ids
=
tokenizer
(
prompt
).
input_ids
completion
=
data
[
"conversations"
][
1
][
"value"
]
completion_token_ids
=
tokenizer
(
completion
).
input_ids
prompt_len
=
len
(
prompt_token_ids
)
output_len
=
len
(
completion_token_ids
)
if
fixed_output_len
is
None
else
fixed_output_len
if
fixed_output_len
is
None
and
(
prompt_len
<
4
or
output_len
<
4
):
# Prune too short sequences.
continue
if
fixed_output_len
is
None
and
\
(
prompt_len
>
1024
or
prompt_len
+
output_len
>
2048
):
# Prune too long sequences.
continue
if
"image"
in
data
and
isinstance
(
data
[
"image"
],
Image
):
image
:
Image
=
data
[
"image"
]
image
=
image
.
convert
(
"RGB"
)
image_data
=
io
.
BytesIO
()
image
.
save
(
image_data
,
format
=
'JPEG'
)
image_base64
=
base64
.
b64encode
(
image_data
.
getvalue
()).
decode
(
"utf-8"
)
mm_content
=
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
f
"data:image/jpeg;base64,
{
image_base64
}
"
},
}
elif
"image"
in
data
and
isinstance
(
data
[
"image"
],
str
):
if
(
data
[
"image"
].
startswith
(
"http://"
)
or
\
data
[
"image"
].
startswith
(
"file://"
)):
image_url
=
data
[
"image"
]
else
:
image_url
=
f
"file://
{
data
[
'image'
]
}
"
mm_content
=
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
},
}
else
:
mm_content
=
None
sampled_requests
.
append
((
prompt
,
prompt_len
,
output_len
,
mm_content
))
return
sampled_requests
def
sample_random_requests
(
prefix_len
:
int
,
input_len
:
int
,
output_len
:
int
,
num_prompts
:
int
,
range_ratio
:
float
,
tokenizer
:
PreTrainedTokenizerBase
,
)
->
List
[
Tuple
[
str
,
int
,
int
]]:
prefix_token_ids
=
np
.
random
.
randint
(
0
,
tokenizer
.
vocab_size
,
size
=
prefix_len
).
tolist
()
input_lens
=
np
.
random
.
randint
(
int
(
input_len
*
range_ratio
),
input_len
+
1
,
size
=
num_prompts
,
)
output_lens
=
np
.
random
.
randint
(
int
(
output_len
*
range_ratio
),
output_len
+
1
,
size
=
num_prompts
,
)
offsets
=
np
.
random
.
randint
(
0
,
tokenizer
.
vocab_size
,
size
=
num_prompts
)
input_requests
=
[]
for
i
in
range
(
num_prompts
):
prompt
=
tokenizer
.
decode
(
prefix_token_ids
+
[(
offsets
[
i
]
+
i
+
j
)
%
tokenizer
.
vocab_size
for
j
in
range
(
input_lens
[
i
])])
input_requests
.
append
((
prompt
,
int
(
prefix_len
+
input_lens
[
i
]),
int
(
output_lens
[
i
]),
None
))
return
input_requests
async
def
get_request
(
input_requests
:
List
[
Tuple
[
str
,
int
,
int
]],
request_rate
:
float
,
burstiness
:
float
=
1.0
,
)
->
AsyncGenerator
[
Tuple
[
str
,
int
,
int
],
None
]:
"""
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
Args:
input_requests:
A list of input requests, each represented as a tuple.
request_rate:
The rate at which requests are generated (requests/s).
burstiness (optional):
The burstiness factor of the request generation.
Only takes effect when request_rate is not inf.
Default value is 1, which follows a Poisson process.
Otherwise, the request intervals follow a gamma distribution.
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests.
"""
input_requests
=
iter
(
input_requests
)
# Calculate scale parameter theta to maintain the desired request_rate.
assert
burstiness
>
0
,
(
f
"A positive burstiness factor is expected, but given
{
burstiness
}
."
)
theta
=
1.0
/
(
request_rate
*
burstiness
)
for
request
in
input_requests
:
yield
request
if
request_rate
==
float
(
"inf"
):
# If the request rate is infinity, then we don't need to wait.
continue
# Sample the request interval from the gamma distribution.
# If burstiness is 1, it follows exponential distribution.
interval
=
np
.
random
.
gamma
(
shape
=
burstiness
,
scale
=
theta
)
# The next request will be sent after the interval.
await
asyncio
.
sleep
(
interval
)
def
calculate_metrics
(
input_requests
:
List
[
Tuple
[
str
,
int
,
int
]],
outputs
:
List
[
RequestFuncOutput
],
dur_s
:
float
,
tokenizer
:
PreTrainedTokenizerBase
,
selected_percentile_metrics
:
List
[
str
],
selected_percentiles
:
List
[
float
],
goodput_config_dict
:
Dict
[
str
,
float
],
)
->
Tuple
[
BenchmarkMetrics
,
List
[
int
]]:
actual_output_lens
:
List
[
int
]
=
[]
total_input
=
0
completed
=
0
good_completed
=
0
itls
:
List
[
float
]
=
[]
tpots
:
List
[
float
]
=
[]
all_tpots
:
List
[
float
]
=
[]
ttfts
:
List
[
float
]
=
[]
e2els
:
List
[
float
]
=
[]
for
i
in
range
(
len
(
outputs
)):
if
outputs
[
i
].
success
:
output_len
=
outputs
[
i
].
output_tokens
if
output_len
is
None
:
# We use the tokenizer to count the number of output tokens
# for some serving backends instead of looking at
# len(outputs[i].itl) since multiple output tokens may be
# bundled together
# Note : this may inflate the output token count slightly
output_len
=
len
(
tokenizer
(
outputs
[
i
].
generated_text
,
add_special_tokens
=
False
).
input_ids
)
actual_output_lens
.
append
(
output_len
)
total_input
+=
input_requests
[
i
][
1
]
tpot
=
0
if
output_len
>
1
:
latency_minus_ttft
=
outputs
[
i
].
latency
-
outputs
[
i
].
ttft
tpot
=
latency_minus_ttft
/
(
output_len
-
1
)
tpots
.
append
(
tpot
)
# Note: if output_len <= 1, we regard tpot as 0 for goodput
all_tpots
.
append
(
tpot
)
itls
+=
outputs
[
i
].
itl
ttfts
.
append
(
outputs
[
i
].
ttft
)
e2els
.
append
(
outputs
[
i
].
latency
)
completed
+=
1
else
:
actual_output_lens
.
append
(
0
)
if
goodput_config_dict
:
valid_metrics
=
[]
slo_values
=
[]
if
"ttft"
in
goodput_config_dict
:
valid_metrics
.
append
(
ttfts
)
slo_values
.
append
(
goodput_config_dict
[
"ttft"
]
/
MILLISECONDS_TO_SECONDS_CONVERSION
)
if
"tpot"
in
goodput_config_dict
:
valid_metrics
.
append
(
all_tpots
)
slo_values
.
append
(
goodput_config_dict
[
"tpot"
]
/
MILLISECONDS_TO_SECONDS_CONVERSION
)
if
"e2el"
in
goodput_config_dict
:
valid_metrics
.
append
(
e2els
)
slo_values
.
append
(
goodput_config_dict
[
"e2el"
]
/
MILLISECONDS_TO_SECONDS_CONVERSION
)
for
req_metric
in
zip
(
*
valid_metrics
):
is_good_req
=
all
([
s
>=
r
for
s
,
r
in
zip
(
slo_values
,
req_metric
)])
if
is_good_req
:
good_completed
+=
1
if
completed
==
0
:
warnings
.
warn
(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments."
,
stacklevel
=
2
)
metrics
=
BenchmarkMetrics
(
completed
=
completed
,
total_input
=
total_input
,
total_output
=
sum
(
actual_output_lens
),
request_throughput
=
completed
/
dur_s
,
request_goodput
=
good_completed
/
dur_s
,
output_throughput
=
sum
(
actual_output_lens
)
/
dur_s
,
total_token_throughput
=
(
total_input
+
sum
(
actual_output_lens
))
/
dur_s
,
mean_ttft_ms
=
np
.
mean
(
ttfts
or
0
)
*
1000
,
# ttfts is empty if streaming is not supported by backend
std_ttft_ms
=
np
.
std
(
ttfts
or
0
)
*
1000
,
median_ttft_ms
=
np
.
median
(
ttfts
or
0
)
*
1000
,
percentiles_ttft_ms
=
[(
p
,
np
.
percentile
(
ttfts
or
0
,
p
)
*
1000
)
for
p
in
selected_percentiles
],
mean_tpot_ms
=
np
.
mean
(
tpots
or
0
)
*
1000
,
std_tpot_ms
=
np
.
std
(
tpots
or
0
)
*
1000
,
median_tpot_ms
=
np
.
median
(
tpots
or
0
)
*
1000
,
percentiles_tpot_ms
=
[(
p
,
np
.
percentile
(
tpots
or
0
,
p
)
*
1000
)
for
p
in
selected_percentiles
],
mean_itl_ms
=
np
.
mean
(
itls
or
0
)
*
1000
,
std_itl_ms
=
np
.
std
(
itls
or
0
)
*
1000
,
median_itl_ms
=
np
.
median
(
itls
or
0
)
*
1000
,
percentiles_itl_ms
=
[(
p
,
np
.
percentile
(
itls
or
0
,
p
)
*
1000
)
for
p
in
selected_percentiles
],
mean_e2el_ms
=
np
.
mean
(
e2els
or
0
)
*
1000
,
std_e2el_ms
=
np
.
std
(
e2els
or
0
)
*
1000
,
median_e2el_ms
=
np
.
median
(
e2els
or
0
)
*
1000
,
percentiles_e2el_ms
=
[(
p
,
np
.
percentile
(
e2els
or
0
,
p
)
*
1000
)
for
p
in
selected_percentiles
],
)
return
metrics
,
actual_output_lens
async
def
benchmark
(
backend
:
str
,
api_url
:
str
,
base_url
:
str
,
model_id
:
str
,
model_name
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
input_requests
:
List
[
Tuple
[
str
,
int
,
int
]],
logprobs
:
Optional
[
int
],
best_of
:
int
,
request_rate
:
float
,
burstiness
:
float
,
disable_tqdm
:
bool
,
profile
:
bool
,
selected_percentile_metrics
:
List
[
str
],
selected_percentiles
:
List
[
str
],
ignore_eos
:
bool
,
goodput_config_dict
:
Dict
[
str
,
float
],
max_concurrency
:
Optional
[
int
],
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
else
:
raise
ValueError
(
f
"Unknown backend:
{
backend
}
"
)
print
(
"Starting initial single prompt test run..."
)
test_prompt
,
test_prompt_len
,
test_output_len
,
test_mm_content
=
(
input_requests
[
0
])
if
backend
!=
"openai-chat"
and
test_mm_content
is
not
None
:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise
ValueError
(
"Multi-modal content is only supported on 'openai-chat' backend."
)
test_input
=
RequestFuncInput
(
model
=
model_id
,
model_name
=
model_name
,
prompt
=
test_prompt
,
api_url
=
api_url
,
prompt_len
=
test_prompt_len
,
output_len
=
test_output_len
,
logprobs
=
logprobs
,
best_of
=
best_of
,
multi_modal_content
=
test_mm_content
,
ignore_eos
=
ignore_eos
,
)
test_output
=
await
request_func
(
request_func_input
=
test_input
)
if
not
test_output
.
success
:
raise
ValueError
(
"Initial test run failed - Please make sure benchmark arguments "
f
"are correctly specified. Error:
{
test_output
.
error
}
"
)
else
:
print
(
"Initial test run completed. Starting main benchmark run..."
)
if
profile
:
print
(
"Starting profiler..."
)
profile_input
=
RequestFuncInput
(
model
=
model_id
,
model_name
=
model_name
,
prompt
=
test_prompt
,
api_url
=
base_url
+
"/start_profile"
,
prompt_len
=
test_prompt_len
,
output_len
=
test_output_len
,
logprobs
=
logprobs
,
best_of
=
best_of
,
multi_modal_content
=
test_mm_content
,
ignore_eos
=
ignore_eos
)
profile_output
=
await
request_func
(
request_func_input
=
profile_input
)
if
profile_output
.
success
:
print
(
"Profiler started"
)
if
burstiness
==
1.0
:
distribution
=
"Poisson process"
else
:
distribution
=
"Gamma distribution"
print
(
f
"Traffic request rate:
{
request_rate
}
"
)
print
(
f
"Burstiness factor:
{
burstiness
}
(
{
distribution
}
)"
)
print
(
f
"Maximum request concurrency:
{
max_concurrency
}
"
)
pbar
=
None
if
disable_tqdm
else
tqdm
(
total
=
len
(
input_requests
))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore
=
(
asyncio
.
Semaphore
(
max_concurrency
)
if
max_concurrency
else
None
)
async
def
limited_request_func
(
request_func_input
,
pbar
):
if
semaphore
is
None
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
async
with
semaphore
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
benchmark_start_time
=
time
.
perf_counter
()
tasks
:
List
[
asyncio
.
Task
]
=
[]
async
for
request
in
get_request
(
input_requests
,
request_rate
,
burstiness
):
prompt
,
prompt_len
,
output_len
,
mm_content
=
request
request_func_input
=
RequestFuncInput
(
model
=
model_id
,
model_name
=
model_name
,
prompt
=
prompt
,
api_url
=
api_url
,
prompt_len
=
prompt_len
,
output_len
=
output_len
,
logprobs
=
logprobs
,
best_of
=
best_of
,
multi_modal_content
=
mm_content
,
ignore_eos
=
ignore_eos
)
tasks
.
append
(
asyncio
.
create_task
(
limited_request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)))
outputs
:
List
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
if
profile
:
print
(
"Stopping profiler..."
)
profile_input
=
RequestFuncInput
(
model
=
model_id
,
prompt
=
test_prompt
,
api_url
=
base_url
+
"/stop_profile"
,
prompt_len
=
test_prompt_len
,
output_len
=
test_output_len
,
logprobs
=
logprobs
,
best_of
=
best_of
,
)
profile_output
=
await
request_func
(
request_func_input
=
profile_input
)
if
profile_output
.
success
:
print
(
"Profiler stopped"
)
if
pbar
is
not
None
:
pbar
.
close
()
benchmark_duration
=
time
.
perf_counter
()
-
benchmark_start_time
metrics
,
actual_output_lens
=
calculate_metrics
(
input_requests
=
input_requests
,
outputs
=
outputs
,
dur_s
=
benchmark_duration
,
tokenizer
=
tokenizer
,
selected_percentile_metrics
=
selected_percentile_metrics
,
selected_percentiles
=
selected_percentiles
,
goodput_config_dict
=
goodput_config_dict
,
)
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
print
(
"{:<40} {:<10}"
.
format
(
"Successful requests:"
,
metrics
.
completed
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
benchmark_duration
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
metrics
.
request_throughput
))
if
goodput_config_dict
:
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request goodput (req/s):"
,
metrics
.
request_goodput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total Token throughput (tok/s):"
,
metrics
.
total_token_throughput
))
result
=
{
"duration"
:
benchmark_duration
,
"completed"
:
metrics
.
completed
,
"total_input_tokens"
:
metrics
.
total_input
,
"total_output_tokens"
:
metrics
.
total_output
,
"request_throughput"
:
metrics
.
request_throughput
,
"request_goodput:"
:
metrics
.
request_goodput
if
goodput_config_dict
else
None
,
"output_throughput"
:
metrics
.
output_throughput
,
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"output_lens"
:
actual_output_lens
,
"ttfts"
:
[
output
.
ttft
for
output
in
outputs
],
"itls"
:
[
output
.
itl
for
output
in
outputs
],
"generated_texts"
:
[
output
.
generated_text
for
output
in
outputs
],
"errors"
:
[
output
.
error
for
output
in
outputs
],
}
def
process_one_metric
(
# E.g., "ttft"
metric_attribute_name
:
str
,
# E.g., "TTFT"
metric_name
:
str
,
# E.g., "Time to First Token"
metric_header
:
str
,
):
# This function prints and adds statistics of the specified
# metric.
if
metric_attribute_name
not
in
selected_percentile_metrics
:
return
print
(
"{s:{c}^{n}}"
.
format
(
s
=
metric_header
,
n
=
50
,
c
=
'-'
))
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"Mean
{
metric_name
}
(ms):"
,
getattr
(
metrics
,
f
"mean_
{
metric_attribute_name
}
_ms"
)))
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"Median
{
metric_name
}
(ms):"
,
getattr
(
metrics
,
f
"median_
{
metric_attribute_name
}
_ms"
)))
result
[
f
"mean_
{
metric_attribute_name
}
_ms"
]
=
getattr
(
metrics
,
f
"mean_
{
metric_attribute_name
}
_ms"
)
result
[
f
"median_
{
metric_attribute_name
}
_ms"
]
=
getattr
(
metrics
,
f
"median_
{
metric_attribute_name
}
_ms"
)
result
[
f
"std_
{
metric_attribute_name
}
_ms"
]
=
getattr
(
metrics
,
f
"std_
{
metric_attribute_name
}
_ms"
)
for
p
,
value
in
getattr
(
metrics
,
f
"percentiles_
{
metric_attribute_name
}
_ms"
):
p_word
=
str
(
int
(
p
))
if
int
(
p
)
==
p
else
str
(
p
)
print
(
"{:<40} {:<10.2f}"
.
format
(
f
"P
{
p_word
}
{
metric_name
}
(ms):"
,
value
))
result
[
f
"p
{
p_word
}
_
{
metric_attribute_name
}
_ms"
]
=
value
process_one_metric
(
"ttft"
,
"TTFT"
,
"Time to First Token"
)
process_one_metric
(
"tpot"
,
"TPOT"
,
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
"itl"
,
"ITL"
,
"Inter-token Latency"
)
process_one_metric
(
"e2el"
,
"E2EL"
,
"End-to-end Latency"
)
print
(
"="
*
50
)
return
result
def
check_goodput_args
(
args
):
# Check and parse goodput arguments
goodput_config_dict
=
{}
VALID_NAMES
=
[
"ttft"
,
"tpot"
,
"e2el"
]
if
args
.
goodput
:
goodput_config_dict
=
parse_goodput
(
args
.
goodput
)
for
slo_name
,
slo_val
in
goodput_config_dict
.
items
():
if
slo_name
not
in
VALID_NAMES
:
raise
ValueError
(
f
"Invalid metric name found,
{
slo_name
}
:
{
slo_val
}
. "
"The service level objective name should be one of "
f
"
{
str
(
VALID_NAMES
)
}
. "
)
if
slo_val
<
0
:
raise
ValueError
(
f
"Invalid value found,
{
slo_name
}
:
{
slo_val
}
. "
"The service level objective value should be "
"non-negative."
)
return
goodput_config_dict
def
parse_goodput
(
slo_pairs
):
goodput_config_dict
=
{}
try
:
for
slo_pair
in
slo_pairs
:
slo_name
,
slo_val
=
slo_pair
.
split
(
":"
)
goodput_config_dict
[
slo_name
]
=
float
(
slo_val
)
except
ValueError
as
err
:
raise
argparse
.
ArgumentTypeError
(
"Invalid format found for service level objectives. "
"Specify service level objectives for goodput as
\"
KEY:VALUE
\"
"
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds."
)
from
err
return
goodput_config_dict
def
main
(
args
:
argparse
.
Namespace
):
print
(
args
)
random
.
seed
(
args
.
seed
)
np
.
random
.
seed
(
args
.
seed
)
backend
=
args
.
backend
model_id
=
args
.
model
model_name
=
args
.
served_model_name
tokenizer_id
=
args
.
tokenizer
if
args
.
tokenizer
is
not
None
else
args
.
model
tokenizer_mode
=
args
.
tokenizer_mode
if
args
.
base_url
is
not
None
:
api_url
=
f
"
{
args
.
base_url
}{
args
.
endpoint
}
"
base_url
=
f
"
{
args
.
base_url
}
"
else
:
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}{
args
.
endpoint
}
"
base_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
"
tokenizer
=
get_tokenizer
(
tokenizer_id
,
tokenizer_mode
=
tokenizer_mode
,
trust_remote_code
=
args
.
trust_remote_code
)
if
args
.
dataset
is
not
None
:
warnings
.
warn
(
"The '--dataset' argument will be deprecated in the next "
"release. Please use '--dataset-name' and "
"'--dataset-path' in the future runs."
,
stacklevel
=
2
)
input_requests
=
sample_sharegpt_requests
(
dataset_path
=
args
.
dataset
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
fixed_output_len
=
args
.
sharegpt_output_len
,
)
elif
args
.
dataset_name
==
"sharegpt"
:
input_requests
=
sample_sharegpt_requests
(
dataset_path
=
args
.
dataset_path
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
fixed_output_len
=
args
.
sharegpt_output_len
,
)
elif
args
.
dataset_name
==
"sonnet"
:
# Do not format the prompt, pass to message directly
if
args
.
backend
==
"openai-chat"
:
input_requests
=
sample_sonnet_requests
(
dataset_path
=
args
.
dataset_path
,
num_requests
=
args
.
num_prompts
,
input_len
=
args
.
sonnet_input_len
,
output_len
=
args
.
sonnet_output_len
,
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
)
input_requests
=
[(
prompt
,
prompt_len
,
output_len
,
None
)
for
prompt
,
prompt_formatted
,
prompt_len
,
output_len
,
_
in
input_requests
]
else
:
assert
(
tokenizer
.
chat_template
or
tokenizer
.
default_chat_template
),
"Tokenizer/model must have chat template for sonnet dataset."
input_requests
=
sample_sonnet_requests
(
dataset_path
=
args
.
dataset_path
,
num_requests
=
args
.
num_prompts
,
input_len
=
args
.
sonnet_input_len
,
output_len
=
args
.
sonnet_output_len
,
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
)
input_requests
=
[(
prompt_formatted
,
prompt_len
,
output_len
,
None
)
for
prompt
,
prompt_formatted
,
prompt_len
,
output_len
,
_
in
input_requests
]
elif
args
.
dataset_name
==
"hf"
:
input_requests
=
sample_hf_requests
(
dataset_path
=
args
.
dataset_path
,
dataset_subset
=
args
.
hf_subset
,
dataset_split
=
args
.
hf_split
,
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
random_seed
=
args
.
seed
,
fixed_output_len
=
args
.
hf_output_len
,
)
elif
args
.
dataset_name
==
"random"
:
input_requests
=
sample_random_requests
(
prefix_len
=
args
.
random_prefix_len
,
input_len
=
args
.
random_input_len
,
output_len
=
args
.
random_output_len
,
num_prompts
=
args
.
num_prompts
,
range_ratio
=
args
.
random_range_ratio
,
tokenizer
=
tokenizer
,
)
else
:
raise
ValueError
(
f
"Unknown dataset:
{
args
.
dataset_name
}
"
)
goodput_config_dict
=
check_goodput_args
(
args
)
# Avoid GC processing "static" data - reduce pause times.
gc
.
collect
()
gc
.
freeze
()
benchmark_result
=
asyncio
.
run
(
benchmark
(
backend
=
backend
,
api_url
=
api_url
,
base_url
=
base_url
,
model_id
=
model_id
,
model_name
=
model_name
,
tokenizer
=
tokenizer
,
input_requests
=
input_requests
,
logprobs
=
args
.
logprobs
,
best_of
=
args
.
best_of
,
request_rate
=
args
.
request_rate
,
burstiness
=
args
.
burstiness
,
disable_tqdm
=
args
.
disable_tqdm
,
profile
=
args
.
profile
,
selected_percentile_metrics
=
args
.
percentile_metrics
.
split
(
","
),
selected_percentiles
=
[
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)
],
ignore_eos
=
args
.
ignore_eos
,
goodput_config_dict
=
goodput_config_dict
,
max_concurrency
=
args
.
max_concurrency
,
))
    # Save config and results to json
    if args.save_result:
        result_json: Dict[str, Any] = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["num_prompts"] = args.num_prompts

        # Metadata
        if args.metadata:
            for item in args.metadata:
                if "=" in item:
                    kvstring = item.split("=")
                    result_json[kvstring[0].strip()] = kvstring[1].strip()
                else:
                    raise ValueError(
                        "Invalid metadata format. Please use KEY=VALUE format."
                    )

        # Traffic
        result_json["request_rate"] = (args.request_rate if args.request_rate
                                       < float("inf") else "inf")
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]
        max_concurrency_str = (f"-concurrency{args.max_concurrency}"
                               if args.max_concurrency is not None else "")
        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  #noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w", encoding='utf-8') as outfile:
            json.dump(result_json, outfile)
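
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): how the default
# result file name built in main() above is composed. All values here are
# placeholders.
def _example_default_result_filename() -> str:
    backend = "vllm"
    request_rate = float("inf")
    max_concurrency = 64
    model_id = "org/example-model-7b"      # hypothetical model id
    current_dt = "20250416-120000"
    base_model_id = model_id.split("/")[-1]
    max_concurrency_str = (f"-concurrency{max_concurrency}"
                           if max_concurrency is not None else "")
    # -> "vllm-infqps-concurrency64-example-model-7b-20250416-120000.json"
    return (f"{backend}-{request_rate}qps{max_concurrency_str}"
            f"-{base_model_id}-{current_dt}.json")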
if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default=None,
        help="Server or API base url if not using http host and port.",
    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--endpoint",
        type=str,
        default="/v1/completions",
        help="API endpoint.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        help="Path to the ShareGPT dataset, will be deprecated in the "
        "next release.",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        default="sharegpt",
        choices=["sharegpt", "sonnet", "random", "hf"],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument(
        "--dataset-path",
        type=str,
        default=None,
        help="Path to the sharegpt/sonnet dataset. "
        "Or the huggingface dataset ID if using HF dataset.")
    parser.add_argument(
        "--max-concurrency",
        type=int,
        default=None,
        help="Maximum number of concurrent requests. This can be used "
        "to help simulate an environment where a higher level component "
        "is enforcing a maximum number of concurrent requests. While the "
        "--request-rate argument controls the rate at which requests are "
        "initiated, this argument will control how many are actually allowed "
        "to execute at a time. This means that when used in combination, the "
        "actual request rate may be lower than specified with --request-rate, "
        "if the server is not processing requests fast enough to keep up.")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Name of the model.",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--best-of",
        type=int,
        default=1,
        help="Generates `best_of` sequences per prompt and "
        "returns the best one.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--logprobs",
        type=int,
        default=None,
        help=("Number of logprobs-per-token to compute & return as part of "
              "the request. If unspecified, then either (1) if beam search "
              "is disabled, no logprobs are computed & a single dummy "
              "logprob is returned for each token; or (2) if beam search "
              "is enabled, 1 logprob per token is computed"),
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use Poisson process or gamma distribution "
        "to synthesize the request arrival times.",
    )
    parser.add_argument(
        "--burstiness",
        type=float,
        default=1.0,
        help="Burstiness factor of the request generation. "
        "Only takes effect when request_rate is not inf. "
        "Default value is 1, which follows Poisson process. "
        "Otherwise, the request intervals follow a gamma distribution. "
        "A lower burstiness value (0 < burstiness < 1) results in more "
        "bursty requests. A higher burstiness value (burstiness > 1) "
        "results in a more uniform arrival of requests.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Use Torch Profiler. The endpoint must be launched with "
        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
    )
    parser.add_argument(
        "--save-result",
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
    parser.add_argument(
        "--metadata",
        metavar="KEY=VALUE",
        nargs="*",
        help="Key-value pairs (e.g., --metadata version=0.3.3 tp=1) "
        "for metadata of this run to be saved in the result JSON file "
        "for record keeping purposes.",
    )
    parser.add_argument(
        "--result-dir",
        type=str,
        default=None,
        help="Specify directory to save benchmark json results. "
        "If not specified, results are saved in the current directory.",
    )
    parser.add_argument(
        "--result-filename",
        type=str,
        default=None,
        help="Specify the filename to save benchmark json results. "
        "If not specified, results will be saved in "
        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
        " format.",
    )
    parser.add_argument(
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request. "
        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
    parser.add_argument(
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
        help="Comma-separated list of selected metrics to report "
        "percentiles for. "
        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
        "Default value is \"ttft,tpot,itl\".")
    parser.add_argument(
        "--metric-percentiles",
        type=str,
        default="99",
        help="Comma-separated list of percentiles for selected metrics. "
        "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
        "Default value is \"99\". "
        "Use \"--percentile-metrics\" to select metrics.",
    )
    parser.add_argument(
        "--goodput",
        nargs="+",
        required=False,
        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
        "pairs, where the key is a metric name, and the value is in "
        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
        "separated by spaces. Allowed request level metric names are "
        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
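    # Illustrative usage of the --goodput flag above (added comment, not part
    # of the original script; the numbers are placeholders):
    #     --goodput ttft:300 tpot:50 e2el:5000
    # Each pair is "metric:milliseconds"; a request counts toward goodput
    # only if it meets the listed SLOs.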
    # group for dataset specific arguments
    sonnet_group = parser.add_argument_group("sonnet dataset options")
    sonnet_group.add_argument(
        "--sonnet-input-len",
        type=int,
        default=550,
        help="Number of input tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-output-len",
        type=int,
        default=150,
        help="Number of output tokens per request, used only for sonnet dataset.",
    )
    sonnet_group.add_argument(
        "--sonnet-prefix-len",
        type=int,
        default=200,
        help="Number of prefix tokens per request, used only for sonnet dataset.",
    )

    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
    sharegpt_group.add_argument(
        "--sharegpt-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output length "
        "from the ShareGPT dataset.")

    random_group = parser.add_argument_group("random dataset options")
    random_group.add_argument(
        "--random-input-len",
        type=int,
        default=1024,
        help="Number of input tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-output-len",
        type=int,
        default=128,
        help="Number of output tokens per request, used only for random sampling.",
    )
    random_group.add_argument(
        "--random-range-ratio",
        type=float,
        default=1.0,
        help="Range of sampled ratio of input/output length, "
        "used only for random sampling.",
    )
    random_group.add_argument(
        "--random-prefix-len",
        type=int,
        default=0,
        help="Number of fixed prefix tokens before random context. "
        "The length range of context in a random request is "
        "[random-prefix-len, "
        "random-prefix-len + random-prefix-len * random-range-ratio).")

    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument("--hf-subset",
                          type=str,
                          default=None,
                          help="Subset of the HF dataset.")
    hf_group.add_argument("--hf-split",
                          type=str,
                          default=None,
                          help="Split of the HF dataset.")
    hf_group.add_argument(
        "--hf-output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the output lengths "
        "from the sampled HF dataset.",
    )

    parser.add_argument(
        '--tokenizer-mode',
        type=str,
        default="auto",
        choices=['auto', 'slow', 'mistral'],
        help='The tokenizer mode.\n\n* "auto" will use the '
        'fast tokenizer if available.\n* "slow" will '
        'always use the slow tokenizer.\n* '
        '"mistral" will always use the `mistral_common` tokenizer.')

    parser.add_argument("--served-model-name",
                        type=str,
                        default=None,
                        help="The model name used in the API. "
                        "If not specified, the model name will be the "
                        "same as the ``--model`` argument. ")

    args = parser.parse_args()
    main(args)
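
# Illustrative invocation (added comment, not part of the original script;
# the model name, host, and port are placeholders):
#
#   python 072/benchmark_servein_0.7.2.py \
#       --backend vllm --host localhost --port 8000 \
#       --model org/example-model-7b \
#       --dataset-name random --random-input-len 1024 --random-output-len 128 \
#       --num-prompts 200 --request-rate 4 --burstiness 1.0 \
#       --save-result --metadata version=0.7.2 tp=1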
072/benchmark_throughput_0.7.2.py 0 → 100644
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
import random
import time
from functools import cache
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import uvloop
from PIL import Image
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

from vllm.inputs import PromptType
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.utils import FlexibleArgumentParser, merge_async_iterators


@dataclasses.dataclass
class SampleRequest:
    """A class representing a single inference request for benchmarking.

    Attributes:
        prompt: The input text prompt for the model.
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
            images).
        lora_request: Optional LoRARequest specifying the LoRA to use.
    """
    prompt: str
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[MultiModalDataDict] = None
    lora_request: Optional[LoRARequest] = None


def _get_prompt_for_image_model(question: str, *, model: str) -> str:
    """Prepend and append special tokens around the question to form a prompt.

    Args:
        question: The input question text to wrap with special tokens
        model: The name of the model being used, to determine which special
            tokens to add

    Returns:
        The formatted prompt string with appropriate special tokens for the
            model

    Raises:
        ValueError: If an unsupported model name is provided
    """
    model = model.lower()
    if "pixtral" in model:
        return f"<s>[INST]{question}\n[IMG][/INST]"
    raise ValueError(f"Unsupported model {model}")


@cache
def lora_path_on_disk(lora_path: str) -> str:
    return get_adapter_absolute_path(lora_path)


lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}


def get_random_lora_request(
        args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
    global lora_tokenizer_cache
    lora_id = random.randint(1, args.max_loras)
    lora_request = LoRARequest(lora_name=str(lora_id),
                               lora_int_id=lora_id,
                               lora_path=lora_path_on_disk(args.lora_path))
    if lora_id not in lora_tokenizer_cache:
        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
    return lora_request, lora_tokenizer_cache[lora_id]
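
# Illustrative usage of get_random_lora_request() above (added comment, not
# part of the original script; the adapter path is a placeholder):
#
#   ns = argparse.Namespace(max_loras=4, lora_path="org/example-lora-adapter")
#   lora_request, lora_tokenizer = get_random_lora_request(ns)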
def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> List[SampleRequest]:
    dataset_path: str = args.dataset
    num_requests: int = args.num_prompts
    fixed_output_len: Optional[int] = args.output_len
    model: str = args.model
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short
    filtered_dataset: List[SampleRequest] = []
    for data in tqdm(dataset,
                     total=len(filtered_dataset),
                     desc="sampling requests"):
        if len(filtered_dataset) == num_requests:
            break

        # Only keep the first two turns of each conversation.
        prompt = data["conversations"][0]["value"]
        completion = data["conversations"][1]["value"]

        multi_modal_data: Optional[MultiModalDataDict] = None
        if "image" in data:
            multi_modal_data = multi_modal_data or {}
            image_path = data["image"]
            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
            assert isinstance(image_path,
                              str), "Only support single image input"
            try:
                multi_modal_data["image"] = Image.open(image_path).convert(
                    "RGB")
            except FileNotFoundError:
                # Ignore datapoint where asset is missing
                continue
            prompt = _get_prompt_for_image_model(question=prompt, model=model)

        request_tokenizer = tokenizer
        lora_request: Optional[LoRARequest] = None
        if args.enable_lora:
            lora_request, lora_tokenizer = get_random_lora_request(args)
            if lora_tokenizer:
                request_tokenizer = lora_tokenizer

        # Tokenize the prompts and completions.
        prompt_token_ids = request_tokenizer(prompt).input_ids
        completion_token_ids = request_tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append(
            SampleRequest(prompt=prompt,
                          prompt_len=prompt_len,
                          expected_output_len=output_len,
                          multi_modal_data=multi_modal_data,
                          lora_request=lora_request))

    return filtered_dataset
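
# Illustrative sketch (not part of the original script): the minimal shape of
# one dataset entry accepted by sample_requests() above. The "from" labels are
# placeholders; only conversations[i]["value"] and the optional "image" path
# are read.
_EXAMPLE_DATASET_ENTRY = {
    "conversations": [
        {"from": "human", "value": "Summarize the plot of Hamlet."},
        {"from": "gpt", "value": "Hamlet is a tragedy about ..."},
    ],
    # "image": "/path/to/image.jpg",  # optional, single image per entry
}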
def run_vllm(
    requests_json: List[SampleRequest],
    n: int,
    num_iters_warmup: int,
    engine_args: EngineArgs,
) -> dict:
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))

    # warmup
    warmup_sampling_params = SamplingParams(
        n=args.n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=10,
    )
    dummy_prompt_token_ids = np.random.randint(10000, size=(1, 10))
    dummy_prompts: List[PromptType] = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]
    print("Warming up...")
    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
        llm.generate(dummy_prompts,
                     sampling_params=warmup_sampling_params,
                     use_tqdm=False)

    info_json = {}
    for ELEprompt in args.num_prompts:
        for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
            info = {}
            requests = requests_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                                       ELEoutput)]
            # Add the requests to the engine.
            prompts: List[TextPrompt] = []
            sampling_params: List[SamplingParams] = []
            for request in requests:
                prompts.append(
                    TextPrompt(prompt=request.prompt,
                               multi_modal_data=request.multi_modal_data))
                sampling_params.append(
                    SamplingParams(
                        n=n,
                        temperature=1.0,
                        top_p=1.0,
                        ignore_eos=True,
                        max_tokens=request.expected_output_len,
                    ))
            lora_requests: Optional[List[LoRARequest]] = None
            if engine_args.enable_lora:
                lora_requests = [request.lora_request for request in requests]

            use_beam_search = False

            if not use_beam_search:
                start = time.perf_counter()
                real_output = llm.generate(prompts,
                                           sampling_params,
                                           lora_request=lora_requests,
                                           use_tqdm=True)
                end = time.perf_counter()
            else:
                assert lora_requests is None, \
                    "BeamSearch API does not support LoRA"
                prompts = [request.prompt for request in requests]
                # output_len should be the same for all requests.
                output_len = requests[0][2]
                for request in requests:
                    assert request.expected_output_len == output_len
                start = time.perf_counter()
                real_output = llm.beam_search(
                    prompts,
                    BeamSearchParams(
                        beam_width=n,
                        max_tokens=output_len,
                        ignore_eos=True,
                    ))
                end = time.perf_counter()

            total_ttfts = []
            total_tpops = []
            total_output_token_throughput = []
            total_inout_token_throughput = []
            for output in real_output:
                ttft_ = (output.metrics.first_token_time -
                         output.metrics.arrival_time)
                tpop_ = (output.metrics.finished_time -
                         output.metrics.arrival_time - ttft_) / (ELEoutput - 1)
                output_token_throughput = (ELEoutput) / (
                    output.metrics.finished_time - output.metrics.arrival_time)
                inout_token_throughput = (ELEoutput + ELEinput) / (
                    output.metrics.finished_time - output.metrics.arrival_time)
                total_ttfts.append(ttft_)
                total_tpops.append(tpop_)
                total_output_token_throughput.append(output_token_throughput)
                total_inout_token_throughput.append(inout_token_throughput)

            total_num_tokens = sum(request.prompt_len +
                                   request.expected_output_len
                                   for request in requests)
            total_output_tokens = sum(request.expected_output_len
                                      for request in requests)

            # ttft_mean = np.mean(total_ttfts)
            # ttft_median = np.median(total_ttfts or 0)
            # ttft_p99 = np.percentile(total_ttfts or 0, 99)
            # tpop_mean = np.mean(total_tpops)
            # tpop_median = np.median(total_tpops or 0)
            # tpop_p99 = np.percentile(total_tpops or 0, 99)
            # output_token_throughput_mean = np.mean(total_output_token_throughput)
            # output_token_throughput_median = np.median(total_output_token_throughput or 0)
            # output_token_throughput_p99 = np.percentile(total_output_token_throughput or 0, 99)
            # inout_token_throughput_mean = np.mean(total_inout_token_throughput)
            # inout_token_throughput_median = np.median(total_inout_token_throughput or 0)
            # inout_token_throughput_p99 = np.percentile(total_inout_token_throughput or 0, 99)

            info["elapsed_time"] = np.around(end - start, 2)
            info["Throughput"] = np.around(
                len(requests) / info['elapsed_time'], 2)
            info["total_tokens"] = np.around(
                total_num_tokens / info['elapsed_time'], 2)
            info["output_tokens"] = np.around(
                total_output_tokens / info['elapsed_time'], 2)
            info["ttft_mean"] = np.around(np.mean(total_ttfts), 5)
            info["ttft_median"] = np.around(np.median(total_ttfts or 0), 5)
            info["ttft_p99"] = np.around(np.percentile(total_ttfts or 0, 99), 5)
            info["tpop_mean"] = np.around(np.mean(total_tpops), 4)
            info["tpop_median"] = np.around(np.median(total_tpops or 0), 5)
            info["tpop_p99"] = np.around(np.percentile(total_tpops or 0, 99), 5)
            info["output_token_throughput_mean"] = np.around(
                np.mean(total_output_token_throughput), 2)
            info["output_token_throughput_median"] = np.around(
                np.median(total_output_token_throughput or 0), 2)
            info["output_token_throughput_p99"] = np.around(
                np.percentile(total_output_token_throughput or 0, 99), 2)
            info["inout_token_throughput_mean"] = np.around(
                np.mean(total_inout_token_throughput), 2)
            info["inout_token_throughput_median"] = np.around(
                np.median(total_inout_token_throughput or 0), 2)
            info["inout_token_throughput_p99"] = np.around(
                np.percentile(total_inout_token_throughput or 0, 99), 2)

            info_json["{}_{}_{}".format(ELEprompt, ELEinput, ELEoutput)] = info

            print("prompt:{},input:{},output:{}".format(
                ELEprompt, ELEinput, ELEoutput))
            print(f"Latency: {info['elapsed_time']:.2f} s")
            print(f"Throughput: "
                  f"{len(requests) / info['elapsed_time']:.2f} requests/s, "
                  f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
                  f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
            print("==============================================")
            print(f"total_out_tokens: {total_output_tokens:.2f} tokens")
            print(f"elapsed_time: {info['elapsed_time']:.2f} s")
            # total elapsed time
            print(f"TTFT_mean: {info['ttft_mean']:.5f} s")
            # time-to-first-token latency
            print(f"ttft_p99: {info['ttft_p99']:.5f} s")
            print(f"ttft_median: {info['ttft_median']:.5f} s")
            print(f"TPOP_mean: {info['tpop_mean']:.5f} s")
            # per-token decode time
            print(f"tpop_median: {info['tpop_median']:.5f} s")
            print(f"tpop_p99: {info['tpop_p99']:.5f} s")
            print(f"output_token_throughput_mean: "
                  f"{info['output_token_throughput_mean']:.2f} tokens/s")
            # per-request generation throughput
            print(f"output_token_throughput_median: "
                  f"{info['output_token_throughput_median']:.2f} tokens/s")
            print(f"output_token_throughput_p99: "
                  f"{info['output_token_throughput_p99']:.2f} tokens/s")
            print(f"inout_token_throughput_mean: "
                  f"{info['inout_token_throughput_mean']:.2f} tokens/s")
            # per-request total (input + output) throughput
            print(f"inout_token_throughput_median: "
                  f"{info['inout_token_throughput_median']:.2f} tokens/s")
            print(f"inout_token_throughput_p99: "
                  f"{info['inout_token_throughput_p99']:.2f} tokens/s")
            print("==============================================")
            print("\n")
    return info_json
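
# Illustrative sketch (not part of the original script): the shape of one
# entry in the info_json dict returned by run_vllm() above. Keys follow the
# "<num_prompts>_<input_len>_<output_len>" pattern; all numbers below are
# placeholders.
_EXAMPLE_INFO_ENTRY = {
    "16_1024_128": {
        "elapsed_time": 12.34,            # seconds for the whole batch
        "Throughput": 1.30,               # requests / s
        "total_tokens": 1494.0,           # (input + output) tokens / s
        "output_tokens": 166.0,           # output tokens / s
        "ttft_mean": 0.215, "ttft_median": 0.2, "ttft_p99": 0.4,
        "tpop_mean": 0.031, "tpop_median": 0.03, "tpop_p99": 0.05,
        "output_token_throughput_mean": 35.2,
        "output_token_throughput_median": 35.0,
        "output_token_throughput_p99": 40.1,
        "inout_token_throughput_mean": 310.0,
        "inout_token_throughput_median": 308.0,
        "inout_token_throughput_p99": 330.0,
    },
}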
async def run_vllm_async(
    requests: List[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
) -> float:
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:

        # Add the requests to the engine.
        prompts: List[TextPrompt] = []
        sampling_params: List[SamplingParams] = []
        lora_requests: List[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TextPrompt(prompt=request.prompt,
                           multi_modal_data=request.multi_modal_data))
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                ))
            lora_requests.append(request.lora_request)

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp,
                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
            generator = llm.generate(prompt,
                                     sp,
                                     lora_request=lr,
                                     request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start
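
# Note (added comment, not in the original script): unlike run_vllm() above,
# the async path only measures total wall-clock time; it does not collect the
# per-request TTFT / TPOP / throughput statistics.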
def run_hf(
    requests: List[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: List[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt, prompt_len, output_len = requests[i]
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            _, next_prompt_len, next_output_len = requests[i + 1]
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        # Include the decoding time.
        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start


def run_mii(
    requests: List[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
) -> float:
    from mii import client, serve
    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [request.prompt for request in requests]

    start = time.perf_counter()
    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
    client = client(model)
    client.terminate_server()
    return end - start
def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        vocab_size = tokenizer.vocab_size
        requests_json = {}
        for ELEprompt in args.num_prompts:
            for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
                requests = []
                for _ in range(ELEprompt):
                    request_tokenizer = tokenizer
                    lora_request: Optional[LoRARequest] = None
                    if args.enable_lora:
                        lora_request, lora_tokenizer = get_random_lora_request(
                            args)
                        if lora_tokenizer:
                            request_tokenizer = lora_tokenizer

                    # Synthesize a prompt with the given input length.
                    candidate_ids = [
                        random.randint(0, vocab_size - 1)
                        for _ in range(ELEinput)
                    ]
                    # As tokenizer may add additional tokens like BOS, we need
                    # to try different lengths to get the desired input length.
                    for _ in range(5):  # Max attempts to correct
                        candidate_prompt = request_tokenizer.decode(
                            candidate_ids)
                        tokenized_len = len(
                            request_tokenizer.encode(candidate_prompt))

                        if tokenized_len == ELEinput:
                            break

                        # Adjust length based on difference
                        diff = ELEinput - tokenized_len
                        if diff > 0:
                            candidate_ids.extend([
                                random.randint(100, vocab_size - 100)
                                for _ in range(diff)
                            ])
                        else:
                            candidate_ids = candidate_ids[:diff]
                    requests.append(
                        SampleRequest(prompt=candidate_prompt,
                                      prompt_len=ELEinput,
                                      expected_output_len=ELEoutput,
                                      lora_request=lora_request))
                requests_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                                ELEoutput)] = requests
    else:
        requests = sample_requests(tokenizer, args)

    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                ))
        else:
            info_json = run_vllm(requests_json, args.n, args.num_iters_warmup,
                                 EngineArgs.from_cli_args(args))
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.hf_max_batch_size, args.trust_remote_code)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    # file_name=args.model.rsplit("/")[-1]+"-tp"+str(args.tensor_parallel_size)+".txt"
    if is_multi_modal:
        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")

    with open(args.output_json, "w") as f:
        title = "bs_in_out"
        data_keys = info_json[list(info_json.keys())[0]].keys()
        keys_string = ','.join(data_keys)
        title = title + "," + keys_string
        f.write(title)
        f.write("\n")
        for key, value in info_json.items():
            values_as_strings = [
                str(value) for value in info_json[key].values()
            ]
            values_string = ','.join(values_as_strings)
            key = key + "," + values_string
            f.writelines(key)
            f.write("\n")
        # json.dump(info_json, f, indent=4)

    # Output JSON results if specified
    # if args.output_json:
    #     results = {
    #         "elapsed_time": elapsed_time,
    #         "num_requests": len(requests),
    #         "total_num_tokens": total_num_tokens,
    #         "requests_per_second": len(requests) / elapsed_time,
    #         "tokens_per_second": total_num_tokens / elapsed_time,
    #     }
    #     with open(args.output_json, "w") as f:
    #         json.dump(results, f, indent=4)
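
# Illustrative sketch (not part of the original script): despite the
# --output-json flag name, main() above writes CSV-style rows. With
# placeholder numbers the file looks roughly like:
#
#   bs_in_out,elapsed_time,Throughput,total_tokens,output_tokens,ttft_mean,...
#   16_1024_128,12.34,1.3,1494.0,166.0,0.215,...
#   16_2048_128,18.7,0.86,1862.0,109.5,0.401,...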
if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
                        default="vllm")
    parser.add_argument("--dataset",
                        type=str,
                        default=None,
                        help="Path to the dataset. The dataset is expected to "
                        "be a json in form of List[Dict[..., conversations: "
                        "List[Dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--input-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=1,
                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        nargs="*",
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the throughput results '
        '(written as CSV-style rows).')
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.")

    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    if args.dataset is None:
        assert args.input_len is not None
        assert args.output_len is not None
    else:
        assert args.input_len is None

    if args.enable_lora:
        assert args.lora_path is not None

    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    elif args.backend == "mii":
        if args.dtype != "auto":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
        if args.tokenizer != args.model:
            raise ValueError("Tokenizer must be the same as the model for MII "
                             "backend.")
        if args.enable_lora is not None:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    main(args)
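
# Illustrative invocation (not part of the original script; the model name is
# a placeholder). --num-prompts, --input-len and --output-len accept multiple
# values; every batch size is combined with each (input, output) pair:
#
#   python 072/benchmark_throughput_0.7.2.py \
#       --backend vllm --model org/example-model-7b \
#       --num-prompts 1 4 16 \
#       --input-len 1024 2048 --output-len 128 128 \
#       --num-iters-warmup 1 --output-json throughput_results.csv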