Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f09edd8a
Unverified
Commit
f09edd8a
authored
May 16, 2024
by
Simon Mo
Committed by
GitHub
May 16, 2024
Browse files
Add JSON output support for benchmark_latency and benchmark_throughput (#4848)
parent
6979ade3
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
39 additions
and
5 deletions
+39
-5
.buildkite/run-benchmarks.sh
.buildkite/run-benchmarks.sh
+4
-3
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+18
-2
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+17
-0
No files found.
.buildkite/run-benchmarks.sh
View file @
f09edd8a
...
...
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 |
tee
benchmark_latency.txt
python3 benchmarks/benchmark_latency.py
--output-json
latency_results.json
2>&1 |
tee
benchmark_latency.txt
bench_latency_exit_code
=
$?
python3 benchmarks/benchmark_throughput.py
--input-len
256
--output-len
256 2>&1 |
tee
benchmark_throughput.txt
python3 benchmarks/benchmark_throughput.py
--input-len
256
--output-len
256
--output-json
throughput_results.json
2>&1 |
tee
benchmark_throughput.txt
bench_throughput_exit_code
=
$?
# run server-based benchmarks and upload the result to buildkite
...
...
@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit
$bench_serving_exit_code
fi
/workspace/buildkite-agent artifact upload openai-
*
.json
rm
ShareGPT_V3_unfiltered_cleaned_split.json
/workspace/buildkite-agent artifact upload
"*.json"
benchmarks/benchmark_latency.py
View file @
f09edd8a
"""Benchmark the latency of processing a single batch of requests."""
import
argparse
import
json
import
time
from
pathlib
import
Path
from
typing
import
Optional
...
...
@@ -96,6 +97,16 @@ def main(args: argparse.Namespace):
for
percentage
,
percentile
in
zip
(
percentages
,
percentiles
):
print
(
f
'
{
percentage
}
% percentile latency:
{
percentile
}
seconds'
)
# Output JSON results if specified
if
args
.
output_json
:
results
=
{
"avg_latency"
:
np
.
mean
(
latencies
),
"latencies"
:
latencies
.
tolist
(),
"percentiles"
:
dict
(
zip
(
percentages
,
percentiles
.
tolist
())),
}
with
open
(
args
.
output_json
,
"w"
)
as
f
:
json
.
dump
(
results
,
f
,
indent
=
4
)
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
(
...
...
@@ -149,8 +160,8 @@ if __name__ == '__main__':
help
=
'Data type for kv cache storage. If "auto", will use model data type. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is
instead supported for
'
'common inference criteria.'
)
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'
instead supported for
common inference criteria.'
)
parser
.
add_argument
(
'--quantization-param-path'
,
type
=
str
,
...
...
@@ -197,5 +208,10 @@ if __name__ == '__main__':
default
=
None
,
help
=
'directory to download and load the weights, '
'default to the default cache dir of huggingface'
)
parser
.
add_argument
(
'--output-json'
,
type
=
str
,
default
=
None
,
help
=
'Path to save the latency results in JSON format.'
)
args
=
parser
.
parse_args
()
main
(
args
)
benchmarks/benchmark_throughput.py
View file @
f09edd8a
...
...
@@ -242,6 +242,18 @@ def main(args: argparse.Namespace):
print
(
f
"Throughput:
{
len
(
requests
)
/
elapsed_time
:.
2
f
}
requests/s, "
f
"
{
total_num_tokens
/
elapsed_time
:.
2
f
}
tokens/s"
)
# Output JSON results if specified
if
args
.
output_json
:
results
=
{
"elapsed_time"
:
elapsed_time
,
"num_requests"
:
len
(
requests
),
"total_num_tokens"
:
total_num_tokens
,
"requests_per_second"
:
len
(
requests
)
/
elapsed_time
,
"tokens_per_second"
:
total_num_tokens
/
elapsed_time
,
}
with
open
(
args
.
output_json
,
"w"
)
as
f
:
json
.
dump
(
results
,
f
,
indent
=
4
)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Benchmark the throughput."
)
...
...
@@ -353,6 +365,11 @@ if __name__ == "__main__":
default
=
None
,
help
=
'directory to download and load the weights, '
'default to the default cache dir of huggingface'
)
parser
.
add_argument
(
'--output-json'
,
type
=
str
,
default
=
None
,
help
=
'Path to save the throughput results in JSON format.'
)
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
args
.
tokenizer
=
args
.
model
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment