Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f09edd8a
Unverified
Commit
f09edd8a
authored
May 16, 2024
by
Simon Mo
Committed by
GitHub
May 16, 2024
Browse files
Add JSON output support for benchmark_latency and benchmark_throughput (#4848)
parent
6979ade3
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
39 additions
and
5 deletions
+39
-5
.buildkite/run-benchmarks.sh
.buildkite/run-benchmarks.sh
+4
-3
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+18
-2
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+17
-0
No files found.
.buildkite/run-benchmarks.sh
View file @
f09edd8a
...
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
...
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
# run python-based benchmarks and upload the result to buildkite
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 |
tee
benchmark_latency.txt
python3 benchmarks/benchmark_latency.py
--output-json
latency_results.json
2>&1 |
tee
benchmark_latency.txt
bench_latency_exit_code
=
$?
bench_latency_exit_code
=
$?
python3 benchmarks/benchmark_throughput.py
--input-len
256
--output-len
256 2>&1 |
tee
benchmark_throughput.txt
python3 benchmarks/benchmark_throughput.py
--input-len
256
--output-len
256
--output-json
throughput_results.json
2>&1 |
tee
benchmark_throughput.txt
bench_throughput_exit_code
=
$?
bench_throughput_exit_code
=
$?
# run server-based benchmarks and upload the result to buildkite
# run server-based benchmarks and upload the result to buildkite
...
@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
...
@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit
$bench_serving_exit_code
exit
$bench_serving_exit_code
fi
fi
/workspace/buildkite-agent artifact upload openai-
*
.json
rm
ShareGPT_V3_unfiltered_cleaned_split.json
/workspace/buildkite-agent artifact upload
"*.json"
benchmarks/benchmark_latency.py
View file @
f09edd8a
"""Benchmark the latency of processing a single batch of requests."""
"""Benchmark the latency of processing a single batch of requests."""
import
argparse
import
argparse
import
json
import
time
import
time
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Optional
from
typing
import
Optional
...
@@ -96,6 +97,16 @@ def main(args: argparse.Namespace):
...
@@ -96,6 +97,16 @@ def main(args: argparse.Namespace):
for
percentage
,
percentile
in
zip
(
percentages
,
percentiles
):
for
percentage
,
percentile
in
zip
(
percentages
,
percentiles
):
print
(
f
'
{
percentage
}
% percentile latency:
{
percentile
}
seconds'
)
print
(
f
'
{
percentage
}
% percentile latency:
{
percentile
}
seconds'
)
# Output JSON results if specified
if
args
.
output_json
:
results
=
{
"avg_latency"
:
np
.
mean
(
latencies
),
"latencies"
:
latencies
.
tolist
(),
"percentiles"
:
dict
(
zip
(
percentages
,
percentiles
.
tolist
())),
}
with
open
(
args
.
output_json
,
"w"
)
as
f
:
json
.
dump
(
results
,
f
,
indent
=
4
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
...
@@ -149,8 +160,8 @@ if __name__ == '__main__':
...
@@ -149,8 +160,8 @@ if __name__ == '__main__':
help
=
help
=
'Data type for kv cache storage. If "auto", will use model data type. '
'Data type for kv cache storage. If "auto", will use model data type. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is
instead supported for
'
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'common inference criteria.'
)
'
instead supported for
common inference criteria.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--quantization-param-path'
,
'--quantization-param-path'
,
type
=
str
,
type
=
str
,
...
@@ -197,5 +208,10 @@ if __name__ == '__main__':
...
@@ -197,5 +208,10 @@ if __name__ == '__main__':
default
=
None
,
default
=
None
,
help
=
'directory to download and load the weights, '
help
=
'directory to download and load the weights, '
'default to the default cache dir of huggingface'
)
'default to the default cache dir of huggingface'
)
parser
.
add_argument
(
'--output-json'
,
type
=
str
,
default
=
None
,
help
=
'Path to save the latency results in JSON format.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
benchmarks/benchmark_throughput.py
View file @
f09edd8a
...
@@ -242,6 +242,18 @@ def main(args: argparse.Namespace):
...
@@ -242,6 +242,18 @@ def main(args: argparse.Namespace):
print
(
f
"Throughput:
{
len
(
requests
)
/
elapsed_time
:.
2
f
}
requests/s, "
print
(
f
"Throughput:
{
len
(
requests
)
/
elapsed_time
:.
2
f
}
requests/s, "
f
"
{
total_num_tokens
/
elapsed_time
:.
2
f
}
tokens/s"
)
f
"
{
total_num_tokens
/
elapsed_time
:.
2
f
}
tokens/s"
)
# Output JSON results if specified
if
args
.
output_json
:
results
=
{
"elapsed_time"
:
elapsed_time
,
"num_requests"
:
len
(
requests
),
"total_num_tokens"
:
total_num_tokens
,
"requests_per_second"
:
len
(
requests
)
/
elapsed_time
,
"tokens_per_second"
:
total_num_tokens
/
elapsed_time
,
}
with
open
(
args
.
output_json
,
"w"
)
as
f
:
json
.
dump
(
results
,
f
,
indent
=
4
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Benchmark the throughput."
)
parser
=
argparse
.
ArgumentParser
(
description
=
"Benchmark the throughput."
)
...
@@ -353,6 +365,11 @@ if __name__ == "__main__":
...
@@ -353,6 +365,11 @@ if __name__ == "__main__":
default
=
None
,
default
=
None
,
help
=
'directory to download and load the weights, '
help
=
'directory to download and load the weights, '
'default to the default cache dir of huggingface'
)
'default to the default cache dir of huggingface'
)
parser
.
add_argument
(
'--output-json'
,
type
=
str
,
default
=
None
,
help
=
'Path to save the throughput results in JSON format.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
if
args
.
tokenizer
is
None
:
args
.
tokenizer
=
args
.
model
args
.
tokenizer
=
args
.
model
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment