Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7dbe738d
Unverified
Commit
7dbe738d
authored
Oct 18, 2024
by
Russell Bryant
Committed by
GitHub
Oct 18, 2024
Browse files
[Misc] benchmark: Add option to set max concurrency (#9390)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
ae8b633b
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
3 deletions
+37
-3
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+37
-3
No files found.
benchmarks/benchmark_serving.py
View file @
7dbe738d
...
...
@@ -398,6 +398,7 @@ async def benchmark(
selected_percentile_metrics
:
List
[
str
],
selected_percentiles
:
List
[
str
],
ignore_eos
:
bool
,
max_concurrency
:
Optional
[
int
],
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
...
...
@@ -446,9 +447,25 @@ async def benchmark(
print
(
"Profiler started"
)
print
(
f
"Traffic request rate:
{
request_rate
}
"
)
print
(
f
"Maximum request concurrency:
{
max_concurrency
}
"
)
pbar
=
None
if
disable_tqdm
else
tqdm
(
total
=
len
(
input_requests
))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore
=
(
asyncio
.
Semaphore
(
max_concurrency
)
if
max_concurrency
else
None
)
async
def
limited_request_func
(
request_func_input
,
pbar
):
if
semaphore
is
None
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
async
with
semaphore
:
return
await
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
benchmark_start_time
=
time
.
perf_counter
()
tasks
:
List
[
asyncio
.
Task
]
=
[]
async
for
request
in
get_request
(
input_requests
,
request_rate
):
...
...
@@ -464,7 +481,7 @@ async def benchmark(
ignore_eos
=
ignore_eos
)
tasks
.
append
(
asyncio
.
create_task
(
request_func
(
request_func_input
=
request_func_input
,
limited_
request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)))
outputs
:
List
[
RequestFuncOutput
]
=
await
asyncio
.
gather
(
*
tasks
)
...
...
@@ -682,6 +699,7 @@ def main(args: argparse.Namespace):
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)
],
ignore_eos
=
args
.
ignore_eos
,
max_concurrency
=
args
.
max_concurrency
,
))
# Save config and results to json
...
...
@@ -711,13 +729,16 @@ def main(args: argparse.Namespace):
# Traffic
result_json
[
"request_rate"
]
=
(
args
.
request_rate
if
args
.
request_rate
<
float
(
"inf"
)
else
"inf"
)
result_json
[
"max_concurrency"
]
=
args
.
max_concurrency
# Merge with benchmark result
result_json
=
{
**
result_json
,
**
benchmark_result
}
# Save to file
base_model_id
=
model_id
.
split
(
"/"
)[
-
1
]
file_name
=
f
"
{
backend
}
-
{
args
.
request_rate
}
qps-
{
base_model_id
}
-
{
current_dt
}
.json"
#noqa
max_concurrency_str
=
(
f
"-concurrency
{
args
.
max_concurrency
}
"
if
args
.
max_concurrency
is
not
None
else
""
)
file_name
=
f
"
{
backend
}
-
{
args
.
request_rate
}
qps
{
max_concurrency_str
}
-
{
base_model_id
}
-
{
current_dt
}
.json"
#noqa
if
args
.
result_filename
:
file_name
=
args
.
result_filename
if
args
.
result_dir
:
...
...
@@ -768,6 +789,19 @@ if __name__ == "__main__":
default
=
None
,
help
=
"Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset."
)
parser
.
add_argument
(
"--max-concurrency"
,
type
=
int
,
default
=
None
,
help
=
"Maximum number of concurrent requests. This can be used "
"to help simulate an environment where a higher level component "
"is enforcing a maximum number of concurrent requests. While the "
"--request-rate argument controls the rate at which requests are "
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up."
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment