Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
49a986ec
Unverified
Commit
49a986ec
authored
Nov 18, 2025
by
Ido Segev
Committed by
GitHub
Nov 18, 2025
Browse files
[Benchmark] multi_turn: Report warmup-inclusive runtime (#28937)
Signed-off-by:
Ido Segev
<
idos@pliops.com
>
parent
f6aa1226
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
53 additions
and
10 deletions
+53
-10
benchmarks/multi_turn/README.md
benchmarks/multi_turn/README.md
+4
-0
benchmarks/multi_turn/benchmark_serving_multi_turn.py
benchmarks/multi_turn/benchmark_serving_multi_turn.py
+49
-10
No files found.
benchmarks/multi_turn/README.md
View file @
49a986ec
...
@@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
...
@@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
```
```
If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
benchmark-only runtime so the reported throughput stays comparable).
### JSON configuration file for synthetic conversations generation
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
...
...
benchmarks/multi_turn/benchmark_serving_multi_turn.py
View file @
49a986ec
...
@@ -1076,6 +1076,7 @@ def process_statistics(
...
@@ -1076,6 +1076,7 @@ def process_statistics(
verbose
:
bool
,
verbose
:
bool
,
gen_conv_args
:
GenConvArgs
|
None
=
None
,
gen_conv_args
:
GenConvArgs
|
None
=
None
,
excel_output
:
bool
=
False
,
excel_output
:
bool
=
False
,
warmup_runtime_sec
:
float
|
None
=
None
,
)
->
None
:
)
->
None
:
if
len
(
client_metrics
)
==
0
:
if
len
(
client_metrics
)
==
0
:
logger
.
info
(
"No samples to process"
)
logger
.
info
(
"No samples to process"
)
...
@@ -1169,8 +1170,13 @@ def process_statistics(
...
@@ -1169,8 +1170,13 @@ def process_statistics(
# Convert milliseconds to seconds
# Convert milliseconds to seconds
runtime_sec
=
runtime_sec
/
1000.0
runtime_sec
=
runtime_sec
/
1000.0
requests_per_sec
=
float
(
len
(
df
))
/
runtime_sec
requests_per_sec
=
float
(
len
(
df
))
/
runtime_sec
params
=
{
params
=
{
"runtime_sec"
:
runtime_sec
,
"requests_per_sec"
:
requests_per_sec
}
"runtime_sec"
:
runtime_sec
,
"requests_per_sec"
:
requests_per_sec
,
}
if
warmup_runtime_sec
is
not
None
:
params
[
"warmup_runtime_sec"
]
=
warmup_runtime_sec
params
[
"total_runtime_incl_warmup_sec"
]
=
runtime_sec
+
warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data)
# Generate a summary of relevant metrics (and drop irrelevant data)
df
=
df
.
drop
(
columns
=
exclude
).
describe
(
percentiles
=
percentiles
).
transpose
()
df
=
df
.
drop
(
columns
=
exclude
).
describe
(
percentiles
=
percentiles
).
transpose
()
...
@@ -1552,6 +1558,8 @@ async def main() -> None:
...
@@ -1552,6 +1558,8 @@ async def main() -> None:
url
=
args
.
url
,
num_clients
=
args
.
num_clients
,
early_stop
=
not
args
.
no_early_stop
url
=
args
.
url
,
num_clients
=
args
.
num_clients
,
early_stop
=
not
args
.
no_early_stop
)
)
warmup_runtime_sec
:
float
|
None
=
None
# Warm-up step
# Warm-up step
if
args
.
warmup_step
:
if
args
.
warmup_step
:
# Only send a single user prompt from every conversation.
# Only send a single user prompt from every conversation.
...
@@ -1566,26 +1574,56 @@ async def main() -> None:
...
@@ -1566,26 +1574,56 @@ async def main() -> None:
# all clients should finish their work before exiting
# all clients should finish their work before exiting
warmup_bench_args
=
bench_args
.
_replace
(
early_stop
=
False
)
warmup_bench_args
=
bench_args
.
_replace
(
early_stop
=
False
)
logger
.
info
(
f
"
{
Color
.
PURPLE
}
Warmup start
{
Color
.
RESET
}
"
)
logger
.
info
(
"%sWarmup start%s"
,
Color
.
PURPLE
,
Color
.
RESET
)
warmup_start_ns
=
time
.
perf_counter_ns
()
conversations
,
_
=
await
main_mp
(
conversations
,
_
=
await
main_mp
(
warmup_client_args
,
req_args
,
warmup_bench_args
,
tokenizer
,
conversations
warmup_client_args
,
req_args
,
warmup_bench_args
,
tokenizer
,
conversations
)
)
logger
.
info
(
f
"
{
Color
.
PURPLE
}
Warmup done
{
Color
.
RESET
}
"
)
warmup_runtime_sec
=
nanosec_to_sec
(
time
.
perf_counter_ns
()
-
warmup_start_ns
)
logger
.
info
(
"%sWarmup runtime: %.3f sec (%.3f ms)%s"
,
Color
.
PURPLE
,
warmup_runtime_sec
,
warmup_runtime_sec
*
1000
,
Color
.
RESET
,
)
logger
.
info
(
"%sWarmup done%s"
,
Color
.
PURPLE
,
Color
.
RESET
)
# Run the benchmark
# Run the benchmark
start_
time
=
time
.
perf_counter_ns
()
benchmark_
start_
ns
=
time
.
perf_counter_ns
()
client_convs
,
client_metrics
=
await
main_mp
(
client_convs
,
client_metrics
=
await
main_mp
(
client_args
,
req_args
,
bench_args
,
tokenizer
,
conversations
client_args
,
req_args
,
bench_args
,
tokenizer
,
conversations
)
)
total
_runtime_
m
s
=
nanosec_to_
milli
sec
(
time
.
perf_counter_ns
()
-
start_
time
)
benchmark
_runtime_s
ec
=
nanosec_to_sec
(
time
.
perf_counter_ns
()
-
benchmark_
start_
ns
)
# Calculate requests per second
# Calculate requests per second
total_runtime_sec
=
total_runtime_ms
/
1000.0
requests_per_sec
=
len
(
client_metrics
)
/
benchmark_runtime_sec
rps
=
len
(
clie
nt
_
me
trics
)
/
total
_runtime_sec
benchmark_ru
nt
i
me
_ms
=
benchmark
_runtime_sec
*
1000.0
logger
.
info
(
logger
.
info
(
f
"
{
Color
.
GREEN
}
All clients finished, total runtime:
{
total_runtime_sec
:.
3
f
}
sec"
"%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
f
" (
{
total_runtime_ms
:.
3
f
}
ms), requests per second:
{
rps
:.
3
f
}{
Color
.
RESET
}
"
"requests per second: %.3f%s"
,
Color
.
GREEN
,
benchmark_runtime_sec
,
benchmark_runtime_ms
,
requests_per_sec
,
Color
.
RESET
,
)
)
if
warmup_runtime_sec
is
not
None
:
total_runtime_sec
=
benchmark_runtime_sec
+
warmup_runtime_sec
logger
.
info
(
"%sWarmup runtime: %.3f sec (%.3f ms)%s"
,
Color
.
GREEN
,
warmup_runtime_sec
,
warmup_runtime_sec
*
1000
,
Color
.
RESET
,
)
logger
.
info
(
"%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s"
,
Color
.
GREEN
,
total_runtime_sec
,
total_runtime_sec
*
1000
,
Color
.
RESET
,
)
# Benchmark parameters
# Benchmark parameters
params
=
{
params
=
{
...
@@ -1610,6 +1648,7 @@ async def main() -> None:
...
@@ -1610,6 +1648,7 @@ async def main() -> None:
verbose
=
args
.
verbose
,
verbose
=
args
.
verbose
,
gen_conv_args
=
gen_conv_args
,
gen_conv_args
=
gen_conv_args
,
excel_output
=
args
.
excel_output
,
excel_output
=
args
.
excel_output
,
warmup_runtime_sec
=
warmup_runtime_sec
,
)
)
if
args
.
output_file
is
not
None
:
if
args
.
output_file
is
not
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment