Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
013abde6
Unverified
Commit
013abde6
authored
Oct 16, 2025
by
kimbochen
Committed by
GitHub
Oct 16, 2025
Browse files
Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
parent
a5464dcf
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing 1 changed file with 36 additions and 1 deletion
+36
-1
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+36
-1
No files found.
vllm/benchmarks/serve.py
View file @
013abde6
...
@@ -478,6 +478,7 @@ async def benchmark(
...
@@ -478,6 +478,7 @@ async def benchmark(
request_rate
:
float
,
request_rate
:
float
,
burstiness
:
float
,
burstiness
:
float
,
disable_tqdm
:
bool
,
disable_tqdm
:
bool
,
num_warmups
:
int
,
profile
:
bool
,
profile
:
bool
,
selected_percentile_metrics
:
list
[
str
],
selected_percentile_metrics
:
list
[
str
],
selected_percentiles
:
list
[
float
],
selected_percentiles
:
list
[
float
],
...
@@ -559,10 +560,37 @@ async def benchmark(
...
@@ -559,10 +560,37 @@ async def benchmark(
f
"Error:
{
test_output
.
error
}
"
f
"Error:
{
test_output
.
error
}
"
)
)
else
:
else
:
print
(
"Initial test run completed.
Starting main benchmark run...
"
)
print
(
"Initial test run completed."
)
else
:
else
:
print
(
"Skipping endpoint ready check."
)
print
(
"Skipping endpoint ready check."
)
if
num_warmups
>
0
:
print
(
f
"Warming up with
{
num_warmups
}
requests..."
)
warmup_pbar
=
None
if
disable_tqdm
else
tqdm
(
total
=
num_warmups
)
warmup_semaphore
=
(
asyncio
.
Semaphore
(
max_concurrency
)
if
max_concurrency
else
contextlib
.
nullcontext
()
)
warmup_tasks
=
[]
async
def
warmup_limited_request_func
():
async
with
warmup_semaphore
:
return
await
request_func
(
request_func_input
=
test_input
,
session
=
session
,
pbar
=
warmup_pbar
)
for
_
in
range
(
num_warmups
):
request_task
=
asyncio
.
create_task
(
warmup_limited_request_func
())
warmup_tasks
.
append
(
request_task
)
_
=
await
asyncio
.
gather
(
*
warmup_tasks
)
if
warmup_pbar
is
not
None
:
warmup_pbar
.
close
()
print
(
"Warmup run completed."
)
print
(
"Starting main benchmark run..."
)
if
lora_modules
:
if
lora_modules
:
# For each input request, choose a LoRA module at random.
# For each input request, choose a LoRA module at random.
lora_modules
=
iter
(
lora_modules
=
iter
(
...
@@ -1029,6 +1057,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -1029,6 +1057,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
action
=
"store_true"
,
action
=
"store_true"
,
help
=
"Specify to disable tqdm progress bar."
,
help
=
"Specify to disable tqdm progress bar."
,
)
)
parser
.
add_argument
(
"--num-warmups"
,
type
=
int
,
default
=
0
,
help
=
"Number of warmup requests."
,
)
parser
.
add_argument
(
parser
.
add_argument
(
"--profile"
,
"--profile"
,
action
=
"store_true"
,
action
=
"store_true"
,
...
@@ -1370,6 +1404,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
...
@@ -1370,6 +1404,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
request_rate
=
args
.
request_rate
,
request_rate
=
args
.
request_rate
,
burstiness
=
args
.
burstiness
,
burstiness
=
args
.
burstiness
,
disable_tqdm
=
args
.
disable_tqdm
,
disable_tqdm
=
args
.
disable_tqdm
,
num_warmups
=
args
.
num_warmups
,
profile
=
args
.
profile
,
profile
=
args
.
profile
,
selected_percentile_metrics
=
percentile_metrics
.
split
(
","
),
selected_percentile_metrics
=
percentile_metrics
.
split
(
","
),
selected_percentiles
=
[
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)],
selected_percentiles
=
[
float
(
p
)
for
p
in
args
.
metric_percentiles
.
split
(
","
)],
...
...
Write | Preview
Markdown is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment.