Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
965c5f49
Unverified
Commit
965c5f49
authored
Oct 17, 2025
by
Tomas Ruiz
Committed by
GitHub
Oct 16, 2025
Browse files
vllm bench serve shows num of failed requests (#26478)
Signed-off-by:
Tomas Ruiz
<
tomas.ruiz.te@gmail.com
>
parent
4d055ef4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
0 deletions
+10
-0
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+10
-0
No files found.
vllm/benchmarks/serve.py
View file @
965c5f49
...
...
@@ -64,6 +64,7 @@ class TaskType(Enum):
@
dataclass
class
BenchmarkMetrics
:
completed
:
int
failed
:
int
total_input
:
int
total_output
:
int
request_throughput
:
float
...
...
@@ -97,6 +98,7 @@ class BenchmarkMetrics:
@
dataclass
class
EmbedBenchmarkMetrics
:
completed
:
int
failed
:
int
total_input
:
int
request_throughput
:
float
total_token_throughput
:
float
...
...
@@ -239,12 +241,15 @@ def calculate_metrics_for_embeddings(
"""
total_input
=
0
completed
=
0
failed
=
0
e2els
:
list
[
float
]
=
[]
for
i
in
range
(
len
(
outputs
)):
if
outputs
[
i
].
success
:
e2els
.
append
(
outputs
[
i
].
latency
)
completed
+=
1
total_input
+=
outputs
[
i
].
prompt_len
else
:
failed
+=
1
if
completed
==
0
:
warnings
.
warn
(
...
...
@@ -254,6 +259,7 @@ def calculate_metrics_for_embeddings(
)
metrics
=
EmbedBenchmarkMetrics
(
completed
=
completed
,
failed
=
failed
,
total_input
=
total_input
,
request_throughput
=
completed
/
dur_s
,
total_token_throughput
=
total_input
/
dur_s
,
...
...
@@ -366,6 +372,7 @@ def calculate_metrics(
# Find the time range across all successful requests
successful_outputs
=
[
output
for
output
in
outputs
if
output
.
success
]
failed_outputs
=
[
output
for
output
in
outputs
if
not
output
.
success
]
if
successful_outputs
:
min_start_time
=
min
(
output
.
start_time
for
output
in
successful_outputs
)
max_end_time
=
max
(
...
...
@@ -427,6 +434,7 @@ def calculate_metrics(
metrics
=
BenchmarkMetrics
(
completed
=
completed
,
failed
=
len
(
failed_outputs
),
total_input
=
total_input
,
total_output
=
sum
(
actual_output_lens
),
request_throughput
=
completed
/
dur_s
,
...
...
@@ -734,6 +742,7 @@ async def benchmark(
print
(
"{s:{c}^{n}}"
.
format
(
s
=
" Serving Benchmark Result "
,
n
=
50
,
c
=
"="
))
print
(
"{:<40} {:<10}"
.
format
(
"Successful requests:"
,
metrics
.
completed
))
print
(
"{:<40} {:<10}"
.
format
(
"Failed requests:"
,
metrics
.
failed
))
if
max_concurrency
is
not
None
:
print
(
"{:<40} {:<10}"
.
format
(
"Maximum request concurrency:"
,
max_concurrency
))
if
request_rate
!=
float
(
"inf"
):
...
...
@@ -779,6 +788,7 @@ async def benchmark(
result
=
{
"duration"
:
benchmark_duration
,
"completed"
:
metrics
.
completed
,
"failed"
:
metrics
.
failed
,
"total_input_tokens"
:
metrics
.
total_input
,
"total_output_tokens"
:
metrics
.
total_output
,
"request_throughput"
:
metrics
.
request_throughput
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment