vllm bench serve shows num of failed requests (#26478)

Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>

vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
965c5f49 · Tomas Ruiz · GitHub · 4d055ef4 · 965c5f49
Unverified commit 965c5f49, authored Oct 17, 2025 by Tomas Ruiz; committed by GitHub Oct 16, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 0 deletions

vllm/benchmarks/serve.py +10 -0

No files found.
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -64,6 +64,7 @@ class TaskType(Enum):
 @dataclass
 class BenchmarkMetrics:
    completed: int
+    failed: int
    total_input: int
    total_output: int
    request_throughput: float
@@ -97,6 +98,7 @@ class BenchmarkMetrics:
 @dataclass
 class EmbedBenchmarkMetrics:
    completed: int
+    failed: int
    total_input: int
    request_throughput: float
    total_token_throughput: float
@@ -239,12 +241,15 @@ def calculate_metrics_for_embeddings(
    """
    total_input = 0
    completed = 0
+    failed = 0
    e2els: list[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            e2els.append(outputs[i].latency)
            completed += 1
            total_input += outputs[i].prompt_len
+        else:
+            failed += 1

    if completed == 0:
        warnings.warn(
@@ -254,6 +259,7 @@ def calculate_metrics_for_embeddings(
        )
    metrics = EmbedBenchmarkMetrics(
        completed=completed,
+        failed=failed,
        total_input=total_input,
        request_throughput=completed / dur_s,
        total_token_throughput=total_input / dur_s,
@@ -366,6 +372,7 @@ def calculate_metrics(

    # Find the time range across all successful requests
    successful_outputs = [output for output in outputs if output.success]
+    failed_outputs = [output for output in outputs if not output.success]
    if successful_outputs:
        min_start_time = min(output.start_time for output in successful_outputs)
        max_end_time = max(
@@ -427,6 +434,7 @@ def calculate_metrics(

    metrics = BenchmarkMetrics(
        completed=completed,
+        failed=len(failed_outputs),
        total_input=total_input,
        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
@@ -734,6 +742,7 @@ async def benchmark(

    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10}".format("Failed requests:", metrics.failed))
    if max_concurrency is not None:
        print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
    if request_rate != float("inf"):
@@ -779,6 +788,7 @@ async def benchmark(
        result = {
            "duration": benchmark_duration,
            "completed": metrics.completed,
+            "failed": metrics.failed,
            "total_input_tokens": metrics.total_input,
            "total_output_tokens": metrics.total_output,
            "request_throughput": metrics.request_throughput,