[Bugfix] Fix `vllm bench serve` to count multimodal tokens in "total input tokens" (#38654)

Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>

[Bugfix] Fix `vllm bench serve` to count multimodal tokens in "total input tokens" (#38654)
Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
1e4748c6 · Matthias Gehre · GitHub · 6f786f2c · 1e4748c6 · 1e4748c6
Unverified commit 1e4748c6, authored Apr 14, 2026 by Matthias Gehre; committed by GitHub on Apr 14, 2026.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 5 additions and 1 deletion

vllm/benchmarks/lib/endpoint_request_func.py +4 -0

vllm/benchmarks/serve.py +1 -1

No files found.
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -237,6 +237,8 @@ async def async_request_openai_completions(
                                generated_text += text or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get("completion_tokens")
+                                if (pt := usage.get("prompt_tokens")) is not None:
+                                    output.prompt_len = pt
                if first_chunk_received:
                    output.success = True
                else:
@@ -358,6 +360,8 @@ async def async_request_openai_chat_completions(
                                generated_text += content or ""
                            elif usage := data.get("usage"):
                                output.output_tokens = usage.get("completion_tokens")
+                                if (pt := usage.get("prompt_tokens")) is not None:
+                                    output.prompt_len = pt
                            most_recent_timestamp = timestamp

--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -439,7 +439,7 @@ def calculate_metrics(
                        ).input_ids
                    )
            actual_output_lens.append(output_len)
-            total_input += input_requests[i].prompt_len
+            total_input += outputs[i].prompt_len
            tpot = 0
            if output_len > 1:
                latency_minus_ttft = outputs[i].latency - outputs[i].ttft