Commit 1e4748c6 (unverified), authored by Matthias Gehre, committed by GitHub
Browse files

[Bugfix] Fix `vllm bench serve` to count multimodal tokens in "total input tokens" (#38654)


Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
parent 6f786f2c
...@@ -237,6 +237,8 @@ async def async_request_openai_completions( ...@@ -237,6 +237,8 @@ async def async_request_openai_completions(
generated_text += text or "" generated_text += text or ""
elif usage := data.get("usage"): elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens") output.output_tokens = usage.get("completion_tokens")
if (pt := usage.get("prompt_tokens")) is not None:
output.prompt_len = pt
if first_chunk_received: if first_chunk_received:
output.success = True output.success = True
else: else:
...@@ -358,6 +360,8 @@ async def async_request_openai_chat_completions( ...@@ -358,6 +360,8 @@ async def async_request_openai_chat_completions(
generated_text += content or "" generated_text += content or ""
elif usage := data.get("usage"): elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens") output.output_tokens = usage.get("completion_tokens")
if (pt := usage.get("prompt_tokens")) is not None:
output.prompt_len = pt
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
......
...@@ -439,7 +439,7 @@ def calculate_metrics( ...@@ -439,7 +439,7 @@ def calculate_metrics(
).input_ids ).input_ids
) )
actual_output_lens.append(output_len) actual_output_lens.append(output_len)
total_input += input_requests[i].prompt_len total_input += outputs[i].prompt_len
tpot = 0 tpot = 0
if output_len > 1: if output_len > 1:
latency_minus_ttft = outputs[i].latency - outputs[i].ttft latency_minus_ttft = outputs[i].latency - outputs[i].ttft
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment