Commit 1e4748c6 (unverified), authored by Matthias Gehre, committed by GitHub
Browse files

[Bugfix] Fix `vllm bench serve` to count multimodal tokens in "total input tokens" (#38654)


Signed-off-by: Matthias Gehre <matthias.gehre@amd.com>
parent 6f786f2c
......@@ -237,6 +237,8 @@ async def async_request_openai_completions(
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
+if (pt := usage.get("prompt_tokens")) is not None:
+output.prompt_len = pt
if first_chunk_received:
output.success = True
else:
......@@ -358,6 +360,8 @@ async def async_request_openai_chat_completions(
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
+if (pt := usage.get("prompt_tokens")) is not None:
+output.prompt_len = pt
most_recent_timestamp = timestamp
......
......@@ -439,7 +439,7 @@ def calculate_metrics(
).input_ids
)
actual_output_lens.append(output_len)
-total_input += input_requests[i].prompt_len
+total_input += outputs[i].prompt_len
tpot = 0
if output_len > 1:
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment