Unverified commit 339c69a2, authored by Lianmin Zheng and committed by GitHub
Browse files

Improve the computation for time_per_output_token Prometheus metrics (#2674)

parent f7074700
......@@ -699,6 +699,7 @@ class TokenizerManager:
)
else:
if completion_tokens >= 2:
# Compute time_per_output_token for the streaming case
self.metrics_collector.observe_time_per_output_token(
(time.time() - state.first_token_time)
/ (completion_tokens - 1)
......@@ -714,7 +715,8 @@ class TokenizerManager:
self.metrics_collector.observe_e2e_request_latency(
time.time() - state.created_time
)
if completion_tokens >= 1:
# Compute time_per_output_token for the non-streaming case
if not state.obj.stream and completion_tokens >= 1:
self.metrics_collector.observe_time_per_output_token(
(time.time() - state.created_time)
/ completion_tokens
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment