Unverified commit 339c69a2, authored by Lianmin Zheng and committed by GitHub
Browse files

Improve the computation for time_per_output_token Prometheus metrics (#2674)

parent f7074700
...@@ -699,6 +699,7 @@ class TokenizerManager: ...@@ -699,6 +699,7 @@ class TokenizerManager:
) )
else: else:
if completion_tokens >= 2: if completion_tokens >= 2:
# Compute time_per_output_token for the streaming case
self.metrics_collector.observe_time_per_output_token( self.metrics_collector.observe_time_per_output_token(
(time.time() - state.first_token_time) (time.time() - state.first_token_time)
/ (completion_tokens - 1) / (completion_tokens - 1)
...@@ -714,7 +715,8 @@ class TokenizerManager: ...@@ -714,7 +715,8 @@ class TokenizerManager:
self.metrics_collector.observe_e2e_request_latency( self.metrics_collector.observe_e2e_request_latency(
time.time() - state.created_time time.time() - state.created_time
) )
if completion_tokens >= 1: # Compute time_per_output_token for the non-streaming case
if not state.obj.stream and completion_tokens >= 1:
self.metrics_collector.observe_time_per_output_token( self.metrics_collector.observe_time_per_output_token(
(time.time() - state.created_time) (time.time() - state.created_time)
/ completion_tokens / completion_tokens
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment