Unverified Commit 51ab3ccf authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

Collect more metrics: num_requests_total (#2859)

parent 67008f4b
...@@ -601,7 +601,7 @@ class TokenizerManager: ...@@ -601,7 +601,7 @@ class TokenizerManager:
while not self.gracefully_exit: while not self.gracefully_exit:
await asyncio.sleep(5) await asyncio.sleep(5)
# drain requests # Drain requests
while True: while True:
remain_num_req = len(self.rid_to_state) remain_num_req = len(self.rid_to_state)
logger.info( logger.info(
...@@ -679,45 +679,7 @@ class TokenizerManager: ...@@ -679,45 +679,7 @@ class TokenizerManager:
state.event.set() state.event.set()
if self.enable_metrics: if self.enable_metrics:
completion_tokens = ( self.collect_metrics(state, recv_obj, i)
recv_obj.completion_tokens[i]
if getattr(recv_obj, "completion_tokens", None)
else 0
)
if state.first_token_time is None:
state.first_token_time = time.time()
self.metrics_collector.observe_time_to_first_token(
state.first_token_time - state.created_time
)
else:
if completion_tokens >= 2:
# Compute time_per_output_token for the streaming case
self.metrics_collector.observe_time_per_output_token(
(time.time() - state.first_token_time)
/ (completion_tokens - 1)
)
if state.finished:
self.metrics_collector.inc_prompt_tokens(
recv_obj.prompt_tokens[i]
)
self.metrics_collector.inc_generation_tokens(
completion_tokens
)
self.metrics_collector.observe_e2e_request_latency(
time.time() - state.created_time
)
# Compute time_per_output_token for the non-streaming case
if (
hasattr(state.obj, "stream")
and not state.obj.stream
and completion_tokens >= 1
):
self.metrics_collector.observe_time_per_output_token(
(time.time() - state.created_time)
/ completion_tokens
)
elif isinstance(recv_obj, OpenSessionReqOutput): elif isinstance(recv_obj, OpenSessionReqOutput):
self.session_futures[recv_obj.session_id].set_result( self.session_futures[recv_obj.session_id].set_result(
recv_obj.session_id if recv_obj.success else None recv_obj.session_id if recv_obj.success else None
...@@ -820,6 +782,42 @@ class TokenizerManager: ...@@ -820,6 +782,42 @@ class TokenizerManager:
ret.append(None) ret.append(None)
return ret return ret
def collect_metrics(self, state: ReqState, recv_obj: BatchStrOut, i: int):
    """Record per-response Prometheus metrics for the i-th request in a batch.

    Observes time-to-first-token on the first chunk, time-per-output-token
    while streaming, and — once the request finishes — the finished-request
    counters plus end-to-end latency. Mutates ``state.first_token_time`` as a
    side effect on the first call for a request.
    """
    num_generated = (
        recv_obj.completion_tokens[i]
        if getattr(recv_obj, "completion_tokens", None)
        else 0
    )

    if state.first_token_time is None:
        # First chunk seen for this request: stamp it and record TTFT.
        state.first_token_time = time.time()
        self.metrics_collector.observe_time_to_first_token(
            state.first_token_time - state.created_time
        )
    elif num_generated >= 2:
        # Streaming case: amortize elapsed time over tokens after the first.
        self.metrics_collector.observe_time_per_output_token(
            (time.time() - state.first_token_time) / (num_generated - 1)
        )

    if not state.finished:
        return

    self.metrics_collector.observe_one_finished_request(
        recv_obj.prompt_tokens[i], num_generated
    )
    self.metrics_collector.observe_e2e_request_latency(
        time.time() - state.created_time
    )
    # Non-streaming case: the whole latency covers all generated tokens.
    non_streaming = hasattr(state.obj, "stream") and not state.obj.stream
    if non_streaming and num_generated >= 1:
        self.metrics_collector.observe_time_per_output_token(
            (time.time() - state.created_time) / num_generated
        )
class SignalHandler: class SignalHandler:
def __init__(self, tokenizer_manager): def __init__(self, tokenizer_manager):
......
...@@ -109,6 +109,12 @@ class TokenizerMetricsCollector: ...@@ -109,6 +109,12 @@ class TokenizerMetricsCollector:
labelnames=labels.keys(), labelnames=labels.keys(),
) )
self.num_requests_total = Counter(
name="sglang:num_requests_total",
documentation="Number of requests processed.",
labelnames=labels.keys(),
)
self.histogram_time_to_first_token = Histogram( self.histogram_time_to_first_token = Histogram(
name="sglang:time_to_first_token_seconds", name="sglang:time_to_first_token_seconds",
documentation="Histogram of time to first token in seconds.", documentation="Histogram of time to first token in seconds.",
...@@ -185,11 +191,10 @@ class TokenizerMetricsCollector: ...@@ -185,11 +191,10 @@ class TokenizerMetricsCollector:
# Convenience function for logging to counter. # Convenience function for logging to counter.
counter.labels(**self.labels).inc(data) counter.labels(**self.labels).inc(data)
def inc_prompt_tokens(self, value: int): def observe_one_finished_request(self, prompt_tokens: int, generation_tokens: int):
self._log_counter(self.prompt_tokens_total, value) self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
def inc_generation_tokens(self, value: int): self.num_requests_total.labels(**self.labels).inc(1)
self._log_counter(self.generation_tokens_total, value)
def observe_time_to_first_token(self, value: Union[float, int]): def observe_time_to_first_token(self, value: Union[float, int]):
self._log_histogram(self.histogram_time_to_first_token, value) self._log_histogram(self.histogram_time_to_first_token, value)
......
...@@ -59,6 +59,7 @@ class TestEnableMetrics(unittest.TestCase): ...@@ -59,6 +59,7 @@ class TestEnableMetrics(unittest.TestCase):
"sglang:func_latency_seconds", "sglang:func_latency_seconds",
"sglang:prompt_tokens_total", "sglang:prompt_tokens_total",
"sglang:generation_tokens_total", "sglang:generation_tokens_total",
"sglang:num_requests_total",
"sglang:time_to_first_token_seconds", "sglang:time_to_first_token_seconds",
"sglang:time_per_output_token_seconds", "sglang:time_per_output_token_seconds",
"sglang:e2e_request_latency_seconds", "sglang:e2e_request_latency_seconds",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment