Unverified commit 68be0f85, authored by Csrayz, committed by GitHub
Browse files

[Metrics] Add request_id to FinishedRequestStats to enable correlation between...


[Metrics] Add request_id to FinishedRequestStats to enable correlation between metrics and requests (#39710)

Enables external `StatLogger` plugins to correlate per-request metrics
with request-level context. This is also a prerequisite for Prometheus
exemplars in #30972.
Signed-off-by: Csrayz <33659823+Csrayz@users.noreply.github.com>
parent 60995c05
...@@ -26,6 +26,7 @@ def test_prefill_kv_computed_with_cache(): ...@@ -26,6 +26,7 @@ def test_prefill_kv_computed_with_cache():
# Case 1: With prefix cache (1200 tokens cached) # Case 1: With prefix cache (1200 tokens cached)
iteration_stats.update_from_finished_request( iteration_stats.update_from_finished_request(
finish_reason=FinishReason.STOP, finish_reason=FinishReason.STOP,
request_id="test-req-001",
num_prompt_tokens=10000, num_prompt_tokens=10000,
max_tokens_param=100, max_tokens_param=100,
req_stats=req_stats, req_stats=req_stats,
...@@ -35,6 +36,7 @@ def test_prefill_kv_computed_with_cache(): ...@@ -35,6 +36,7 @@ def test_prefill_kv_computed_with_cache():
finished_req = iteration_stats.finished_requests[0] finished_req = iteration_stats.finished_requests[0]
assert finished_req.num_prompt_tokens == 10000 assert finished_req.num_prompt_tokens == 10000
assert finished_req.num_cached_tokens == 1200 assert finished_req.num_cached_tokens == 1200
assert finished_req.request_id == "test-req-001"
# Verify calculation: prefill KV = prompt tokens - cached tokens # Verify calculation: prefill KV = prompt tokens - cached tokens
prefill_kv_computed = finished_req.num_prompt_tokens - max( prefill_kv_computed = finished_req.num_prompt_tokens - max(
...@@ -55,6 +57,7 @@ def test_prefill_kv_computed_no_cache(): ...@@ -55,6 +57,7 @@ def test_prefill_kv_computed_no_cache():
# Case 2: No prefix cache # Case 2: No prefix cache
iteration_stats.update_from_finished_request( iteration_stats.update_from_finished_request(
finish_reason=FinishReason.STOP, finish_reason=FinishReason.STOP,
request_id="test-req-002",
num_prompt_tokens=2000, num_prompt_tokens=2000,
max_tokens_param=100, max_tokens_param=100,
req_stats=req_stats, req_stats=req_stats,
...@@ -64,6 +67,7 @@ def test_prefill_kv_computed_no_cache(): ...@@ -64,6 +67,7 @@ def test_prefill_kv_computed_no_cache():
finished_req = iteration_stats.finished_requests[0] finished_req = iteration_stats.finished_requests[0]
assert finished_req.num_prompt_tokens == 2000 assert finished_req.num_prompt_tokens == 2000
assert finished_req.num_cached_tokens == 0 assert finished_req.num_cached_tokens == 0
assert finished_req.request_id == "test-req-002"
# Verify calculation: prefill KV = full prompt when no cache # Verify calculation: prefill KV = full prompt when no cache
prefill_kv_computed = finished_req.num_prompt_tokens - max( prefill_kv_computed = finished_req.num_prompt_tokens - max(
...@@ -84,6 +88,7 @@ def test_prefill_kv_computed_edge_cases(): ...@@ -84,6 +88,7 @@ def test_prefill_kv_computed_edge_cases():
# Case 3: Negative num_cached_tokens (shouldn't happen, but handle gracefully) # Case 3: Negative num_cached_tokens (shouldn't happen, but handle gracefully)
iteration_stats.update_from_finished_request( iteration_stats.update_from_finished_request(
finish_reason=FinishReason.STOP, finish_reason=FinishReason.STOP,
request_id="test-req-003",
num_prompt_tokens=100, num_prompt_tokens=100,
max_tokens_param=10, max_tokens_param=10,
req_stats=req_stats, req_stats=req_stats,
...@@ -96,11 +101,13 @@ def test_prefill_kv_computed_edge_cases(): ...@@ -96,11 +101,13 @@ def test_prefill_kv_computed_edge_cases():
finished_req.num_cached_tokens, 0 finished_req.num_cached_tokens, 0
) )
assert prefill_kv_computed == 100 # Should treat negative as 0 assert prefill_kv_computed == 100 # Should treat negative as 0
assert finished_req.request_id == "test-req-003"
# Case 4: All tokens cached (shouldn't happen in practice) # Case 4: All tokens cached (shouldn't happen in practice)
iteration_stats2 = IterationStats() iteration_stats2 = IterationStats()
iteration_stats2.update_from_finished_request( iteration_stats2.update_from_finished_request(
finish_reason=FinishReason.STOP, finish_reason=FinishReason.STOP,
request_id="test-req-004",
num_prompt_tokens=100, num_prompt_tokens=100,
max_tokens_param=10, max_tokens_param=10,
req_stats=req_stats, req_stats=req_stats,
...@@ -112,6 +119,7 @@ def test_prefill_kv_computed_edge_cases(): ...@@ -112,6 +119,7 @@ def test_prefill_kv_computed_edge_cases():
finished_req2.num_cached_tokens, 0 finished_req2.num_cached_tokens, 0
) )
assert prefill_kv_computed2 == 0 # All cached, nothing computed assert prefill_kv_computed2 == 0 # All cached, nothing computed
assert finished_req2.request_id == "test-req-004"
def test_prompt_token_stats_all_computed(): def test_prompt_token_stats_all_computed():
......
...@@ -799,6 +799,7 @@ class OutputProcessor: ...@@ -799,6 +799,7 @@ class OutputProcessor:
assert req_state.stats is not None assert req_state.stats is not None
iteration_stats.update_from_finished_request( iteration_stats.update_from_finished_request(
finish_reason=finish_reason, finish_reason=finish_reason,
request_id=req_state.external_req_id,
num_prompt_tokens=req_state.prompt_len, num_prompt_tokens=req_state.prompt_len,
max_tokens_param=req_state.max_tokens_param, max_tokens_param=req_state.max_tokens_param,
req_stats=req_state.stats, req_stats=req_state.stats,
......
...@@ -225,6 +225,7 @@ class FinishedRequestStats: ...@@ -225,6 +225,7 @@ class FinishedRequestStats:
"""Stats associated with a finished request.""" """Stats associated with a finished request."""
finish_reason: "FinishReason" finish_reason: "FinishReason"
request_id: str | None = None
e2e_latency: float = 0.0 e2e_latency: float = 0.0
num_prompt_tokens: int = 0 num_prompt_tokens: int = 0
num_generation_tokens: int = 0 num_generation_tokens: int = 0
...@@ -427,6 +428,7 @@ class IterationStats: ...@@ -427,6 +428,7 @@ class IterationStats:
def update_from_finished_request( def update_from_finished_request(
self, self,
finish_reason: "FinishReason", finish_reason: "FinishReason",
request_id: str,
num_prompt_tokens: int, num_prompt_tokens: int,
max_tokens_param: int | None, max_tokens_param: int | None,
req_stats: RequestStateStats, req_stats: RequestStateStats,
...@@ -458,6 +460,7 @@ class IterationStats: ...@@ -458,6 +460,7 @@ class IterationStats:
finished_req = FinishedRequestStats( finished_req = FinishedRequestStats(
finish_reason=finish_reason, finish_reason=finish_reason,
request_id=request_id,
e2e_latency=e2e_latency, e2e_latency=e2e_latency,
num_prompt_tokens=num_prompt_tokens, num_prompt_tokens=num_prompt_tokens,
num_generation_tokens=req_stats.num_generation_tokens, num_generation_tokens=req_stats.num_generation_tokens,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment