feat: expose predicted KV hit rate as Prometheus histogram (#6507)

Signed-off-by: Pea Brane <peabrane@peabrane.com>
Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

feat: expose predicted KV hit rate as Prometheus histogram (#6507)
Signed-off-by: Pea Brane <peabrane@peabrane.com>
Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
ddeee88b · Yan Ru Pei · GitHub · 32622b1c · ddeee88b · ddeee88b
Unverified Commit ddeee88b authored Feb 23, 2026 by Yan Ru Pei Committed by GitHub Feb 24, 2026
3 changed files
--- a/docs/pages/observability/metrics.md
+++ b/docs/pages/observability/metrics.md
@@ -217,6 +217,38 @@ Suppose the backend allows 3 concurrent requests and there are 10 clients contin
 - **HTTP Queue**: Measures queuing time before processing begins (including prefill time)
 - **HTTP Queue ≤ Inflight** (HTTP queue is a subset of inflight time)

+### Router Metrics
+
+When using the KV cache router (`--router-mode kv`), the frontend exposes additional metrics for monitoring routing decisions and overhead. These metrics are defined in `lib/llm/src/kv_router/metrics.rs`.
+
+#### Per-Request Routing Overhead (`dynamo_routing_overhead_*`)
+
+Histograms (in milliseconds) tracking the time spent in each phase of the routing decision for every request. Exposed on the frontend port (default 8000) at `/metrics`.
+
+- `dynamo_routing_overhead_block_hashing_ms`: Time computing block hashes from input tokens
+- `dynamo_routing_overhead_indexer_find_matches_ms`: Time in the KV indexer finding prefix matches
+- `dynamo_routing_overhead_seq_hashing_ms`: Time computing sequence hashes for active block tracking
+- `dynamo_routing_overhead_scheduling_ms`: Time in the scheduler selecting a worker (includes channel round-trip and load-aware selection)
+- `dynamo_routing_overhead_total_ms`: Total routing overhead per request (sum of all phases above)
+
+#### Per-Worker Load (`dynamo_frontend_worker_active_*`)
+
+Gauges tracking the active load on each worker, labeled by `worker_id`, `dp_rank`, and `worker_type`. Exposed on the frontend port at `/metrics`.
+
+- `dynamo_frontend_worker_active_decode_blocks`: Active KV cache decode blocks per worker
+- `dynamo_frontend_worker_active_prefill_tokens`: Active prefill tokens queued per worker
+
+#### Router Request Metrics (`dynamo_component_router_*`)
+
+Component-scoped histograms and counters for aggregate request-level statistics. These use the `dynamo_component_*` prefix with standard component labels (`dynamo_namespace`, `dynamo_component`, `dynamo_endpoint`). The `dynamo_component` label is set to the frontend component name. Exposed on the frontend port at `/metrics`.
+
+- `dynamo_component_router_requests_total`: Total requests processed by the router (counter)
+- `dynamo_component_router_time_to_first_token_seconds`: Time to first token as observed at the router (histogram)
+- `dynamo_component_router_inter_token_latency_seconds`: Average inter-token latency at the router (histogram)
+- `dynamo_component_router_input_sequence_tokens`: Input sequence length in tokens (histogram)
+- `dynamo_component_router_output_sequence_tokens`: Output sequence length in tokens (histogram)
+- `dynamo_component_router_kv_hit_rate`: Predicted KV cache hit rate at routing time, as a fraction from 0.0 to 1.0 (histogram)
+
 ## Related Documentation

 - [Distributed Runtime Architecture](../design-docs/distributed-runtime.md)

--- a/lib/llm/src/kv_router/metrics.rs
+++ b/lib/llm/src/kv_router/metrics.rs
@@ -6,6 +6,9 @@
 //! This module centralizes all router-side Prometheus metric definitions:
 //! - [`WorkerLoadMetrics`]: Per-worker active decode blocks and prefill tokens gauges.
 //! - [`RoutingOverheadMetrics`]: Per-request routing phase latency histograms.
+//! - [`RouterRequestMetrics`]: Per-request aggregate metrics: a request counter plus histograms (TTFT, ITL, input/output tokens, KV hit rate).
+//!
+//! See also: `docs/pages/observability/metrics.md` (Router Metrics section).

 use std::sync::{Arc, LazyLock, OnceLock};
 use std::time::Duration;
@@ -191,6 +194,7 @@ pub struct RouterRequestMetrics {
    pub inter_token_latency_seconds: prometheus::Histogram,
    pub input_sequence_tokens: prometheus::Histogram,
    pub output_sequence_tokens: prometheus::Histogram,
+    pub kv_hit_rate: prometheus::Histogram,
 }

 static ROUTER_REQUEST_METRICS: OnceLock<Arc<RouterRequestMetrics>> = OnceLock::new();
@@ -202,6 +206,7 @@ impl RouterRequestMetrics {
        inter_token_latency_seconds: prometheus::Histogram,
        input_sequence_tokens: prometheus::Histogram,
        output_sequence_tokens: prometheus::Histogram,
+        kv_hit_rate: prometheus::Histogram,
    ) -> Self {
        Self {
            requests_total,
@@ -209,9 +214,13 @@ impl RouterRequestMetrics {
            inter_token_latency_seconds,
            input_sequence_tokens,
            output_sequence_tokens,
+            kv_hit_rate,
        }
    }

+    // TODO: move all `router_*` metric name strings to `prometheus_names.rs` constants
+    // for consistency with the other metric families (routing_overhead, frontend_service).
+
    /// Create from a Component, memoized in a static OnceLock.
    pub fn from_component(component: &Component) -> Arc<Self> {
        ROUTER_REQUEST_METRICS
@@ -256,12 +265,21 @@ impl RouterRequestMetrics {
                        Some(generate_log_buckets(50.0, 32000.0, 10)),
                    )
                    .expect("failed to create router_output_sequence_tokens");
+                let kv_hit_rate = metrics
+                    .create_histogram(
+                        "router_kv_hit_rate",
+                        "Predicted KV cache hit rate at routing time (0.0-1.0)",
+                        &[],
+                        Some(prometheus::linear_buckets(0.0, 0.05, 21).unwrap()),
+                    )
+                    .expect("failed to create router_kv_hit_rate");
                Arc::new(Self::new(
                    requests_total,
                    time_to_first_token_seconds,
                    inter_token_latency_seconds,
                    input_sequence_tokens,
                    output_sequence_tokens,
+                    kv_hit_rate,
                ))
            })
            .clone()

--- a/lib/llm/src/kv_router/push_router.rs
+++ b/lib/llm/src/kv_router/push_router.rs
@@ -380,6 +380,9 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
                overlap_amount as usize * block_size,
            );
            tracker.record_worker_full(instance_id, dp_rank, self.chooser.worker_type());
+            if let Some(hit_rate) = tracker.kv_hit_rate() {
+                request_metrics.kv_hit_rate.observe(hit_rate);
+            }
        }
        request_metrics
            .input_sequence_tokens