feat(planner/replay): KV reuse awareness in load + throughput scaling (#8314)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

feat(planner/replay): KV reuse awareness in load + throughput scaling (#8314)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
c388483a · Hongkuan Zhou · GitHub · db124db0 · c388483a · c388483a
Unverified Commit c388483a authored Apr 20, 2026 by Hongkuan Zhou Committed by GitHub Apr 20, 2026
Showing with 10 additions and 2 deletions

lib/mocker/src/replay/planner_handle.rs lib/mocker/src/replay/planner_handle.rs +7 -2

lib/runtime/src/metrics/prometheus_names.rs lib/runtime/src/metrics/prometheus_names.rs +3 -0

No files found.
--- a/lib/mocker/src/replay/planner_handle.rs
+++ b/lib/mocker/src/replay/planner_handle.rs
@@ -162,8 +162,13 @@ impl PlannerReplayHandle {

    /// Drain accumulated traffic metrics since the last drain.
    ///
-    /// Call this only on throughput-scaling ticks so the window covers the full
-    /// `throughput_adjustment_interval`, not just the gap between load ticks.
+    /// Call this only on throughput-scaling ticks so the window covers the
+    /// full `throughput_adjustment_interval`, not just the gap between load
+    /// ticks. The returned [`TrafficStats::avg_kv_hit_rate`] is the
+    /// arithmetic mean of per-request `overlap / isl` ratios across
+    /// admissions in the window — matching the real router's per-request
+    /// Prometheus histogram, where each request contributes one sample
+    /// regardless of ISL size.
    pub fn drain_traffic(&mut self) -> TrafficStats {
        match &mut self.runtime {
            RuntimeKind::Agg(rt) => rt.drain_traffic(),

--- a/lib/runtime/src/metrics/prometheus_names.rs
+++ b/lib/runtime/src/metrics/prometheus_names.rs
@@ -540,6 +540,9 @@ pub mod router {

    /// Output sequence length in tokens observed at the router
    pub const OUTPUT_SEQUENCE_TOKENS: &str = "router_output_sequence_tokens";
+
+    /// Predicted KV cache hit rate at routing time (0.0-1.0)
+    pub const KV_HIT_RATE: &str = "router_kv_hit_rate";
 }

 /// Frontend pipeline stage and event-loop metrics