Unverified commit c388483a, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat(planner/replay): KV reuse awareness in load + throughput scaling (#8314)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent db124db0
......@@ -162,8 +162,13 @@ impl PlannerReplayHandle {
/// Drain accumulated traffic metrics since the last drain.
///
/// Call this only on throughput-scaling ticks so the window covers the
/// full `throughput_adjustment_interval`, not just the gap between load
/// ticks. The returned [`TrafficStats::avg_kv_hit_rate`] is the
/// arithmetic mean of per-request `overlap / isl` ratios across
/// admissions in the window — matching the real router's per-request
/// Prometheus histogram, where each request contributes one sample
/// regardless of ISL size.
pub fn drain_traffic(&mut self) -> TrafficStats {
match &mut self.runtime {
RuntimeKind::Agg(rt) => rt.drain_traffic(),
......
......@@ -540,6 +540,9 @@ pub mod router {
/// Name string for the output sequence length, in tokens, observed at the router.
pub const OUTPUT_SEQUENCE_TOKENS: &str = "router_output_sequence_tokens";
/// Name string for the predicted KV cache hit rate at routing time,
/// a fraction in the range 0.0-1.0.
pub const KV_HIT_RATE: &str = "router_kv_hit_rate";
}
/// Frontend pipeline stage and event-loop metrics
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment