fix(replay): drain traffic metrics only on throughput ticks (#8232)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

fix(replay): drain traffic metrics only on throughput ticks (#8232)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
b92c9593 · Hongkuan Zhou · GitHub · d96a2cf1 · b92c9593 · b92c9593
Unverified Commit b92c9593 authored Apr 15, 2026 by Hongkuan Zhou Committed by GitHub Apr 15, 2026
3 changed files
--- a/components/src/dynamo/planner/offline/replay_adapter.py
+++ b/components/src/dynamo/planner/offline/replay_adapter.py
@@ -369,7 +369,7 @@ class ReplayPlannerAdapter:

        traffic = None
        if tick.need_traffic_metrics:
-            t = result.get("traffic", {})
+            t = self._bridge.drain_traffic()
            duration_s = t.get("duration_s", 0.0)
            if duration_s > 0:
                traffic = TrafficObservation(

--- a/lib/bindings/python/rust/llm/replay.rs
+++ b/lib/bindings/python/rust/llm/replay.rs
@@ -1317,6 +1317,8 @@ impl PlannerReplayBridge {
    /// Advance the simulation to `until_ms` simulated time.
    ///
    /// Returns a dict with separate prefill/decode worker counts and FPM snapshots.
+    /// Traffic metrics are NOT included — call `drain_traffic()` explicitly on
+    /// throughput-scaling ticks only.
    fn advance_to(&mut self, py: Python<'_>, until_ms: f64) -> PyResult<PyObject> {
        let handle = self
            .handle
@@ -1324,19 +1326,12 @@ impl PlannerReplayBridge {
            .ok_or_else(|| PyException::new_err("bridge has been finalized"))?;

        let tick_data = handle.advance_to(until_ms).map_err(to_pyerr)?;
-        let (duration_s, num_req, avg_isl, avg_osl) = tick_data.traffic;

        let result = json!({
            "now_ms": tick_data.now_ms,
            "is_done": tick_data.is_done,
            "prefill_fpm_snapshots": fpm_snapshots_to_json(tick_data.prefill_fpm_snapshots),
            "decode_fpm_snapshots": fpm_snapshots_to_json(tick_data.decode_fpm_snapshots),
-            "traffic": {
-                "duration_s": duration_s,
-                "num_req": num_req,
-                "avg_isl": avg_isl,
-                "avg_osl": avg_osl,
-            },
            "active_prefill_count": tick_data.active_prefill_count,
            "active_decode_count": tick_data.active_decode_count,
            "total_prefill_count": tick_data.total_prefill_count,
@@ -1348,6 +1343,31 @@ impl PlannerReplayBridge {
            .map(|obj| obj.unbind())
    }

+    /// Drain accumulated traffic metrics since the last drain.
+    ///
+    /// Returns a dict with `duration_s`, `num_req`, `avg_isl`, `avg_osl`.
+    /// Call this only on throughput-scaling ticks so the observation window
+    /// covers the full `throughput_adjustment_interval`.
+    fn drain_traffic(&mut self, py: Python<'_>) -> PyResult<PyObject> {
+        let handle = self
+            .handle
+            .as_mut()
+            .ok_or_else(|| PyException::new_err("bridge has been finalized"))?;
+
+        let (duration_s, num_req, avg_isl, avg_osl) = handle.drain_traffic();
+
+        let result = json!({
+            "duration_s": duration_s,
+            "num_req": num_req,
+            "avg_isl": avg_isl,
+            "avg_osl": avg_osl,
+        });
+
+        pythonize(py, &result)
+            .map_err(to_pyerr)
+            .map(|obj| obj.unbind())
+    }
+
    /// Apply a scaling decision with separate prefill and decode targets.
    /// For agg mode, `target_prefill` is ignored (pass 0).
    fn apply_scaling(&mut self, target_prefill: usize, target_decode: usize) -> PyResult<()> {

--- a/lib/mocker/src/replay/planner_handle.rs
+++ b/lib/mocker/src/replay/planner_handle.rs
@@ -27,6 +27,11 @@ use crate::loadgen::Trace;
 ///
 /// For aggregated mode, prefill fields are 0 and all data is in decode fields
 /// (matching how the planner treats agg as a single decode-stage engine).
+///
+/// Traffic metrics are NOT included here — they accumulate across ticks and
+/// must be drained explicitly via [`PlannerReplayHandle::drain_traffic`] on
+/// throughput-scaling ticks only. Draining on every tick would reset the
+/// window at each (more frequent) load-scaling tick and lose earlier traffic.
 pub struct PlannerTickData {
    /// Current simulated time in milliseconds.
    pub now_ms: f64,
@@ -36,8 +41,6 @@ pub struct PlannerTickData {
    pub prefill_fpm_snapshots: Vec<(usize, ForwardPassSnapshot)>,
    /// Decode (or agg) FPM snapshots since last tick: (worker_id, snapshot).
    pub decode_fpm_snapshots: Vec<(usize, ForwardPassSnapshot)>,
-    /// Traffic observation: (duration_s, num_req, avg_isl, avg_osl).
-    pub traffic: (f64, usize, f64, f64),
    /// Active prefill workers (0 for agg mode).
    pub active_prefill_count: usize,
    /// Active decode workers (or total active for agg mode).
@@ -120,18 +123,19 @@ impl PlannerReplayHandle {
    }

    /// Advance the simulation up to `until_ms`, collect metrics, return tick data.
+    ///
+    /// Traffic metrics are NOT drained here — call [`Self::drain_traffic`] explicitly
+    /// on throughput-scaling ticks so the accumulator covers the full interval.
    pub fn advance_to(&mut self, until_ms: f64) -> Result<PlannerTickData> {
        match &mut self.runtime {
            RuntimeKind::Agg(rt) => {
                let is_done = rt.advance_to(until_ms)?;
                let fpm = rt.drain_fpm();
-                let traffic = rt.drain_traffic();
                Ok(PlannerTickData {
                    now_ms: rt.now_ms(),
                    is_done,
                    prefill_fpm_snapshots: Vec::new(),
                    decode_fpm_snapshots: fpm,
-                    traffic,
                    active_prefill_count: 0,
                    active_decode_count: rt.active_worker_count(),
                    total_prefill_count: 0,
@@ -142,13 +146,11 @@ impl PlannerReplayHandle {
                let is_done = rt.advance_to(until_ms)?;
                let prefill_fpm = rt.drain_prefill_fpm();
                let decode_fpm = rt.drain_decode_fpm();
-                let traffic = rt.drain_traffic();
                Ok(PlannerTickData {
                    now_ms: rt.now_ms(),
                    is_done,
                    prefill_fpm_snapshots: prefill_fpm,
                    decode_fpm_snapshots: decode_fpm,
-                    traffic,
                    active_prefill_count: rt.active_prefill_count(),
                    active_decode_count: rt.active_decode_count(),
                    total_prefill_count: rt.total_prefill_count(),
@@ -158,6 +160,18 @@ impl PlannerReplayHandle {
        }
    }

+    /// Drain accumulated traffic metrics since the last drain.
+    ///
+    /// Returns `(duration_s, num_req, avg_isl, avg_osl)`. Call this only on
+    /// throughput-scaling ticks so the window covers the full
+    /// `throughput_adjustment_interval`, not just the gap between load ticks.
+    pub fn drain_traffic(&mut self) -> (f64, usize, f64, f64) {
+        match &mut self.runtime {
+            RuntimeKind::Agg(rt) => rt.drain_traffic(),
+            RuntimeKind::Disagg(rt) => rt.drain_traffic(),
+        }
+    }
+
    /// Apply a scaling decision with separate prefill and decode targets.
    /// For agg mode, `target_prefill` is ignored.
    pub fn apply_scaling(&mut self, target_prefill: usize, target_decode: usize) -> Result<()> {