feat(mocker): add offline trace replay mode [DYN-2502] (#7543)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat(mocker): add offline trace replay mode [DYN-2502] (#7543)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
c3908a36 · Yan Ru Pei · GitHub · 6e56bad6 · c3908a36 · c3908a36
Unverified Commit c3908a36 authored Mar 20, 2026 by Yan Ru Pei Committed by GitHub Mar 20, 2026
20 changed files
--- a/components/src/dynamo/mocker/args.py
+++ b/components/src/dynamo/mocker/args.py
@@ -11,12 +11,6 @@ from pathlib import Path
 from dynamo.common.utils.namespace import get_worker_namespace
 from . import __version__
-from .utils.kv_cache import DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS
-from .utils.planner_profiler_perf_data_converter import (
-    convert_profile_results_to_npz,
-    is_mocker_format_npz,
-    is_profile_results_dir,
-)
 DYN_NAMESPACE = get_worker_namespace()
 DEFAULT_ENDPOINT = f"dyn://{DYN_NAMESPACE}.backend.generate"
@@ -62,6 +56,12 @@ def resolve_planner_profile_data(
    Raises:
        FileNotFoundError: If path doesn't contain valid profile data in any supported format.
    """
+    from .utils.planner_profiler_perf_data_converter import (
+        convert_profile_results_to_npz,
+        is_mocker_format_npz,
+        is_profile_results_dir,
+    )
    if planner_profile_data is None:
        return ProfileDataResult(npz_path=None, tmpdir=None)
@@ -216,7 +216,7 @@ def parse_bootstrap_ports(ports_str: str | None) -> list[int]:
    return [int(p.strip()) for p in ports_str.split(",")]
-def parse_args() -> argparse.Namespace:
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the Dynamo mocker engine.
    Returns:
@@ -248,6 +248,24 @@ def parse_args() -> argparse.Namespace:
        default=None,
        help="Model name for API responses (default: derived from model-path)",
    )
+    parser.add_argument(
+        "--trace-file",
+        type=Path,
+        default=None,
+        help="Run offline trace replay from a Mooncake-style JSONL trace file.",
+    )
+    parser.add_argument(
+        "--output-file",
+        type=Path,
+        default=None,
+        help="Write replay metrics JSON to this path. Defaults to a replay JSON next to the trace file.",
+    )
+    parser.add_argument(
+        "--replay-concurrency",
+        type=int,
+        default=None,
+        help="Run offline replay in closed-loop concurrency mode with this many in-flight requests.",
+    )
    # MockEngineArgs parameters (similar to vLLM style)
    parser.add_argument(
@@ -481,7 +499,7 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument(
        "--kv-transfer-bandwidth",
        type=float,
-        default=DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS,
+        default=_default_kv_transfer_bandwidth_gbps(),
        help="KV cache transfer bandwidth in GB/s for disaggregated serving latency simulation. "
        "Default: 64.0 (inter-node InfiniBand). Set to 0 to disable KV transfer delay. "
        "For intra-node NVLink, typical value is ~450.",
@@ -543,9 +561,12 @@ def parse_args() -> argparse.Namespace:
        help="Determines how events are published [nats|zmq]",
    )
-    args = parser.parse_args()
+    args = parser.parse_args(argv)
    validate_worker_type_args(args)
+    if args.replay_concurrency is not None and args.trace_file is None:
+        raise ValueError("--replay-concurrency requires --trace-file")
    # Validate num_workers
    if args.num_workers < 1:
        raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}")
@@ -587,5 +608,10 @@ def parse_args() -> argparse.Namespace:
        else:
            args.endpoint = DEFAULT_ENDPOINT
            logger.debug(f"Using default endpoint: {args.endpoint}")
    return args
+def _default_kv_transfer_bandwidth_gbps() -> float:
+    from .utils.kv_cache import DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS
+    return DEFAULT_KV_TRANSFER_BANDWIDTH_GBPS
--- a/components/src/dynamo/mocker/main.py
+++ b/components/src/dynamo/mocker/main.py
@@ -30,6 +30,7 @@ from dynamo.llm import (
 from dynamo.runtime.logging import configure_dynamo_logging
 from .args import create_temp_engine_args_file, parse_args, resolve_planner_profile_data
+from .replay import run_trace_replay
 from .utils.kv_cache import compute_kv_bytes_per_token
 configure_dynamo_logging()
@@ -72,6 +73,33 @@ async def worker():
    while still sharing the same event loop and tokio runtime.
    """
    args = parse_args()
+    profile_data_result = None
+    # Offline replay does not need planner profile conversion or runtime setup.
+    if args.trace_file is not None:
+        if args.extra_engine_args:
+            extra_engine_args_path = args.extra_engine_args
+            logger.info(f"Using provided MockEngineArgs from {extra_engine_args_path}")
+        else:
+            extra_engine_args_path = create_temp_engine_args_file(args)
+            logger.info("Created MockEngineArgs from CLI arguments")
+        try:
+            run_trace_replay(
+                trace_file=args.trace_file,
+                output_file=args.output_file,
+                extra_engine_args=extra_engine_args_path,
+                num_workers=args.num_workers,
+                replay_concurrency=args.replay_concurrency,
+            )
+            return
+        finally:
+            if not args.extra_engine_args and extra_engine_args_path.exists():
+                try:
+                    extra_engine_args_path.unlink()
+                    logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
+                except Exception as e:
+                    logger.warning(f"Failed to clean up temporary file: {e}")
    # Resolve planner-profile-data: convert profile results dir to NPZ if needed
    profile_data_result = resolve_planner_profile_data(args.planner_profile_data)
@@ -87,25 +115,25 @@ async def worker():
        extra_engine_args_path = create_temp_engine_args_file(args)
        logger.info("Created MockEngineArgs from CLI arguments")
-    # Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers
-    if args.num_workers > 1 and args.model_path:
-        await prefetch_model(args.model_path)
-    # Auto-compute kv_bytes_per_token from model config if not explicitly set
-    if args.kv_bytes_per_token is None and args.model_path:
-        args.kv_bytes_per_token = compute_kv_bytes_per_token(
-            args.model_path, args.kv_cache_dtype
-        )
-    # Inject kv_bytes_per_token into engine args JSON (computed after model prefetch)
-    if args.kv_bytes_per_token is not None and not args.extra_engine_args:
-        with open(extra_engine_args_path) as f:
-            engine_args = json.load(f)
-        engine_args["kv_bytes_per_token"] = args.kv_bytes_per_token
-        with open(extra_engine_args_path, "w") as f:
-            json.dump(engine_args, f, indent=2)
    try:
+        # Pre-fetch model once to avoid HuggingFace rate limiting when launching many workers
+        if args.num_workers > 1 and args.model_path:
+            await prefetch_model(args.model_path)
+        # Auto-compute kv_bytes_per_token from model config if not explicitly set
+        if args.kv_bytes_per_token is None and args.model_path:
+            args.kv_bytes_per_token = compute_kv_bytes_per_token(
+                args.model_path, args.kv_cache_dtype
+            )
+        # Inject kv_bytes_per_token into engine args JSON (computed after model prefetch)
+        if args.kv_bytes_per_token is not None and not args.extra_engine_args:
+            with open(extra_engine_args_path) as f:
+                engine_args = json.load(f)
+            engine_args["kv_bytes_per_token"] = args.kv_bytes_per_token
+            with open(extra_engine_args_path, "w") as f:
+                json.dump(engine_args, f, indent=2)
        logger.info(
            f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances"
        )
@@ -118,8 +146,8 @@ async def worker():
                logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
            except Exception as e:
                logger.warning(f"Failed to clean up temporary file: {e}")
+        if profile_data_result is not None:
-        del profile_data_result  # Triggers tmpdir cleanup via __del__
+            del profile_data_result  # Triggers tmpdir cleanup via __del__
 def compute_stagger_delay(num_workers: int, stagger_delay: float) -> float:

--- a/components/src/dynamo/mocker/replay.py
+++ b/components/src/dynamo/mocker/replay.py
+#  SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+import json
+from pathlib import Path
+from typing import Any
+from dynamo.llm import run_mocker_trace_replay
+def default_replay_output_path(trace_file: Path) -> Path:
+    return trace_file.with_name(f"{trace_file.stem}.replay.json")
+def format_table(headers: list[str], rows: list[list[str]]) -> str:
+    widths = [len(header) for header in headers]
+    for row in rows:
+        for idx, cell in enumerate(row):
+            widths[idx] = max(widths[idx], len(cell))
+    def format_row(row: list[str]) -> str:
+        return " | ".join(cell.ljust(widths[idx]) for idx, cell in enumerate(row))
+    separator = "-+-".join("-" * width for width in widths)
+    return "\n".join(
+        [format_row(headers), separator, *(format_row(row) for row in rows)]
+    )
+def format_ms(value: float | None) -> str:
+    if value is None:
+        return "-"
+    return f"{value:.3f}"
+def format_number(value: float | None) -> str:
+    if value is None:
+        return "-"
+    return f"{value:.3f}"
+def print_replay_summary(report: dict[str, Any], output_file: Path) -> None:
+    scalar_rows = [
+        ["Request count", str(report["num_requests"])],
+        ["Completed requests", str(report["completed_requests"])],
+        ["Virtual duration (ms)", f"{report['duration_ms']:.3f}"],
+        ["Wall time (ms)", f"{report['wall_time_ms']:.3f}"],
+        ["Input tokens", str(report["total_input_tokens"])],
+        ["Output tokens", str(report["total_output_tokens"])],
+        ["Request throughput (req/s)", f"{report['request_throughput_rps']:.3f}"],
+        ["Input throughput (tok/s)", f"{report['input_throughput_tok_s']:.3f}"],
+        ["Output throughput (tok/s)", f"{report['output_throughput_tok_s']:.3f}"],
+        ["Total throughput (tok/s)", f"{report['total_throughput_tok_s']:.3f}"],
+        ["Prefix cache reused ratio", f"{report['prefix_cache_reused_ratio']:.6f}"],
+    ]
+    latency_rows = [
+        [
+            "TTFT",
+            format_ms(report["mean_ttft_ms"]),
+            format_ms(report["min_ttft_ms"]),
+            format_ms(report["max_ttft_ms"]),
+            format_ms(report["p99_ttft_ms"]),
+            format_ms(report["p90_ttft_ms"]),
+            format_ms(report["median_ttft_ms"]),
+            format_ms(report["p75_ttft_ms"]),
+            format_ms(report["std_ttft_ms"]),
+        ],
+        [
+            "TTST",
+            format_ms(report["mean_ttst_ms"]),
+            format_ms(report["min_ttst_ms"]),
+            format_ms(report["max_ttst_ms"]),
+            format_ms(report["p99_ttst_ms"]),
+            format_ms(report["p90_ttst_ms"]),
+            format_ms(report["median_ttst_ms"]),
+            format_ms(report["p75_ttst_ms"]),
+            format_ms(report["std_ttst_ms"]),
+        ],
+        [
+            "TPOT",
+            format_ms(report["mean_tpot_ms"]),
+            format_ms(report["min_tpot_ms"]),
+            format_ms(report["max_tpot_ms"]),
+            format_ms(report["p99_tpot_ms"]),
+            format_ms(report["p90_tpot_ms"]),
+            format_ms(report["median_tpot_ms"]),
+            format_ms(report["p75_tpot_ms"]),
+            format_ms(report["std_tpot_ms"]),
+        ],
+        [
+            "ITL",
+            format_ms(report["mean_itl_ms"]),
+            format_ms(report["min_itl_ms"]),
+            format_ms(report["max_itl_ms"]),
+            format_ms(report["p99_itl_ms"]),
+            format_ms(report["p90_itl_ms"]),
+            format_ms(report["median_itl_ms"]),
+            format_ms(report["p75_itl_ms"]),
+            format_ms(report["std_itl_ms"]),
+        ],
+        [
+            "E2E latency",
+            format_ms(report["mean_e2e_latency_ms"]),
+            format_ms(report["min_e2e_latency_ms"]),
+            format_ms(report["max_e2e_latency_ms"]),
+            format_ms(report["p99_e2e_latency_ms"]),
+            format_ms(report["p90_e2e_latency_ms"]),
+            format_ms(report["median_e2e_latency_ms"]),
+            format_ms(report["p75_e2e_latency_ms"]),
+            format_ms(report["std_e2e_latency_ms"]),
+        ],
+        [
+            "Output TPS/User",
+            format_number(report["mean_output_token_throughput_per_user"]),
+            format_number(report["min_output_token_throughput_per_user"]),
+            format_number(report["max_output_token_throughput_per_user"]),
+            format_number(report["p99_output_token_throughput_per_user"]),
+            format_number(report["p90_output_token_throughput_per_user"]),
+            format_number(report["median_output_token_throughput_per_user"]),
+            format_number(report["p75_output_token_throughput_per_user"]),
+            format_number(report["std_output_token_throughput_per_user"]),
+        ],
+    ]
+    lines = [
+        "Replay Summary",
+        format_table(["Metric", "Value"], scalar_rows),
+        "",
+        format_table(
+            ["Metric", "avg", "min", "max", "p99", "p90", "p50", "p75", "std"],
+            latency_rows,
+        ),
+        f"JSON report: {output_file}",
+    ]
+    print("\n".join(lines))
+def write_replay_report(report: dict[str, Any], output_file: Path) -> None:
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_file, "w") as f:
+        json.dump(report, f, indent=2, sort_keys=True)
+def run_trace_replay(
+    trace_file: Path,
+    output_file: Path | None,
+    extra_engine_args: Path,
+    num_workers: int,
+    replay_concurrency: int | None,
+) -> None:
+    resolved_output_file = output_file or default_replay_output_path(trace_file)
+    report = run_mocker_trace_replay(
+        trace_file=trace_file,
+        extra_engine_args=extra_engine_args,
+        num_workers=num_workers,
+        replay_concurrency=replay_concurrency,
+    )
+    write_replay_report(report, resolved_output_file)
+    print_replay_summary(report, resolved_output_file)
--- a/docs/benchmarks/mocker-trace-replay.md
+++ b/docs/benchmarks/mocker-trace-replay.md
+---
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: Mocker Offline Trace Replay
+subtitle: Replay Mooncake-style traces offline without launching a runtime or router
+---
+This guide covers the mocker's offline trace replay mode, which replays a Mooncake-style JSONL trace directly through the mock scheduler and writes a metrics report. Unlike normal `dynamo.mocker` usage, this mode does not launch workers, register endpoints, or require NATS, etcd, or a frontend.
+Use this when you want to:
+- benchmark scheduler behavior from a saved trace
+- compare timing and cache behavior across mocker configurations
+- validate replay logic in CI without bringing up a distributed stack
+## Quick Start
+Run offline replay by passing `--trace-file`:
+```bash
+python -m dynamo.mocker \
+    --trace-file /path/to/mooncake_trace.jsonl \
+    --model-path Qwen/Qwen3-0.6B
+```
+This writes a JSON report next to the trace file by default:
+```text
+/path/to/mooncake_trace.replay.json
+```
+The CLI also prints a `Replay Summary` table to stdout with request counts, throughput, and latency statistics.
+## Input Format
+The trace file must be Mooncake-style JSONL. Each line should contain:
+- `timestamp` or `created_time`
+- `input_length` or `input_tokens`
+- `output_length` or `output_tokens`
+- `hash_ids`
+Example:
+```json
+{"timestamp": 0, "input_length": 6755, "output_length": 500, "hash_ids": [0, 1, 2, 3]}
+```
+The mocker synthesizes token blocks from `hash_ids` using the configured `--block-size`, so the replay block size should match the block size used when the trace was generated.
+## Modes
+### Fixed-Schedule Replay
+Default replay mode preserves the timestamps from the trace and simulates arrivals in virtual time:
+```bash
+python -m dynamo.mocker \
+    --trace-file /path/to/mooncake_trace.jsonl \
+    --model-path Qwen/Qwen3-0.6B \
+    --block-size 512
+```
+This is the right mode when you want deterministic replay of the original arrival pattern.
+### Closed-Loop Concurrency Replay
+Use `--replay-concurrency` to ignore trace arrival timing and keep a fixed number of requests in flight:
+```bash
+python -m dynamo.mocker \
+    --trace-file /path/to/mooncake_trace.jsonl \
+    --model-path Qwen/Qwen3-0.6B \
+    --block-size 512 \
+    --replay-concurrency 16
+```
+This mode is useful when you want to compare scheduler behavior under a fixed offered concurrency rather than the original trace schedule.
+## Output
+Use `--output-file` to override the default report location:
+```bash
+python -m dynamo.mocker \
+    --trace-file /path/to/mooncake_trace.jsonl \
+    --model-path Qwen/Qwen3-0.6B \
+    --output-file /tmp/replay-report.json
+```
+If `--output-file` is not set, the report path defaults to `<trace stem>.replay.json` in the same directory as the input trace.
+The report contains:
+- request counts
+- input and output token totals
+- virtual duration and wall-clock runtime
+- request and token throughput
+- prefix cache reuse ratio
+- TTFT, TTST, TPOT, ITL, and end-to-end latency summaries
+- output-token-throughput-per-user summaries
+## Replay Constraints
+Offline replay currently supports only this configuration:
+- `--num-workers 1`
+- aggregated mode
+- `--engine-type vllm`
+- `--data-parallel-size 1`
+If you violate those constraints, replay fails immediately with a validation error.
+## Practical Notes
+- `--replay-concurrency` requires `--trace-file`
+- `--speedup-ratio` still affects simulated timing
+- `--extra-engine-args` can be used to provide a full mocker config JSON instead of individual CLI flags
+- offline replay does not need planner runtime setup, router registration, or event transport
+## When To Use This vs AIPerf
+Use offline replay when:
+- you want a fast scheduler-only simulation
+- you want deterministic CI coverage of replay behavior
+- you do not need HTTP serving, frontend behavior, or network effects
+Use [Dynamo Benchmarking](benchmarking.md) when:
+- you want end-to-end benchmarking against a live endpoint
+- you need frontend, transport, or cluster-level behavior
+- you want AIPerf dashboards and endpoint-facing metrics
--- a/docs/index.yml
+++ b/docs/index.yml
@@ -101,6 +101,8 @@ navigation:
        path: components/kvbm/kvbm-guide.md
      - page: Dynamo Benchmarking
        path: benchmarks/benchmarking.md
+      - page: Mocker Offline Trace Replay
+        path: benchmarks/mocker-trace-replay.md
      - section: Testing
        contents:
          - page: Mocker

--- a/docs/mocker/mocker.md
+++ b/docs/mocker/mocker.md
@@ -73,6 +73,9 @@ python -m dynamo.mocker \
 | `--model-path` | Required | HuggingFace model ID or local path for tokenizer |
 | `--endpoint` | Auto-derived | Dynamo endpoint string. Defaults are namespace-dependent, and prefill workers use a different default endpoint than aggregated/decode workers |
 | `--model-name` | Derived from model-path | Model name for API responses |
+| `--trace-file` | None | Run offline trace replay from a Mooncake-style JSONL trace file |
+| `--output-file` | `<trace stem>.replay.json` | Write replay metrics JSON to this path |
+| `--replay-concurrency` | None | Run offline replay in closed-loop concurrency mode with this many in-flight requests |
 | `--num-gpu-blocks-override` | 16384 | Number of KV cache blocks |
 | `--block-size` | 64 | Tokens per KV cache block |
 | `--max-num-seqs` | 256 | Maximum concurrent sequences |
@@ -119,6 +122,20 @@ python -m dynamo.mocker \
 > **Note:** For local scale tests and router benchmarks, prefer `--num-workers` over launching many separate mocker processes. All workers share one tokio runtime and thread pool, which is both lighter weight and closer to how the test harnesses exercise the mocker.
+## Offline Trace Replay
+The mocker also supports an offline replay mode for Mooncake-style traces:
+```bash
+python -m dynamo.mocker \
+    --trace-file /path/to/mooncake_trace.jsonl \
+    --model-path Qwen/Qwen3-0.6B
+```
+This mode writes a replay report JSON and prints a `Replay Summary` table without launching a runtime or router.
+For full usage, constraints, and benchmarking guidance, see [Mocker Offline Trace Replay](../benchmarks/mocker-trace-replay.md).
 ## Performance Modeling Setup
 By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, pass `--planner-profile-data` with either:

--- a/lib/bench/kv_router/active_sequences_bench.rs
+++ b/lib/bench/kv_router/active_sequences_bench.rs
@@ -176,6 +176,7 @@ async fn generate_sequence_events(
                    max_output_tokens: worker_trace[i].output_length as usize,
                    uuid: Some(worker_trace[i].uuid),
                    dp_rank: 0,
+                    arrival_timestamp_ms: None,
                });
                i += 1;
@@ -187,6 +188,7 @@ async fn generate_sequence_events(
                        max_output_tokens: worker_trace[i].output_length as usize,
                        uuid: Some(worker_trace[i].uuid),
                        dp_rank: 0,
+                        arrival_timestamp_ms: None,
                    });
                    i += 1;
                }

--- a/lib/bench/kv_router/common/mod.rs
+++ b/lib/bench/kv_router/common/mod.rs
@@ -373,6 +373,7 @@ pub async fn generate_kv_events(
                    max_output_tokens: worker_trace[i].output_length as usize,
                    uuid: Some(worker_trace[i].uuid),
                    dp_rank: 0,
+                    arrival_timestamp_ms: None,
                });
                i += 1;
@@ -384,6 +385,7 @@ pub async fn generate_kv_events(
                        max_output_tokens: worker_trace[i].output_length as usize,
                        uuid: Some(worker_trace[i].uuid),
                        dp_rank: 0,
+                        arrival_timestamp_ms: None,
                    });
                    i += 1;
                }

--- a/lib/bindings/python/rust/lib.rs
+++ b/lib/bindings/python/rust/lib.rs
@@ -149,6 +149,10 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(fetch_model, m)?)?;
    m.add_function(wrap_pyfunction!(run_kv_indexer, m)?)?;
    m.add_function(wrap_pyfunction!(llm::entrypoint::make_engine, m)?)?;
+    m.add_function(wrap_pyfunction!(
+        llm::entrypoint::run_mocker_trace_replay,
+        m
+    )?)?;
    m.add_function(wrap_pyfunction!(llm::entrypoint::run_input, m)?)?;
    m.add_class::<DistributedRuntime>()?;

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -9,6 +9,7 @@ use std::sync::Arc;
 use pyo3::{exceptions::PyException, prelude::*};
 use pyo3_async_runtimes::TaskLocals;
+use pythonize::pythonize;
 use dynamo_kv_router::config::KvRouterConfig as RsKvRouterConfig;
 use dynamo_llm::discovery::LoadThresholdConfig as RsLoadThresholdConfig;
@@ -467,6 +468,50 @@ pub fn run_input<'p>(
    })
 }
+/// Replay a mocker trace file offline and return the simulation report as a
+/// Python object.
+///
+/// * `trace_file` - path of the trace to replay.
+/// * `extra_engine_args` - optional JSON file to load `MockEngineArgs` from;
+///   `MockEngineArgs::default()` is used when absent.
+/// * `num_workers` - forwarded to the simulation entry points.
+/// * `replay_concurrency` - when set, runs `simulate_concurrency_file` with
+///   that many in-flight requests; otherwise runs `simulate_trace_file`.
+///   Must be at least 1 when provided.
+///
+/// The simulation runs inside `py.allow_threads`, so the GIL is released
+/// while it executes. Any failure is converted to a Python exception via
+/// `to_pyerr`.
+#[pyfunction]
+#[pyo3(signature = (trace_file, extra_engine_args=None, num_workers=1, replay_concurrency=None))]
+pub fn run_mocker_trace_replay(
+    py: Python<'_>,
+    trace_file: PathBuf,
+    extra_engine_args: Option<PathBuf>,
+    num_workers: usize,
+    replay_concurrency: Option<isize>,
+) -> PyResult<PyObject> {
+    let report = py.allow_threads(move || {
+        let args = if let Some(extra_args_path) = extra_engine_args {
+            MockEngineArgs::from_json_file(&extra_args_path).map_err(|e| {
+                anyhow::anyhow!(
+                    "Failed to load mocker args from {:?}: {}",
+                    extra_args_path,
+                    e
+                )
+            })?
+        } else {
+            MockEngineArgs::default()
+        };
+        // `usize::try_from` alone only rejects negative values; zero must be
+        // rejected too so the check matches the "at least 1" error message.
+        let replay_concurrency = replay_concurrency
+            .map(|requested| {
+                usize::try_from(requested)
+                    .ok()
+                    .filter(|in_flight| *in_flight >= 1)
+                    .ok_or_else(|| anyhow::anyhow!("replay_concurrency must be at least 1"))
+            })
+            .transpose()?;
+        if let Some(max_in_flight) = replay_concurrency {
+            dynamo_mocker::simulation::simulate_concurrency_file(
+                args,
+                &trace_file,
+                max_in_flight,
+                num_workers,
+            )
+        } else {
+            dynamo_mocker::simulation::simulate_trace_file(args, &trace_file, num_workers)
+        }
+    });
+    let report = report.map_err(to_pyerr)?;
+    pythonize(py, &report)
+        .map_err(to_pyerr)
+        .map(|obj| obj.unbind())
+}
 pub fn to_pyerr<E>(err: E) -> PyErr
 where
    E: Display,

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
+import os
 from typing import Any, AsyncIterator, Awaitable, Callable, Dict, List, Optional, Tuple
 # Import from specialized modules
@@ -1246,6 +1247,15 @@ async def run_input(runtime: DistributedRuntime, input: str, engine_config: Engi
    """Start an engine, connect it to an input, and run until stopped."""
    ...
+def run_mocker_trace_replay(
+    trace_file: str | os.PathLike[str],
+    extra_engine_args: Optional[str | os.PathLike[str]] = None,
+    num_workers: int = 1,
+    replay_concurrency: Optional[int] = None,
+) -> Dict[str, Any]:
+    """Replay a mocker trace file and return the simulation report."""
+    ...
 class Layer:
    """
    A KV cache block layer

--- a/lib/bindings/python/src/dynamo/llm/__init__.py
+++ b/lib/bindings/python/src/dynamo/llm/__init__.py
@@ -35,6 +35,7 @@ from dynamo._core import make_engine
 from dynamo._core import register_model as register_model
 from dynamo._core import run_input
 from dynamo._core import run_kv_indexer as run_kv_indexer
+from dynamo._core import run_mocker_trace_replay
 from dynamo._core import unregister_model as unregister_model
 from .exceptions import HttpError

--- a/lib/kv-router/src/standalone_indexer/listener.rs
+++ b/lib/kv-router/src/standalone_indexer/listener.rs
@@ -11,7 +11,7 @@ use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use zeromq::{DealerSocket, Socket, SocketRecv, SocketSend, SubSocket};
-use crate::protocols::{RouterEvent, WorkerId, WorkerWithDpRank};
+use crate::protocols::{WorkerId, WorkerWithDpRank};
 use crate::zmq_wire::{KvEventBatch, convert_event};
 use super::indexer::Indexer;

--- a/lib/kv-router/src/standalone_indexer/registry.rs
+++ b/lib/kv-router/src/standalone_indexer/registry.rs
@@ -359,11 +359,16 @@ impl WorkerRegistry {
    #[cfg(feature = "metrics")]
    pub fn refresh_metrics(&self) {
        let models = self.indexers.len();
-        let mut workers = self.workers.len();
+        let workers = self.workers.len() + {
-        #[cfg(feature = "indexer-runtime")]
+            #[cfg(feature = "indexer-runtime")]
-        {
+            {
-            workers += self.discovered_workers.len();
+                self.discovered_workers.len()
-        }
+            }
+            #[cfg(not(feature = "indexer-runtime"))]
+            {
+                0
+            }
+        };
        let mut listener_counts = [0_i64; 4];
        for entry in self.workers.iter() {

--- a/lib/llm/src/mocker.rs
+++ b/lib/llm/src/mocker.rs
@@ -637,6 +637,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<LLMEngineOutput>, Error>
            max_output_tokens,
            uuid: Some(request_uuid),
            dp_rank,
+            arrival_timestamp_ms: request.request_timestamp_ms,
        };
        let (request_tx, mut request_rx) = mpsc::unbounded_channel::<OutputSignal>();

--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -297,6 +297,7 @@ impl OpenAIPreprocessor {
        if let Some(nvext) = request.nvext() {
            // Build routing hints from nvext fields
            let hints = nvext.agent_hints.as_ref();
+            builder.request_timestamp_ms(nvext.request_timestamp_ms);
            let routing = RoutingHints {
                backend_instance_id: nvext.backend_instance_id,
                prefill_worker_id: nvext.prefill_worker_id,

--- a/lib/llm/src/protocols/common/preprocessor.rs
+++ b/lib/llm/src/protocols/common/preprocessor.rs
@@ -190,6 +190,11 @@ pub struct PreprocessedRequest {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub extra_args: Option<serde_json::Value>,
+    /// Optional request timestamp in milliseconds forwarded from nvext.
+    #[builder(default)]
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub request_timestamp_ms: Option<f64>,
    /// Optional request tracker for per-request metrics (shared with DeltaGenerator)
    #[builder(default)]
    #[serde(skip)]

--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -174,6 +174,11 @@ pub struct NvExt {
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
+    /// Optional request timestamp in milliseconds for trace replay / virtual-time simulation.
+    #[builder(default, setter(strip_option))]
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub request_timestamp_ms: Option<f64>,
 }
 /// Hints from the agent/caller about request characteristics.

--- a/lib/mocker/src/common/perf_model.rs
+++ b/lib/mocker/src/common/perf_model.rs
@@ -227,8 +227,8 @@ impl PerfModel {
                decode_interp.interp(query_x, query_y).unwrap_or(0.0)
            }
        };
-        // Ensure non-negative timing
+        // Token-emitting decode steps should not collapse onto the same timestamp.
-        let result = time.max(0.0);
+        let result = time.max(1.0);
        tracing::trace!(
            "Decode time prediction: active_kv_tokens={active_kv_tokens}, context_length={context_length}, time={result:.2}ms"
        );

--- a/lib/mocker/src/common/protocols.rs
+++ b/lib/mocker/src/common/protocols.rs
@@ -53,6 +53,7 @@ pub struct DirectRequest {
    pub max_output_tokens: usize,
    pub uuid: Option<Uuid>,
    pub dp_rank: u32,
+    pub arrival_timestamp_ms: Option<f64>,
 }
 /// Represents the cost of prefilling content in the cache