feat(mocker): add offline disagg replay (#7617)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat(mocker): add offline disagg replay (#7617)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
02b1c58a · Yan Ru Pei · GitHub · 4b8826b3 · 02b1c58a · 02b1c58a
Unverified Commit 02b1c58a authored Mar 25, 2026 by Yan Ru Pei Committed by GitHub Mar 25, 2026
20 changed files
--- a/components/src/dynamo/common/configuration/groups/kv_router_args.py
+++ b/components/src/dynamo/common/configuration/groups/kv_router_args.py
@@ -26,6 +26,7 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
    "router_track_active_blocks",
    "router_track_output_blocks",
    "router_assume_kv_reuse",
+    "router_track_prefill_tokens",
    "router_snapshot_threshold",
    "router_reset_states",
    "router_ttl_secs",
@@ -51,6 +52,7 @@ class KvRouterConfigBase(ConfigBase):
    router_track_active_blocks: bool
    router_track_output_blocks: bool
    router_assume_kv_reuse: bool
+    router_track_prefill_tokens: bool
    router_snapshot_threshold: int
    router_reset_states: bool
    router_ttl_secs: float
@@ -173,6 +175,18 @@ class KvRouterArgGroup(ArgGroup):
            ),
            obsolete_flag="--assume-kv-reuse",
        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--router-track-prefill-tokens",
+            env_var="DYN_ROUTER_TRACK_PREFILL_TOKENS",
+            default=True,
+            dest="router_track_prefill_tokens",
+            help=(
+                "KV Router: Include prompt-side prefill tokens in active load accounting. "
+                "Use --no-router-track-prefill-tokens to ignore prompt tokens in router "
+                "prefill-token load, queue pressure, and active_prefill_tokens metrics."
+            ),
+        )
        add_argument(
            g,
            flag_name="--router-snapshot-threshold",

--- a/components/src/dynamo/mocker/aic_session.py
+++ b/components/src/dynamo/mocker/aic_session.py
@@ -14,7 +14,7 @@ from aiconfigurator.sdk import config
 from aiconfigurator.sdk.backends.factory import get_backend
 from aiconfigurator.sdk.inference_session import InferenceSession
 from aiconfigurator.sdk.models import get_model
-from aiconfigurator.sdk.perf_database import get_database
+from aiconfigurator.sdk.perf_database import get_database, get_supported_databases
 logger = logging.getLogger(__name__)
@@ -40,6 +40,14 @@ class AicSession:
        )
        database = get_database(system=system, backend=backend_name, version=version)
+        if database is None:
+            supported = get_supported_databases().get(system, {}).get(backend_name, [])
+            supported_versions = ", ".join(supported) if supported else "<none>"
+            raise RuntimeError(
+                "AIC perf database not found for "
+                f"system={system!r}, backend={backend_name!r}, version={version!r}. "
+                f"Supported versions for this system/backend: {supported_versions}"
+            )
        model_config = config.ModelConfig(tp_size=tp_size)
        model = get_model(
            model_path=model_path,

--- a/components/src/dynamo/mocker/config.py
+++ b/components/src/dynamo/mocker/config.py
@@ -42,6 +42,13 @@ def _build_sglang_args(args: argparse.Namespace) -> SglangArgs | None:
 def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
+    worker_type = (
+        "prefill"
+        if getattr(args, "is_prefill_worker", False)
+        else "decode"
+        if getattr(args, "is_decode_worker", False)
+        else "aggregated"
+    )
    aic_backend = None
    aic_system = None
    aic_backend_version = None
@@ -53,7 +60,6 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
        aic_backend_version = getattr(args, "aic_backend_version", None)
        aic_tp_size = getattr(args, "aic_tp_size", None)
        aic_model_path = getattr(args, "model_path", None)
    return MockEngineArgs(
        engine_type=getattr(args, "engine_type", None) or "vllm",
        num_gpu_blocks=getattr(args, "num_gpu_blocks", _DEFAULT_NUM_GPU_BLOCKS),
@@ -64,18 +70,12 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
        ),
        enable_prefix_caching=getattr(args, "enable_prefix_caching", True),
        enable_chunked_prefill=getattr(args, "enable_chunked_prefill", True),
-        preemption_mode=getattr(args, "preemption_mode", "lifo"),
        speedup_ratio=getattr(args, "speedup_ratio", 1.0),
        decode_speedup_ratio=getattr(args, "decode_speedup_ratio", 1.0),
        dp_size=getattr(args, "dp_size", 1),
        startup_time=getattr(args, "startup_time", None),
-        worker_type=(
+        worker_type=worker_type,
-            "prefill"
+        planner_profile_data=getattr(args, "planner_profile_data", None),
-            if getattr(args, "is_prefill_worker", False)
-            else "decode"
-            if getattr(args, "is_decode_worker", False)
-            else "aggregated"
-        ),
        aic_backend=aic_backend,
        aic_system=aic_system,
        aic_backend_version=aic_backend_version,
@@ -85,6 +85,7 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
        kv_transfer_bandwidth=getattr(args, "kv_transfer_bandwidth", None),
        reasoning=_parse_reasoning_config(getattr(args, "reasoning", None)),
        sglang=_build_sglang_args(args),
+        preemption_mode=getattr(args, "preemption_mode", "lifo"),
    )

--- a/components/src/dynamo/router/README.md
+++ b/components/src/dynamo/router/README.md
@@ -29,7 +29,7 @@ python -m dynamo.router \
 - `--endpoint`: Full endpoint path for workers in the format `namespace.component.endpoint` (e.g., `dynamo.prefill.generate`)
 **Router Configuration:**
-All router options use the `--router-*` prefix (e.g., `--router-block-size`, `--router-kv-overlap-score-weight`, `--router-temperature`, `--router-kv-events` / `--no-router-kv-events`, `--router-replica-sync`, `--router-snapshot-threshold`, `--router-reset-states`, `--router-track-active-blocks` / `--no-router-track-active-blocks`). Legacy names without the prefix (e.g., `--block-size`, `--kv-events`) are still accepted but deprecated. For detailed descriptions, see the [Router Guide](/docs/components/router/router-guide.md).
+All router options use the `--router-*` prefix (e.g., `--router-block-size`, `--router-kv-overlap-score-weight`, `--router-temperature`, `--router-kv-events` / `--no-router-kv-events`, `--router-replica-sync`, `--router-snapshot-threshold`, `--router-reset-states`, `--router-track-active-blocks` / `--no-router-track-active-blocks`, `--router-track-prefill-tokens` / `--no-router-track-prefill-tokens`). Legacy names without the prefix (e.g., `--block-size`, `--kv-events`) are still accepted but deprecated. For detailed descriptions, see the [Router Guide](/docs/components/router/router-guide.md).
 ## Architecture
@@ -74,6 +74,9 @@ python -m dynamo.vllm --model MODEL_NAME --block-size 64 --disaggregation-mode p
 > **Why `--no-router-track-active-blocks` for prefill routing?**
 > Active block tracking is used for load balancing across decode (generation) phases. For prefill-only routing, decode load is not relevant, so disabling this reduces overhead and simplifies the router state.
 >
+> **When should I use `--no-router-track-prefill-tokens`?**
+> Use it on decode-only routers that should ignore already-completed prompt work. This keeps `active_prefill_tokens`, queue pressure, and load estimates focused on decode-side work after a prefill-to-decode handoff.
+>
 > **Why `--router-block-size` is required for standalone routers:**
 > Unlike the frontend router which can infer block size from the ModelDeploymentCard (MDC) during worker registration, standalone routers cannot access the MDC and must have the block size explicitly specified. This is a work in progress to enable automatic inference.

--- a/components/src/dynamo/router/__main__.py
+++ b/components/src/dynamo/router/__main__.py
@@ -171,6 +171,7 @@ async def worker(runtime: DistributedRuntime):
        f"router_track_active_blocks={config.router_track_active_blocks}, "
        f"router_track_output_blocks={config.router_track_output_blocks}, "
        f"router_assume_kv_reuse={config.router_assume_kv_reuse}, "
+        f"router_track_prefill_tokens={config.router_track_prefill_tokens}, "
        f"router_ttl_secs={config.router_ttl_secs}, "
        f"router_max_tree_size={config.router_max_tree_size}, "
        f"router_prune_target_ratio={config.router_prune_target_ratio}"

--- a/docs/benchmarks/mocker-trace-replay.md
+++ b/docs/benchmarks/mocker-trace-replay.md
@@ -117,6 +117,8 @@ The dedicated replay CLI exposes:
 - `--replay-mode offline|online`
 - `--router-mode round_robin|kv_router`
 - `--num-workers`
+- `--num-prefill-workers`
+- `--num-decode-workers`
 - `--replay-concurrency`
 - `--arrival-interval-ms`
 - `--arrival-speedup-ratio`
@@ -125,6 +127,8 @@ The dedicated replay CLI exposes:
 - `--num-prefix-groups`
 - `--inter-turn-delay-ms`
 - `--extra-engine-args` (JSON string)
+- `--prefill-engine-args` (JSON string)
+- `--decode-engine-args` (JSON string)
 - `--router-config` (JSON string)
 - `--report-json`
@@ -164,6 +168,19 @@ as `block_size`, `engine_type`, `dp_size`, `speedup_ratio`, and `decode_speedup_
 `--extra-engine-args`, not as top-level replay CLI flags. Unspecified fields fall back to the same
 defaults used by `MockEngineArgs::default()` and `KvRouterConfig::default()`.
+Offline disagg replay uses staged engine args instead of `--extra-engine-args`:
+- `--prefill-engine-args` for the prefill worker config
+- `--decode-engine-args` for the decode worker config
+- `--num-prefill-workers` and `--num-decode-workers` for pool sizes
+For offline disagg replay, the staged JSON must set `worker_type` explicitly:
+- `--prefill-engine-args` must use `worker_type: "prefill"`
+- `--decode-engine-args` must use `worker_type: "decode"`
+The staged configs must also use the same `block_size`.
 ### Synthetic Replay
 Synthetic replay bypasses trace loading and generates in-memory requests with fixed input/output
@@ -320,9 +337,12 @@ If `--report-json` is not provided, `python -m dynamo.replay` writes a timestamp
 Shared replay constraints:
- aggregated mode
 - `extra_engine_args.engine_type` must be `vllm` or `sglang`
- `extra_engine_args.dp_size` must be `1`
+- aggregated replay uses the existing aggregated args path (`extra_engine_args`)
+- disagg replay requires both `prefill_engine_args` and `decode_engine_args`
+- disagg replay requires `router_mode=kv_router`
+- replay `dp_size` must be `1`
+- disagg replay requires matching `block_size` in `prefill_engine_args` and `decode_engine_args`
 Additional offline constraints:
@@ -330,6 +350,7 @@ Additional offline constraints:
 - single-worker offline replay is still a dedicated fast path for `vllm`, but it now supports both
  flat request replay and workload-driven multi-turn replay
 - `sglang` still goes through the shared multi-worker replay runtime even when `num_workers=1`
+- offline disagg replay is a separate two-stage runtime with prefill and decode worker pools
 Additional online constraints:
@@ -343,12 +364,13 @@ If you violate those constraints, replay fails immediately with a validation err
  either a trace file, or all of `--input-tokens`, `--output-tokens`, and `--request-count`
 - `--replay-concurrency` works with both trace replay and synthetic replay
 - mocker compute-speed knobs such as `speedup_ratio` still affect simulated timing when passed via
-  `--extra-engine-args`
+  the engine-args JSON for the chosen replay mode
 - `--arrival-speedup-ratio` affects trace timestamps, not worker compute speed
 - `--arrival-interval-ms` only applies to synthetic replay
 - `--turns-per-session`, `--shared-prefix-ratio`, `--num-prefix-groups`, and
  `--inter-turn-delay-ms` only apply to synthetic replay
- `--extra-engine-args` and `--router-config` are JSON strings on the standalone replay CLI
+- `--extra-engine-args`, `--prefill-engine-args`, `--decode-engine-args`, and `--router-config`
+  are JSON strings on the standalone replay CLI
 - offline replay does not need planner runtime setup, router registration, or external event transport
 - the replay block size should match the trace block size, because token synthesis expands `hash_ids`
  using the configured block size

--- a/docs/components/router/README.md
+++ b/docs/components/router/README.md
@@ -23,6 +23,7 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa
 | `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) |
 | `--router-queue-threshold` | `4.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` |
 | `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
+| `--no-router-track-prefill-tokens` | enabled | Ignore prompt-side prefill tokens in router load accounting (tracking is on by default); useful for decode-only routing paths |
 ### Standalone Router

--- a/docs/components/router/router-guide.md
+++ b/docs/components/router/router-guide.md
@@ -256,6 +256,8 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
 - `--no-router-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist.
+- `--no-router-track-prefill-tokens`: Disables prompt-side prefill token accounting in the router's active load model. By default (`router_track_prefill_tokens=true`), the router counts uncached prompt tokens toward `active_prefill_tokens`, queue pressure, and potential prefill-token load. Disable this for decode-only routing paths where prompt processing has already happened elsewhere and the decode router should ignore transferred prompt load. In normal live disaggregated serving, the decode-stage override applies this behavior automatically.
 - `--router-replica-sync`:  Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly.
 ### KV Indexer / Approx KV Indexer
@@ -280,6 +282,8 @@ Use `--no-router-kv-events` when you are not confident that your backend engine
 Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worker does not reuse transferred KV cache blocks. By default the router assumes KV blocks transferred from prefill to decode will be deduplicated on the decode side, but vLLM and SGLang decode workers currently do not support this — only TensorRT-LLM does. Without this flag, the router undercounts decode blocks when duplicates exist, leading to inaccurate load estimates.
+Use `--no-router-track-prefill-tokens` when a router is serving decode-only traffic and prompt processing has already completed elsewhere. This keeps decode routing decisions focused on decode-side load instead of briefly charging prompt tokens to the decode worker after handoff. The built-in live disaggregated decode path applies the equivalent per-request override automatically.
 Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default).
 The `--router-queue-threshold` (default: 4.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.
@@ -310,6 +314,11 @@ The prefill router is automatically created when:
 - **Seamlessly integrated** into the request pipeline between preprocessing and decode routing
 - **Falls back gracefully** to decode-only mode if prefill fails or no prefill workers are available
+**Key characteristics of the decode routing stage in disaggregated mode:**
+- **Disables overlap scoring** (`overlap_score_weight=0`) because decode routing should not chase prefix reuse
+- **Disables KV reuse assumption** (`assume_kv_reuse=false`) unless the backend can truly deduplicate transferred blocks
+- **Disables prefill-token tracking** (`track_prefill_tokens=false`) so decode-side load reflects decode work rather than already-completed prompt work
 ### Setup Example
 When both workers are registered, requests are automatically routed.

--- a/docs/mocker/mocker.md
+++ b/docs/mocker/mocker.md
@@ -96,7 +96,7 @@ python -m dynamo.mocker \
 | `--sglang-chunked-prefill-size` | 8192 | SGLang chunked-prefill chunk size |
 | `--sglang-clip-max-new-tokens` | 4096 | SGLang admission-budget cap for max new tokens |
 | `--sglang-schedule-conservativeness` | 1.0 | SGLang schedule conservativeness factor |
-| `--aic-perf-model` | False | Use AIC SDK for latency prediction instead of interpolated/polynomial models. Requires `aiconfigurator` SDK installed (install with `pip install ai-dynamo[mocker]`) |
+| `--aic-perf-model` | False | Use AIC SDK for latency prediction instead of interpolated/polynomial models. Opt-in only: default mocker and replay paths do not use AIC. Requires `aiconfigurator` installed and usable AIC systems/perf data for the requested `system/backend/version` tuple |
 | `--aic-system` | `h200_sxm` | AIC system name (e.g., `h200_sxm`). Used with `--aic-perf-model` |
 | `--aic-backend-version` | Auto | AIC backend engine version (e.g., `0.12.0` for vLLM). If not set, uses the default version for the backend |
 | `--aic-tp-size` | 1 | Tensor parallel size for AIC latency prediction. Only affects AIC performance model lookups, not mocker scheduling |
@@ -126,10 +126,12 @@ python -m dynamo.mocker \
 The mocker supports replaying Mooncake-style traces through the dedicated replay CLI, which exposes
 `offline|online`, `round_robin|kv_router`, `arrival_speedup_ratio`, closed-loop concurrency
-admission, and synthetic workload generation directly:
+admission, synthetic workload generation, and offline disaggregated prefill/decode replay directly:
-The replay CLI defaults to `--replay-mode offline` and `--router-mode round_robin`. Engine settings
+The replay CLI defaults to `--replay-mode offline` and `--router-mode round_robin`. Aggregated
-such as `block_size`, `engine_type`, and compute speedups still belong in `--extra-engine-args`.
+replay uses `--extra-engine-args`. Offline disagg replay instead uses
+`--prefill-engine-args` plus `--decode-engine-args`, together with
+`--num-prefill-workers` and `--num-decode-workers`.
 ```bash
 python -m dynamo.replay /path/to/mooncake_trace.jsonl \
@@ -197,6 +199,31 @@ Replay supports aggregated `vllm` and `sglang` engine configs. Internally replay
 `block_size`; for `sglang`, `sglang.page_size` is still accepted as a compatibility alias as long
 as it matches `block_size` when both are provided.
+Offline replay also supports disaggregated `kv_router` mode. In that mode:
+- `--prefill-engine-args` must describe a prefill worker
+- `--decode-engine-args` must describe a decode worker
+- `--router-mode` must be `kv_router`
+- only offline replay is supported
+Example:
+```bash
+python -m dynamo.replay \
+    --input-tokens 4096 \
+    --output-tokens 256 \
+    --request-count 100 \
+    --replay-mode offline \
+    --router-mode kv_router \
+    --replay-concurrency 32 \
+    --num-prefill-workers 2 \
+    --num-decode-workers 6 \
+    --prefill-engine-args '{"worker_type":"prefill","block_size":512}' \
+    --decode-engine-args '{"worker_type":"decode","block_size":512}' \
+    --router-config '{"router_queue_policy":"wspt"}' \
+    --report-json /tmp/replay-report.json
+```
 ## Performance Modeling Setup
 By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, pass `--planner-profile-data` with either:
@@ -223,7 +250,7 @@ python -m dynamo.mocker \
 To use the AIC SDK for latency prediction:
 ```bash
-pip install ai-dynamo[mocker]
+uv pip install '.[mocker]'
 python -m dynamo.mocker \
    --model-path nvidia/Llama-3.1-8B-Instruct-FP8 \
@@ -234,13 +261,33 @@ python -m dynamo.mocker \
 The AIC model automatically uses `--model-path` and `--engine-type` to select the appropriate performance data. Available systems include `h200_sxm`, `h100_sxm`, etc. (see AIC SDK documentation for the full list).
-When using `python -m dynamo.replay`, there are no dedicated AIC flags. Pass the equivalent fields directly via `--extra-engine-args`:
+Important notes:
+- AIC is opt-in. If you do not pass `--aic-perf-model`, `python -m dynamo.mocker` does not use AIC.
+- `python -m dynamo.replay` also does not use AIC unless you explicitly put AIC fields in the engine-args JSON.
+- `aiconfigurator` must be able to load the requested performance database for the selected `system/backend/version`. If the SDK is installed but the backing systems data is missing or unreadable, mocker now fails fast at startup with a clear error instead of failing later on first request.
+- In development environments, this may require pointing Python at a source checkout of `aiconfigurator` with real Git LFS payloads materialized in its `systems/` directory.
+When using `python -m dynamo.replay`, there are no dedicated AIC flags. For aggregated replay,
+pass the equivalent fields via `--extra-engine-args`:
 ```bash
 python -m dynamo.replay /path/to/trace.jsonl \
    --extra-engine-args '{"aic_backend":"vllm","aic_system":"h200_sxm","aic_model_path":"nvidia/Llama-3.1-8B-Instruct-FP8","aic_tp_size":1}'
 ```
+For offline disagg replay, pass the staged engine configs instead:
+```bash
+python -m dynamo.replay /path/to/trace.jsonl \
+    --replay-mode offline \
+    --router-mode kv_router \
+    --prefill-engine-args '{"worker_type":"prefill","aic_backend":"vllm","aic_system":"h200_sxm","aic_model_path":"nvidia/Llama-3.1-8B-Instruct-FP8","aic_tp_size":1,"block_size":512}' \
+    --decode-engine-args '{"worker_type":"decode","aic_backend":"vllm","aic_system":"h200_sxm","aic_model_path":"nvidia/Llama-3.1-8B-Instruct-FP8","aic_tp_size":1,"block_size":512}' \
+    --num-prefill-workers 2 \
+    --num-decode-workers 6
+```
 The `aic_backend` field enables the AIC perf model and should match `engine_type` (`"vllm"` or `"sglang"`). The `aic_model_path` field is the equivalent of `--model-path` in `dynamo.mocker`.
 Example `--reasoning` configuration:
@@ -355,7 +402,7 @@ The mocker supports three timing prediction modes:
 **Interpolated Model:** Loads actual profiling data from an NPZ file containing measured prefill and decode latencies. The mocker interpolates between data points to predict timing for any input size. This enables high-fidelity simulation matching a specific hardware configuration.
-**AIC Model (`--aic-perf-model`):** Uses the NVIDIA AI Configurator (AIC) SDK for latency prediction. AIC provides calibrated performance models for specific GPU/model/engine combinations, predicting prefill and decode latency as a function of batch size, sequence length, and prefix cache hits. The model path is automatically derived from `--model-path`, and the engine type from `--engine-type`. This mode requires the `aiconfigurator` SDK, installable via `pip install ai-dynamo[mocker]`.
+**AIC Model (`--aic-perf-model`):** Uses the NVIDIA AI Configurator (AIC) SDK for latency prediction. AIC provides calibrated performance models for specific GPU/model/engine combinations, predicting prefill and decode latency as a function of batch size, sequence length, and prefix cache hits. The model path is automatically derived from `--model-path`, and the engine type from `--engine-type`. This mode is opt-in and requires both the `aiconfigurator` SDK and loadable systems/perf data for the requested `system/backend/version` tuple.
 ### Bootstrap Rendezvous (Disaggregated Serving)

--- a/lib/bench/kv_router/active_sequences_bench.rs
+++ b/lib/bench/kv_router/active_sequences_bench.rs
@@ -384,6 +384,7 @@ async fn apply_entry(
                    token_sequence: Some(block_hashes),
                    isl,
                    overlap: 0,
+                    track_prefill_tokens: true,
                    expected_output_tokens: Some(output_length as u32),
                    worker,
                    lora_name: None,

--- a/lib/bindings/c/src/lib.rs
+++ b/lib/bindings/c/src/lib.rs
@@ -497,6 +497,8 @@ impl RouterHandles {
        let config_override = if is_disaggregated {
            Some(RouterConfigOverride {
                overlap_score_weight: Some(0.0),
+                assume_kv_reuse: Some(false),
+                track_prefill_tokens: Some(false),
                ..Default::default()
            })
        } else {
@@ -573,6 +575,9 @@ fn kv_router_config_from_env() -> KvRouterConfig {
    if let Some(v) = env_bool("DYN_ROUTER_TRACK_OUTPUT_BLOCKS") {
        cfg.router_track_output_blocks = v;
    }
+    if let Some(v) = env_bool("DYN_ROUTER_TRACK_PREFILL_TOKENS") {
+        cfg.router_track_prefill_tokens = v;
+    }
    if let Some(v) = env_f64("DYN_ROUTER_QUEUE_THRESHOLD") {
        cfg.router_queue_threshold = Some(v);
    }
@@ -584,6 +589,7 @@ fn kv_router_config_from_env() -> KvRouterConfig {
        router_replica_sync = cfg.router_replica_sync,
        router_track_active_blocks = cfg.router_track_active_blocks,
        router_track_output_blocks = cfg.router_track_output_blocks,
+        router_track_prefill_tokens = cfg.router_track_prefill_tokens,
        router_queue_threshold = ?cfg.router_queue_threshold,
        "KvRouterConfig initialized (DYN_* env overrides applied)"
    );
@@ -862,6 +868,12 @@ pub unsafe extern "C" fn add_request(
        tokio::time::timeout(timeout_duration, async {
            let worker = WorkerWithDpRank::new(worker_id, dp_rank);
+            let router_config_override = RouterConfigOverride {
+                overlap_score_weight: Some(0.0),
+                assume_kv_reuse: Some(false),
+                track_prefill_tokens: Some(false),
+                ..Default::default()
+            };
            // Compute overlap_blocks using the public method
            let overlap_blocks = match decode_router
@@ -884,7 +896,7 @@ pub unsafe extern "C" fn add_request(
                    None,
                    worker,
                    None, // lora_name
-                    None, // router_config_override
+                    Some(&router_config_override),
                )
                .await;

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -58,7 +58,7 @@ impl KvRouterConfig {
 #[pymethods]
 impl KvRouterConfig {
    #[new]
-    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_enable_cache_control=false, min_initial_workers=1, router_queue_policy="fcfs", remote_indexer_component=None))]
+    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_enable_cache_control=false, min_initial_workers=1, router_queue_policy="fcfs", remote_indexer_component=None))]
    #[allow(clippy::too_many_arguments)]
    fn new(
        overlap_score_weight: f64,
@@ -69,6 +69,7 @@ impl KvRouterConfig {
        router_track_active_blocks: bool,
        router_track_output_blocks: bool,
        router_assume_kv_reuse: bool,
+        router_track_prefill_tokens: bool,
        router_snapshot_threshold: Option<u32>,
        router_reset_states: bool,
        router_ttl_secs: f64,
@@ -91,6 +92,7 @@ impl KvRouterConfig {
                router_track_active_blocks,
                router_track_output_blocks,
                router_assume_kv_reuse,
+                router_track_prefill_tokens,
                router_snapshot_threshold,
                router_reset_states,
                router_ttl_secs,

--- a/lib/bindings/python/rust/llm/replay.rs
+++ b/lib/bindings/python/rust/llm/replay.rs
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -1217,6 +1217,7 @@ class MockEngineArgs:
        dp_size: int = 1,
        startup_time: Optional[float] = None,
        worker_type: str = "aggregated",
+        planner_profile_data: Optional[str | os.PathLike[str]] = None,
        aic_backend: Optional[str] = None,
        aic_system: Optional[str] = None,
        aic_backend_version: Optional[str] = None,
@@ -1239,6 +1240,8 @@ class MockEngineArgs:
    def from_json(config_json: str) -> "MockEngineArgs":
        ...
+    def dump_json(self) -> str: ...
    @property
    def block_size(self) -> int: ...
@@ -1376,8 +1379,12 @@ async def run_input(runtime: DistributedRuntime, input: str, engine_config: Engi
 def run_mocker_trace_replay(
    trace_file: str | os.PathLike[str],
    extra_engine_args: Optional[MockEngineArgs] = None,
+    prefill_engine_args: Optional[MockEngineArgs] = None,
+    decode_engine_args: Optional[MockEngineArgs] = None,
    router_config: Optional[KvRouterConfig] = None,
    num_workers: int = 1,
+    num_prefill_workers: int = 1,
+    num_decode_workers: int = 1,
    replay_concurrency: Optional[int] = None,
    replay_mode: Literal["offline", "online"] = "offline",
    router_mode: Literal["round_robin", "kv_router"] = "round_robin",
@@ -1391,8 +1398,12 @@ def run_mocker_synthetic_trace_replay(
    output_tokens: int,
    request_count: int,
    extra_engine_args: Optional[MockEngineArgs] = None,
+    prefill_engine_args: Optional[MockEngineArgs] = None,
+    decode_engine_args: Optional[MockEngineArgs] = None,
    router_config: Optional[KvRouterConfig] = None,
    num_workers: int = 1,
+    num_prefill_workers: int = 1,
+    num_decode_workers: int = 1,
    replay_concurrency: Optional[int] = None,
    replay_mode: Literal["offline", "online"] = "offline",
    router_mode: Literal["round_robin", "kv_router"] = "round_robin",

--- a/lib/bindings/python/src/dynamo/replay/api.py
+++ b/lib/bindings/python/src/dynamo/replay/api.py
@@ -11,8 +11,12 @@ def run_trace_replay(
    trace_file,
    *,
    extra_engine_args=None,
+    prefill_engine_args=None,
+    decode_engine_args=None,
    router_config=None,
    num_workers=1,
+    num_prefill_workers=1,
+    num_decode_workers=1,
    replay_concurrency=None,
    replay_mode="offline",
    router_mode="round_robin",
@@ -21,8 +25,12 @@ def run_trace_replay(
    return _run_mocker_trace_replay(
        trace_file,
        extra_engine_args=extra_engine_args,
+        prefill_engine_args=prefill_engine_args,
+        decode_engine_args=decode_engine_args,
        router_config=router_config,
        num_workers=num_workers,
+        num_prefill_workers=num_prefill_workers,
+        num_decode_workers=num_decode_workers,
        replay_concurrency=replay_concurrency,
        replay_mode=replay_mode,
        router_mode=router_mode,
@@ -36,8 +44,12 @@ def run_synthetic_trace_replay(
    request_count,
    *,
    extra_engine_args=None,
+    prefill_engine_args=None,
+    decode_engine_args=None,
    router_config=None,
    num_workers=1,
+    num_prefill_workers=1,
+    num_decode_workers=1,
    replay_concurrency=None,
    replay_mode="offline",
    router_mode="round_robin",
@@ -53,8 +65,12 @@ def run_synthetic_trace_replay(
        output_tokens,
        request_count,
        extra_engine_args=extra_engine_args,
+        prefill_engine_args=prefill_engine_args,
+        decode_engine_args=decode_engine_args,
        router_config=router_config,
        num_workers=num_workers,
+        num_prefill_workers=num_prefill_workers,
+        num_decode_workers=num_decode_workers,
        replay_concurrency=replay_concurrency,
        replay_mode=replay_mode,
        router_mode=router_mode,

--- a/lib/bindings/python/src/dynamo/replay/main.py
+++ b/lib/bindings/python/src/dynamo/replay/main.py
@@ -4,24 +4,80 @@
 from __future__ import annotations
 import argparse
+import importlib
 import json
 import os
 import sys
 from collections.abc import Sequence
 from pathlib import Path
+from types import SimpleNamespace
+from typing import Protocol
 os.environ.setdefault("DYNAMO_SKIP_PYTHON_LOG_INIT", "1")
 from dynamo.llm import KvRouterConfig, MockEngineArgs
-from dynamo.mocker.args import resolve_planner_profile_data
 from dynamo.replay import run_synthetic_trace_replay, run_trace_replay
 from dynamo.replay.reporting import format_report_table, write_report_json
+class PlannerProfileDataResult(Protocol):
+    """Structural type for the value returned by ``resolve_planner_profile_data``."""
+    # Path to an NPZ file, or None when no NPZ path is available.
+    npz_path: Path | None
+def resolve_planner_profile_data(
+    planner_profile_data: Path | None,
+) -> PlannerProfileDataResult:
+    """Resolve planner profile data, delegating to ``dynamo.mocker.args`` when importable.
+    The module is imported lazily; if that import raises ImportError, a local
+    fallback is used instead: ``npz_path`` is None for a None input, the input
+    path itself when its suffix is ``.npz``, and None for any other path.
+    """
+    try:
+        module = importlib.import_module("dynamo.mocker.args")
+    except ImportError:
+        if planner_profile_data is None:
+            return SimpleNamespace(npz_path=None)
+        # NOTE(review): a non-.npz input (e.g. a directory) yields npz_path=None here,
+        # and _load_engine_args then drops the planner_profile_data key without any
+        # message -- confirm that silently ignoring the setting is intended.
+        return SimpleNamespace(
+            npz_path=planner_profile_data
+            if planner_profile_data.suffix == ".npz"
+            else None
+        )
+    return module.resolve_planner_profile_data(planner_profile_data)
+def _load_engine_args(raw_args: str | None):
+    """Parse a JSON engine-args string into ``MockEngineArgs``.
+    Returns None when ``raw_args`` is None. Raises ValueError when the JSON is
+    not an object, when ``worker_type`` is combined with ``is_prefill`` or
+    ``is_decode``, or when ``worker_type`` is not one of the accepted values.
+    Errors from ``json.loads`` are not caught here.
+    """
+    if raw_args is None:
+        return None
+    raw = json.loads(raw_args)
+    if not isinstance(raw, dict):
+        raise ValueError("engine-args must be a JSON object")
+    # worker_type is removed from the payload and translated into the
+    # is_prefill / is_decode flags; "aggregated" sets neither flag.
+    worker_type = raw.pop("worker_type", None)
+    if worker_type is not None:
+        if "is_prefill" in raw or "is_decode" in raw:
+            raise ValueError(
+                "worker_type cannot be combined with is_prefill or is_decode"
+            )
+        if worker_type == "prefill":
+            raw["is_prefill"] = True
+        elif worker_type == "decode":
+            raw["is_decode"] = True
+        elif worker_type != "aggregated":
+            raise ValueError(
+                "worker_type must be one of 'aggregated', 'prefill', or 'decode'"
+            )
+    # Replace planner_profile_data with the resolved NPZ path, or drop the key
+    # entirely when resolution yields no NPZ path.
+    if "planner_profile_data" in raw:
+        profile_data_result = resolve_planner_profile_data(
+            Path(raw["planner_profile_data"])
+        )
+        if profile_data_result.npz_path is not None:
+            raw["planner_profile_data"] = str(profile_data_result.npz_path)
+        else:
+            del raw["planner_profile_data"]
+    return MockEngineArgs.from_json(json.dumps(raw))
 def main(argv: Sequence[str] | None = None) -> int:
    parser = argparse.ArgumentParser(prog="python -m dynamo.replay")
    parser.add_argument("trace_file", nargs="?")
    parser.add_argument("--extra-engine-args")
+    parser.add_argument("--prefill-engine-args")
+    parser.add_argument("--decode-engine-args")
    parser.add_argument("--router-config")
    parser.add_argument("--input-tokens", type=int)
    parser.add_argument("--output-tokens", type=int)
@@ -36,6 +92,8 @@ def main(argv: Sequence[str] | None = None) -> int:
    parser.add_argument("--num-prefix-groups", type=int, default=0)
    parser.add_argument("--inter-turn-delay-ms", type=float, default=0.0)
    parser.add_argument("--num-workers", type=int, default=1)
+    parser.add_argument("--num-prefill-workers", type=int, default=1)
+    parser.add_argument("--num-decode-workers", type=int, default=1)
    parser.add_argument("--replay-concurrency", type=int)
    parser.add_argument(
        "--replay-mode",
@@ -74,24 +132,9 @@ def main(argv: Sequence[str] | None = None) -> int:
            "synthetic replay requires --input-tokens, --output-tokens, and --request-count"
        )
-    # Resolve planner_profile_data directory -> NPZ before passing to Rust.
+    extra_engine_args = _load_engine_args(args.extra_engine_args)
-    # Rust only accepts NPZ files; resolve_planner_profile_data handles conversion.
+    prefill_engine_args = _load_engine_args(args.prefill_engine_args)
-    profile_data_result = None
+    decode_engine_args = _load_engine_args(args.decode_engine_args)
-    if args.extra_engine_args is not None:
-        raw = json.loads(args.extra_engine_args)
-        if "planner_profile_data" in raw:
-            profile_data_result = resolve_planner_profile_data(
-                Path(raw["planner_profile_data"])
-            )
-            if profile_data_result.npz_path is not None:
-                raw["planner_profile_data"] = str(profile_data_result.npz_path)
-            else:
-                del raw["planner_profile_data"]
-            extra_engine_args = MockEngineArgs.from_json(json.dumps(raw))
-        else:
-            extra_engine_args = MockEngineArgs.from_json(args.extra_engine_args)
-    else:
-        extra_engine_args = None
    router_config = (
        KvRouterConfig.from_json(args.router_config)
        if args.router_config is not None
@@ -102,8 +145,12 @@ def main(argv: Sequence[str] | None = None) -> int:
        report = run_trace_replay(
            args.trace_file,
            extra_engine_args=extra_engine_args,
+            prefill_engine_args=prefill_engine_args,
+            decode_engine_args=decode_engine_args,
            router_config=router_config,
            num_workers=args.num_workers,
+            num_prefill_workers=args.num_prefill_workers,
+            num_decode_workers=args.num_decode_workers,
            replay_concurrency=args.replay_concurrency,
            replay_mode=args.replay_mode,
            router_mode=args.router_mode,
@@ -115,8 +162,12 @@ def main(argv: Sequence[str] | None = None) -> int:
            args.output_tokens,
            args.request_count,
            extra_engine_args=extra_engine_args,
+            prefill_engine_args=prefill_engine_args,
+            decode_engine_args=decode_engine_args,
            router_config=router_config,
            num_workers=args.num_workers,
+            num_prefill_workers=args.num_prefill_workers,
+            num_decode_workers=args.num_decode_workers,
            replay_concurrency=args.replay_concurrency,
            replay_mode=args.replay_mode,
            router_mode=args.router_mode,

--- a/lib/bindings/python/tests/test_replay.py
+++ b/lib/bindings/python/tests/test_replay.py
@@ -5,12 +5,13 @@ import json
 import os
 import subprocess
 import sys
+from pathlib import Path
+import numpy as np
 import pytest
 from dynamo.llm import KvRouterConfig, MockEngineArgs
 from dynamo.replay import run_synthetic_trace_replay, run_trace_replay
-from dynamo.replay.main import main
 from dynamo.replay.reporting import format_report_table, write_report_json
 pytestmark = [
@@ -75,6 +76,7 @@ def _router_config_payload():
        "router_track_active_blocks": True,
        "router_track_output_blocks": False,
        "router_assume_kv_reuse": True,
+        "router_track_prefill_tokens": True,
        "router_snapshot_threshold": 1000000,
        "router_reset_states": False,
        "router_ttl_secs": 120.0,
@@ -194,6 +196,14 @@ def _sglang_args():
    return MockEngineArgs.from_json(json.dumps(_sglang_args_payload()))
+def _prefill_args():
+    """Build ``MockEngineArgs`` with ``worker_type="prefill"`` for disagg replay tests."""
+    return MockEngineArgs(block_size=64, speedup_ratio=1000.0, worker_type="prefill")
+def _decode_args():
+    """Build ``MockEngineArgs`` with ``worker_type="decode"`` for disagg replay tests."""
+    return MockEngineArgs(block_size=64, speedup_ratio=1000.0, worker_type="decode")
 def _write_router_config(tmp_path):
    config_path = tmp_path / "router_config.json"
    config_path.write_text(
@@ -229,8 +239,12 @@ def _assert_basic_report_metrics(report):
 def _replay_cli_env() -> dict[str, str]:
+    repo_root = Path(__file__).resolve().parents[4]
    env = os.environ.copy()
-    pythonpath_entries = ["lib/bindings/python/src", "components/src"]
+    pythonpath_entries = [
+        str(repo_root / "lib/bindings/python/src"),
+        str(repo_root / "components/src"),
+    ]
    existing_pythonpath = env.get("PYTHONPATH")
    if existing_pythonpath:
        pythonpath_entries.append(existing_pythonpath)
@@ -238,6 +252,33 @@ def _replay_cli_env() -> dict[str, str]:
    return env
+def _planner_profile_data_npz_path() -> Path:
+    """Return the path of the H200 perf-data NPZ under ``benchmarks/results``."""
+    # parents[4] walks up from this test file; _replay_cli_env names the same
+    # expression repo_root.
+    return (
+        Path(__file__).resolve().parents[4]
+        / "benchmarks/results/H200_TP1P_TP1D_perf_data.npz"
+    )
+def _planner_profile_data_dir_path() -> Path:
+    """Return the path of the H200 profiling-results directory under ``tests/planner``."""
+    # parents[4] walks up from this test file; _replay_cli_env names the same
+    # expression repo_root.
+    return (
+        Path(__file__).resolve().parents[4]
+        / "tests/planner/profiling_results/H200_TP1P_TP1D"
+    )
+def _write_planner_profile_data_npz(tmp_path: Path) -> Path:
+    """Write a small synthetic ``planner_profile_data.npz`` into ``tmp_path`` and return its path."""
+    planner_profile_data = tmp_path / "planner_profile_data.npz"
+    # Five named arrays are saved; decode_itl is the only 2-D one (2x2).
+    np.savez(
+        planner_profile_data,
+        prefill_isl=np.array([128.0, 256.0]),
+        prefill_ttft_ms=np.array([4.0, 8.0]),
+        decode_active_kv_tokens=np.array([1024.0, 2048.0]),
+        decode_context_length=np.array([128.0, 256.0]),
+        decode_itl=np.array([[1.0, 1.5], [2.0, 2.5]]),
+    )
+    return planner_profile_data
 def _run_replay_cli(tmp_path, *args):
    return subprocess.run(
        [
@@ -264,11 +305,27 @@ def _assert_replay_cli_outputs(completed, report_path):
 @pytest.mark.parametrize("engine_type", ["vllm", "sglang"])
 @pytest.mark.parametrize("replay_mode", ["offline", "online"])
 @pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
-def test_run_trace_replay_smoke_matrix(tmp_path, engine_type, replay_mode, router_mode):
+@pytest.mark.parametrize("serving_mode", ["agg", "disagg"])
+def test_run_trace_replay_smoke_matrix(
+    tmp_path, engine_type, replay_mode, router_mode, serving_mode
+):
    trace_path = _write_trace_and_args(tmp_path)
+    if serving_mode == "disagg":
+        if replay_mode != "offline":
+            pytest.skip("disagg replay only supports offline mode")
+        report = run_trace_replay(
+            trace_path,
+            prefill_engine_args=_prefill_args(),
+            decode_engine_args=_decode_args(),
+            router_config=_router_config(),
+            num_prefill_workers=2,
+            num_decode_workers=2,
+            replay_mode=replay_mode,
+            router_mode=router_mode,
+        )
+    else:
        args_path = _vllm_args() if engine_type == "vllm" else _sglang_args()
        num_workers = 1 if router_mode == "round_robin" else 2
        report = run_trace_replay(
            trace_path,
            extra_engine_args=args_path,
@@ -345,12 +402,29 @@ def test_run_trace_replay_supports_multiturn_sessions(tmp_path, replay_mode):
 @pytest.mark.parametrize("engine_type", ["vllm", "sglang"])
 @pytest.mark.parametrize("replay_mode", ["offline", "online"])
 @pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
+@pytest.mark.parametrize("serving_mode", ["agg", "disagg"])
 def test_run_synthetic_trace_replay_smoke_matrix(
-    tmp_path, engine_type, replay_mode, router_mode
+    tmp_path, engine_type, replay_mode, router_mode, serving_mode
 ):
+    if serving_mode == "disagg":
+        if replay_mode != "offline":
+            pytest.skip("disagg replay only supports offline mode")
+        report = run_synthetic_trace_replay(
+            64,
+            2,
+            2,
+            prefill_engine_args=_prefill_args(),
+            decode_engine_args=_decode_args(),
+            router_config=_router_config(),
+            num_prefill_workers=2,
+            num_decode_workers=2,
+            replay_mode=replay_mode,
+            router_mode=router_mode,
+            arrival_interval_ms=5.0,
+        )
+    else:
        args_path = _vllm_args() if engine_type == "vllm" else _sglang_args()
        num_workers = 1 if router_mode == "round_robin" else 2
        report = run_synthetic_trace_replay(
            64,
            2,
@@ -553,6 +627,103 @@ def test_run_trace_replay_accepts_partial_extra_engine_args_json(tmp_path, repla
    )
+@pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
+def test_run_trace_replay_supports_disagg_offline(tmp_path, router_mode):
+    """Offline trace replay with separate prefill and decode engine args produces a valid report."""
+    trace_path = _write_trace_and_args(tmp_path)
+    # Two prefill and two decode workers, exercised under both router modes.
+    report = run_trace_replay(
+        trace_path,
+        prefill_engine_args=_prefill_args(),
+        decode_engine_args=_decode_args(),
+        router_config=_router_config(),
+        num_prefill_workers=2,
+        num_decode_workers=2,
+        replay_mode="offline",
+        router_mode=router_mode,
+    )
+    _assert_basic_report_counts(
+        report,
+        num_requests=2,
+        input_tokens=64,
+        output_tokens=2,
+    )
+    _assert_basic_report_metrics(report)
+@pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
+def test_run_synthetic_trace_replay_disagg_preserves_expected_output_tokens(
+    router_mode,
+):
+    """Synthetic offline disagg replay reports the requested request and token counts."""
+    # Positional arguments 128, 7, 6 correspond to the input_tokens=128,
+    # output_tokens=7 and num_requests=6 checked below.
+    report = run_synthetic_trace_replay(
+        128,
+        7,
+        6,
+        prefill_engine_args=_prefill_args(),
+        decode_engine_args=_decode_args(),
+        router_config=_router_config(),
+        num_prefill_workers=2,
+        num_decode_workers=2,
+        replay_mode="offline",
+        router_mode=router_mode,
+    )
+    _assert_basic_report_counts(
+        report,
+        num_requests=6,
+        input_tokens=128,
+        output_tokens=7,
+    )
+    _assert_basic_report_metrics(report)
+def test_run_trace_replay_rejects_partial_disagg_args(tmp_path):
+    """Passing prefill_engine_args without decode_engine_args must raise."""
+    trace_path = _write_trace_and_args(tmp_path)
+    # Only the error message is matched; the exception type is left unconstrained.
+    with pytest.raises(Exception, match="must be provided together"):
+        run_trace_replay(
+            trace_path,
+            prefill_engine_args=_prefill_args(),
+            replay_mode="offline",
+            router_mode="kv_router",
+        )
+def test_run_trace_replay_rejects_online_disagg(tmp_path):
+    """Disagg engine args combined with replay_mode="online" must raise."""
+    trace_path = _write_trace_and_args(tmp_path)
+    # Only the error message is matched; the exception type is left unconstrained.
+    with pytest.raises(
+        Exception, match="disagg replay only supports replay_mode='offline'"
+    ):
+        run_trace_replay(
+            trace_path,
+            prefill_engine_args=_prefill_args(),
+            decode_engine_args=_decode_args(),
+            router_config=_router_config(),
+            num_prefill_workers=2,
+            num_decode_workers=2,
+            replay_mode="online",
+            router_mode="kv_router",
+        )
+def test_run_trace_replay_rejects_disagg_worker_counts_for_aggregated_mode(tmp_path):
+    """Non-default prefill/decode worker counts with only extra_engine_args must raise."""
+    trace_path = _write_trace_and_args(tmp_path)
+    # Only the error message is matched; the exception type is left unconstrained.
+    with pytest.raises(
+        Exception,
+        match="num_prefill_workers and num_decode_workers are only used for disagg replay",
+    ):
+        run_trace_replay(
+            trace_path,
+            extra_engine_args=MockEngineArgs(block_size=64, speedup_ratio=1000.0),
+            num_workers=1,
+            num_prefill_workers=2,
+            num_decode_workers=2,
+            replay_mode="offline",
+        )
 def test_format_report_table_matches_aiperf_shape():
    report = {
        "mean_ttft_ms": 18.26,
@@ -616,91 +787,49 @@ def test_write_report_json_creates_file(tmp_path):
    )
-def test_replay_cli_prints_table_and_saves_json(tmp_path, monkeypatch, capsys):
+@pytest.mark.timeout(30)
-    report = {
+def test_replay_cli_subprocess_synthetic_smoke(tmp_path):
-        "mean_ttft_ms": 10.0,
+    report_path = tmp_path / "synthetic_report.json"
-        "min_ttft_ms": 9.0,
-        "max_ttft_ms": 12.0,
-        "p99_ttft_ms": 12.0,
-        "p90_ttft_ms": 11.0,
-        "p75_ttft_ms": 10.5,
-        "std_ttft_ms": 1.0,
-        "output_throughput_tok_s": 123.0,
-        "request_throughput_rps": 4.0,
-        "completed_requests": 3,
-    }
-    def fake_run(*args, **kwargs):
-        return report
-    monkeypatch.setattr("dynamo.replay.main.run_synthetic_trace_replay", fake_run)
-    report_path = tmp_path / "cli_report.json"
-    exit_code = main(
+    completed = _run_replay_cli(
-        [
+        tmp_path,
        "--input-tokens",
-            "16",
+        "250",
        "--output-tokens",
-            "8",
+        "25",
        "--request-count",
-            "3",
+        "10",
+        "--num-workers",
+        "4",
+        "--replay-concurrency",
+        "4",
        "--report-json",
        str(report_path),
-        ]
+        "--extra-engine-args",
+        '{"block_size":64,"speedup_ratio":1000.0}',
    )
-    assert exit_code == 0
+    report = _assert_replay_cli_outputs(completed, report_path)
-    stdout = capsys.readouterr().out
+    _assert_basic_report_counts(
-    assert "NVIDIA AIPerf | LLM Metrics" in stdout
+        report,
-    assert "Saved full report to:" in stdout
+        num_requests=10,
-    assert '"completed_requests"' not in stdout
+        input_tokens=250,
-    assert json.loads(report_path.read_text(encoding="utf-8")) == report
+        output_tokens=25,
-def test_replay_cli_passes_multiturn_workload_kwargs(monkeypatch):
-    captured = {}
-    def fake_run(*args, **kwargs):
-        captured["args"] = args
-        captured["kwargs"] = kwargs
-        return {
-            "completed_requests": 4,
-            "request_throughput_rps": 1.0,
-            "output_throughput_tok_s": 1.0,
-        }
-    monkeypatch.setattr("dynamo.replay.main.run_synthetic_trace_replay", fake_run)
-    exit_code = main(
-        [
-            "--input-tokens",
-            "16",
-            "--output-tokens",
-            "8",
-            "--request-count",
-            "2",
-            "--turns-per-session",
-            "2",
-            "--shared-prefix-ratio",
-            "0.5",
-            "--num-prefix-groups",
-            "3",
-            "--inter-turn-delay-ms",
-            "7.0",
-        ]
    )
+    _assert_basic_report_metrics(report)
-    assert exit_code == 0
-    assert captured["args"] == (16, 8, 2)
-    assert captured["kwargs"]["turns_per_session"] == 2
-    assert captured["kwargs"]["shared_prefix_ratio"] == 0.5
-    assert captured["kwargs"]["num_prefix_groups"] == 3
-    assert captured["kwargs"]["inter_turn_delay_ms"] == 7.0
 @pytest.mark.timeout(30)
-def test_replay_cli_subprocess_synthetic_smoke(tmp_path):
+@pytest.mark.parametrize("planner_profile_data_kind", ["dir", "npz"])
-    report_path = tmp_path / "synthetic_report.json"
+def test_replay_cli_subprocess_synthetic_smoke_accepts_planner_profile_data(
+    tmp_path, planner_profile_data_kind
+):
+    report_path = tmp_path / f"synthetic_report_{planner_profile_data_kind}.json"
+    planner_profile_data = (
+        _planner_profile_data_dir_path()
+        if planner_profile_data_kind == "dir"
+        else _write_planner_profile_data_npz(tmp_path)
+    )
    completed = _run_replay_cli(
        tmp_path,
@@ -717,7 +846,13 @@ def test_replay_cli_subprocess_synthetic_smoke(tmp_path):
        "--report-json",
        str(report_path),
        "--extra-engine-args",
-        '{"block_size":64,"speedup_ratio":1000.0}',
+        json.dumps(
+            {
+                "block_size": 64,
+                "speedup_ratio": 1000.0,
+                "planner_profile_data": str(planner_profile_data),
+            }
+        ),
    )
    report = _assert_replay_cli_outputs(completed, report_path)
@@ -798,6 +933,40 @@ def test_replay_cli_subprocess_trace_smoke(tmp_path):
    _assert_basic_report_metrics(report)
+@pytest.mark.timeout(30)
+def test_replay_cli_subprocess_trace_disagg_smoke(tmp_path):
+    """Run the replay CLI in a subprocess on a trace file with disagg prefill/decode args."""
+    trace_path = _write_cli_smoke_trace(tmp_path)
+    report_path = tmp_path / "trace_disagg_report.json"
+    # Engine args are passed as JSON strings using the worker_type key.
+    completed = _run_replay_cli(
+        tmp_path,
+        str(trace_path),
+        "--replay-mode",
+        "offline",
+        "--router-mode",
+        "kv_router",
+        "--num-prefill-workers",
+        "2",
+        "--num-decode-workers",
+        "2",
+        "--report-json",
+        str(report_path),
+        "--prefill-engine-args",
+        '{"block_size":64,"speedup_ratio":1000.0,"worker_type":"prefill"}',
+        "--decode-engine-args",
+        '{"block_size":64,"speedup_ratio":1000.0,"worker_type":"decode"}',
+    )
+    report = _assert_replay_cli_outputs(completed, report_path)
+    _assert_basic_report_counts(
+        report,
+        num_requests=10,
+        input_tokens=250,
+        output_tokens=25,
+    )
+    _assert_basic_report_metrics(report)
 @pytest.mark.timeout(30)
 def test_replay_cli_subprocess_multiturn_trace_smoke(tmp_path):
    trace_path = _write_multiturn_trace(tmp_path)

--- a/lib/kv-router/src/protocols.rs
+++ b/lib/kv-router/src/protocols.rs
@@ -8,6 +8,10 @@ use rustc_hash::FxHashMap;
 use serde::{Deserialize, Serialize};
 use xxhash_rust::xxh3;
+/// Serde default for the `track_prefill_tokens` field of
+/// `ActiveSequenceEventData`: `true` when the field is absent from the payload.
+const fn default_track_prefill_tokens() -> bool {
+    true
+}
 /// The event subject that workers publish KV cache events on.
 pub const KV_EVENT_SUBJECT: &str = "kv-events";
@@ -431,6 +435,8 @@ pub enum ActiveSequenceEventData {
        token_sequence: Option<Vec<SequenceHash>>,
        isl: usize,
        overlap: u32,
+        #[serde(default = "default_track_prefill_tokens")]
+        track_prefill_tokens: bool,
        expected_output_tokens: Option<u32>,
    },
    Free,
@@ -990,14 +996,6 @@ mod tests {
        assert_ne!(lora_a[0], lora_b[0]);
    }
-    #[test]
-    fn test_lora_name_none_matches_legacy() {
-        let tokens: Vec<u32> = (0..8).collect();
-        let hashes_none = compute_block_hash_for_seq(&tokens, 4, BlockHashOptions::default());
-        let hashes_none2 = compute_block_hash_for_seq(&tokens, 4, BlockHashOptions::default());
-        assert_eq!(hashes_none, hashes_none2);
-    }
    #[test]
    fn test_lora_name_empty_string_normalized_to_none() {
        let tokens: Vec<u32> = (0..4).collect();
@@ -1172,16 +1170,6 @@ mod tests {
        assert_eq!(deserialized.block_hashes[1].0, 5);
    }
-    #[test]
-    fn test_router_request_mark_free_backwards_compatible_deserialization() {
-        let request: RouterRequest = serde_json::from_str(r#"{"method":"mark_free"}"#).unwrap();
-        assert!(matches!(
-            request,
-            RouterRequest::MarkFree { request_id: None }
-        ));
-    }
    #[test]
    fn test_router_request_mark_free_serialization_with_request_id() {
        let request = RouterRequest::MarkFree {

--- a/lib/kv-router/src/scheduling/config.rs
+++ b/lib/kv-router/src/scheduling/config.rs
@@ -17,6 +17,10 @@ const fn default_min_initial_workers() -> usize {
    1
 }
+/// Default for `KvRouterConfig::router_track_prefill_tokens`, used both as the
+/// serde default and in `Default for KvRouterConfig`: tracking is enabled.
+const fn default_track_prefill_tokens() -> bool {
+    true
+}
 #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(rename_all = "lowercase")]
 pub enum RouterQueuePolicy {
@@ -63,6 +67,9 @@ pub struct RouterConfigOverride {
    #[builder(default)]
    pub assume_kv_reuse: Option<bool>,
+    #[builder(default)]
+    pub track_prefill_tokens: Option<bool>,
 }
 /// KV Router configuration parameters
@@ -98,6 +105,12 @@ pub struct KvRouterConfig {
    /// When false, generates random hashes (assuming no KV cache reuse).
    pub router_assume_kv_reuse: bool,
+    /// Whether to include prompt-side prefill tokens in active load accounting (default: true).
+    /// When false, prompt tokens are excluded from active prefill token tracking, queue pressure,
+    /// and potential prefill-token load calculations.
+    #[serde(default = "default_track_prefill_tokens")]
+    pub router_track_prefill_tokens: bool,
    /// Threshold for triggering snapshots. If None, no snapshots will be performed.
    #[validate(range(min = 1))]
    pub router_snapshot_threshold: Option<u32>,
@@ -171,6 +184,7 @@ impl Default for KvRouterConfig {
            router_track_active_blocks: true,
            router_track_output_blocks: false,
            router_assume_kv_reuse: true,
+            router_track_prefill_tokens: default_track_prefill_tokens(),
            router_snapshot_threshold: Some(1000000),
            router_reset_states: false,
            router_ttl_secs: 120.0,
@@ -208,6 +222,18 @@ fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationEr
 }
 impl KvRouterConfig {
+    /// Returns the effective `assume_kv_reuse` setting: the override's value
+    /// when an override is given and sets it, otherwise `router_assume_kv_reuse`.
+    pub fn assume_kv_reuse(&self, config_override: Option<&RouterConfigOverride>) -> bool {
+        config_override
+            .and_then(|cfg| cfg.assume_kv_reuse)
+            .unwrap_or(self.router_assume_kv_reuse)
+    }
+    /// Returns the effective `track_prefill_tokens` setting: the override's value
+    /// when an override is given and sets it, otherwise `router_track_prefill_tokens`.
+    pub fn track_prefill_tokens(&self, config_override: Option<&RouterConfigOverride>) -> bool {
+        config_override
+            .and_then(|cfg| cfg.track_prefill_tokens)
+            .unwrap_or(self.router_track_prefill_tokens)
+    }
    /// Compute sequence hashes for active block tracking based on configuration.
    ///
    /// Returns:
@@ -231,9 +257,7 @@ impl KvRouterConfig {
            return Some(Vec::new());
        }
-        let assume_kv_reuse = config_override
+        let assume_kv_reuse = self.assume_kv_reuse(config_override);
-            .and_then(|cfg| cfg.assume_kv_reuse)
-            .unwrap_or(self.router_assume_kv_reuse);
        if assume_kv_reuse {
            let block_hashes = match precomputed_block_hashes {
@@ -290,6 +314,11 @@ mod tests {
        assert_eq!(KvRouterConfig::default().min_initial_workers, 1);
    }
+    // The default configuration must have prefill-token tracking enabled.
+    #[test]
+    fn kv_router_config_defaults_to_tracking_prefill_tokens() {
+        assert!(KvRouterConfig::default().router_track_prefill_tokens);
+    }
    #[test]
    fn kv_router_config_rejects_zero_initial_workers() {
        let cfg = KvRouterConfig {
@@ -332,6 +361,17 @@ mod tests {
        assert_ne!(without_mm, with_mm);
    }
+    // An explicit `Some(false)` must survive a JSON serialize/deserialize
+    // round trip unchanged.
+    #[test]
+    fn router_config_override_serde_round_trip_preserves_track_prefill_tokens() {
+        let serialized = serde_json::to_string(&RouterConfigOverride {
+            track_prefill_tokens: Some(false),
+            ..Default::default()
+        })
+        .unwrap();
+        let deserialized: RouterConfigOverride = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(deserialized.track_prefill_tokens, Some(false));
+    }
    #[test]
    fn compute_seq_hashes_for_tracking_uses_precomputed_block_hashes() {
        let config = KvRouterConfig::default();

--- a/lib/kv-router/src/scheduling/local.rs
+++ b/lib/kv-router/src/scheduling/local.rs
@@ -30,6 +30,7 @@ where
    request_tx: mpsc::Sender<SchedulingRequest>,
    slots: Arc<ActiveSequencesMultiWorker<P>>,
    queue: Arc<SchedulerQueue<P, C, S, Sel>>,
+    track_prefill_tokens_default: bool,
    worker_type: &'static str,
 }
@@ -48,6 +49,7 @@ where
        block_size: u32,
        selector: Sel,
        policy: S,
+        track_prefill_tokens_default: bool,
        cancellation_token: CancellationToken,
        worker_type: &'static str,
        monitor_worker_configs: bool,
@@ -135,6 +137,7 @@ where
            request_tx,
            slots,
            queue,
+            track_prefill_tokens_default,
            worker_type,
        }
    }
@@ -154,6 +157,9 @@ where
        allowed_worker_ids: Option<HashSet<WorkerId>>,
    ) -> Result<SchedulingResponse, KvSchedulerError> {
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
+        let track_prefill_tokens = router_config_override
+            .and_then(|cfg| cfg.track_prefill_tokens)
+            .unwrap_or(self.track_prefill_tokens_default);
        let request = SchedulingRequest {
            maybe_request_id,
            token_seq,
@@ -161,6 +167,7 @@ where
            overlaps,
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
+            track_prefill_tokens,
            router_config_override: router_config_override.cloned(),
            update_states,
            lora_name,
@@ -224,10 +231,16 @@ where
        token_seq: Option<Vec<SequenceHash>>,
        isl_tokens: usize,
        overlaps: OverlapScores,
+        track_prefill_tokens: bool,
    ) -> Vec<PotentialLoad> {
-        let (decode_blocks, prefill_tokens) =
+        let (decode_blocks, prefill_tokens) = self
-            self.slots
+            .slots
-                .potential_blocks_and_tokens(token_seq.as_deref(), isl_tokens, overlaps);
+            .potential_blocks_and_tokens_with_prefill_tracking(
+                token_seq.as_deref(),
+                isl_tokens,
+                overlaps,
+                track_prefill_tokens,
+            );
        let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
        workers.extend(decode_blocks.keys().copied());
@@ -300,6 +313,7 @@ mod tests {
            64,
            DefaultWorkerSelector::new(None, "test"),
            FcfsPolicy,
+            true,
            cancel_token.clone(),
            "test",
            monitor_worker_configs,
@@ -344,6 +358,48 @@ mod tests {
        cancel_token.cancel();
    }
+    // Schedules one request with a per-request override that sets
+    // `track_prefill_tokens` to `Some(false)`, then checks that the worker's
+    // active token count is 0.
+    #[tokio::test]
+    async fn test_schedule_override_can_disable_prefill_tracking() {
+        let mut workers = HashMap::new();
+        workers.insert(
+            0,
+            SimpleWorkerConfig {
+                max_num_batched_tokens: Some(64),
+                ..Default::default()
+            },
+        );
+        let (scheduler, slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true);
+        // NOTE(review): the positional arguments follow `schedule`'s signature,
+        // which is only partly visible in this diff -- confirm their meaning there.
+        scheduler
+            .schedule(
+                Some("req-1".to_string()),
+                64,
+                Some(vec![1, 2, 3, 4]),
+                OverlapScores::default(),
+                Some(&crate::config::RouterConfigOverride {
+                    track_prefill_tokens: Some(false),
+                    ..Default::default()
+                }),
+                true,
+                None,
+                0.0,
+                None,
+                None,
+            )
+            .await
+            .unwrap();
+        // Worker 0 / dp_rank 0 must be present with zero active tokens.
+        assert_eq!(
+            slots
+                .active_tokens()
+                .get(&WorkerWithDpRank::new(0, 0))
+                .copied(),
+            Some(0)
+        );
+        cancel_token.cancel();
+    }
    #[tokio::test]
    async fn test_mark_prefill_completed_drains_pending_queue() {
        let mut workers = HashMap::new();
@@ -474,7 +530,7 @@ mod tests {
            .collect();
        expected.sort_by_key(|load| (load.worker_id, load.dp_rank));
-        let mut actual = scheduler.get_potential_loads(Some(token_seq), 128, overlaps);
+        let mut actual = scheduler.get_potential_loads(Some(token_seq), 128, overlaps, true);
        actual.sort_by_key(|load| (load.worker_id, load.dp_rank));
        assert_eq!(actual.len(), expected.len());
@@ -500,7 +556,7 @@ mod tests {
            make_scheduler(HashMap::new(), None, false);
        scheduler.register_workers(&HashSet::from([42]));
-        let loads = scheduler.get_potential_loads(None, 64, OverlapScores::default());
+        let loads = scheduler.get_potential_loads(None, 64, OverlapScores::default(), true);
        assert_eq!(loads.len(), 1);
        assert_eq!(loads[0].worker_id, 42);
@@ -517,7 +573,7 @@ mod tests {
        assert_eq!(
            scheduler
-                .get_potential_loads(None, 64, OverlapScores::default())
+                .get_potential_loads(None, 64, OverlapScores::default(), true)
                .len(),
            1
        );
@@ -536,7 +592,7 @@ mod tests {
        tokio::time::timeout(Duration::from_secs(1), async {
            loop {
                if scheduler
-                    .get_potential_loads(None, 64, OverlapScores::default())
+                    .get_potential_loads(None, 64, OverlapScores::default(), true)
                    .len()
                    == 3
                {
@@ -550,4 +606,39 @@ mod tests {
        cancel_token.cancel();
    }
+    #[tokio::test]
+    async fn test_get_potential_loads_can_ignore_prefill_tokens() { // Schedules one request with no override, then queries potential loads with the final flag set to `false` and checks the reported prefill-token figure.
+        let mut workers = HashMap::new();
+        workers.insert(
+            0, // single worker, so exactly one load entry is expected below
+            SimpleWorkerConfig {
+                max_num_batched_tokens: Some(256),
+                ..Default::default()
+            },
+        );
+        let (scheduler, _slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true); // NOTE(review): meaning of the `None` / `true` arguments is defined by `make_scheduler`, which is outside this hunk
+        scheduler
+            .schedule(
+                Some("req-1".to_string()), // presumably the request id - confirm against `schedule`'s signature
+                64, // presumably the input sequence length in tokens - confirm
+                Some(vec![11, 22]), // presumably the token/block sequence - confirm
+                OverlapScores::default(),
+                None, // no per-request override is supplied for this request
+                true,
+                None,
+                0.0,
+                None,
+                None,
+            )
+            .await
+            .unwrap(); // the test requires scheduling to succeed; an `Err` panics here
+        let loads = scheduler.get_potential_loads(None, 64, OverlapScores::default(), false); // final `false` is presumably the `track_prefill_tokens` flag - confirm against `get_potential_loads`
+        assert_eq!(loads.len(), 1);
+        assert_eq!(loads[0].potential_prefill_tokens, 64); // NOTE(review): presumably only the already-scheduled request's 64 tokens are counted and the hypothetical request adds none - confirm
+        cancel_token.cancel(); // presumably stops the scheduler's background work - confirm
+    }
 }