chore(replay): refactor offline components into cleaner lanes (#7866)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

chore(replay): refactor offline components into cleaner lanes (#7866)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
95a750f4 · Yan Ru Pei · GitHub · 210bbf5d · 95a750f4 · 95a750f4
Unverified Commit 95a750f4 authored Apr 06, 2026 by Yan Ru Pei Committed by GitHub Apr 06, 2026
20 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2018,6 +2018,7 @@ dependencies = [
 "derive_builder",
 "dynamo-kv-router",
 "dynamo-tokens",
+ "indicatif 0.18.4",
 "ndarray 0.16.1",
 "ndarray-interp",
 "ndarray-npy",

--- a/components/src/dynamo/common/configuration/groups/aic_perf_args.py
+++ b/components/src/dynamo/common/configuration/groups/aic_perf_args.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Shared AIC perf-model configuration ArgGroup."""
+
+from typing import Optional
+
+from dynamo.common.configuration.arg_group import ArgGroup
+from dynamo.common.configuration.config_base import ConfigBase
+from dynamo.common.configuration.utils import add_argument
+
+# Attribute names that make up the AIC perf-model configuration.
+# AicPerfConfigBase.aic_perf_kwargs() iterates over this tuple and reads each
+# name with getattr, so every entry must exist as an attribute on the config.
+_AIC_PERF_FIELDS: tuple[str, ...] = (
+    "aic_backend",
+    "aic_system",
+    "aic_backend_version",
+    "aic_tp_size",
+    "aic_model_path",
+)
+
+
+class AicPerfConfigBase(ConfigBase):
+    """Config base class declaring the AIC perf-model fields.
+
+    The annotated attributes below use the same names as the entries of
+    ``_AIC_PERF_FIELDS``.
+    """
+
+    # Backend family to model (the CLI help gives vllm / sglang as examples).
+    aic_backend: Optional[str]
+    # Hardware/system identifier (the CLI help gives h200_sxm as an example).
+    aic_system: Optional[str]
+    # Pinned backend version used for the AIC database lookup.
+    aic_backend_version: Optional[str]
+    # Tensor-parallel size to model; the matching CLI flag defaults to 1.
+    aic_tp_size: int
+    # Model path or model identifier used for the AIC perf lookup.
+    aic_model_path: Optional[str]
+
+    def aic_perf_kwargs(self) -> dict:
+        """Return a dict mapping each name in ``_AIC_PERF_FIELDS`` to its value on this config."""
+        return {field: getattr(self, field) for field in _AIC_PERF_FIELDS}
+
+
+class AicPerfArgGroup(ArgGroup):
+    """Argument group that registers the ``--aic-*`` perf-model flags on a parser."""
+
+    def add_arguments(self, parser) -> None:
+        """Add the "AIC Perf Model Options" group and its five flags to ``parser``.
+
+        Every flag is registered through the shared ``add_argument`` helper with
+        an environment-variable name and a default. All defaults are ``None``
+        except ``--aic-tp-size``, which defaults to ``1`` and is passed
+        ``arg_type=int``.
+        """
+        # NOTE(review): `parser` is presumably an argparse.ArgumentParser and
+        # `env_var` presumably names an environment-variable fallback for the
+        # flag -- confirm against dynamo.common.configuration.utils.add_argument.
+        g = parser.add_argument_group("AIC Perf Model Options")
+
+        add_argument(
+            g,
+            flag_name="--aic-backend",
+            env_var="DYN_AIC_BACKEND",
+            default=None,
+            help=(
+                "[EXPERIMENTAL] AIC backend family to model "
+                "(for example: vllm or sglang)."
+            ),
+        )
+        add_argument(
+            g,
+            flag_name="--aic-system",
+            env_var="DYN_AIC_SYSTEM",
+            default=None,
+            help=(
+                "[EXPERIMENTAL] AIC hardware/system identifier "
+                "(for example: h200_sxm)."
+            ),
+        )
+        add_argument(
+            g,
+            flag_name="--aic-backend-version",
+            env_var="DYN_AIC_BACKEND_VERSION",
+            default=None,
+            help="[EXPERIMENTAL] Pinned backend version for AIC database lookup.",
+        )
+        # The only non-string flag in this group: integer-typed with a default of 1.
+        add_argument(
+            g,
+            flag_name="--aic-tp-size",
+            env_var="DYN_AIC_TP_SIZE",
+            default=1,
+            help="[EXPERIMENTAL] Tensor parallel size to model in AIC.",
+            arg_type=int,
+        )
+        add_argument(
+            g,
+            flag_name="--aic-model-path",
+            env_var="DYN_AIC_MODEL_PATH",
+            default=None,
+            help=(
+                "[EXPERIMENTAL] Model path or model identifier to use for "
+                "AIC perf lookup."
+            ),
+        )
--- a/components/src/dynamo/common/configuration/groups/kv_router_args.py
+++ b/components/src/dynamo/common/configuration/groups/kv_router_args.py
@@ -27,6 +27,7 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
    "router_track_output_blocks",
    "router_assume_kv_reuse",
    "router_track_prefill_tokens",
+    "router_prefill_load_model",
    "router_snapshot_threshold",
    "router_reset_states",
    "router_ttl_secs",
@@ -51,6 +52,7 @@ class KvRouterConfigBase(ConfigBase):
    router_track_output_blocks: bool
    router_assume_kv_reuse: bool
    router_track_prefill_tokens: bool
+    router_prefill_load_model: str
    router_snapshot_threshold: int
    router_reset_states: bool
    router_ttl_secs: float
@@ -183,6 +185,18 @@ class KvRouterArgGroup(ArgGroup):
                "prefill-token load, queue pressure, and active_prefill_tokens metrics."
            ),
        )
+        add_argument(
+            g,
+            flag_name="--router-prefill-load-model",
+            env_var="DYN_ROUTER_PREFILL_LOAD_MODEL",
+            default="none",
+            choices=["none", "aic"],
+            help=(
+                "[EXPERIMENTAL] KV Router: Prompt-side prefill load model. "
+                "'none' keeps static prompt load accounting. "
+                "'aic' decays the oldest active prefill request using AIC-predicted duration."
+            ),
+        )
        add_argument(
            g,
            flag_name="--router-snapshot-threshold",

--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -8,6 +8,10 @@ from typing import Any, Dict, Optional

 from dynamo.common.config_dump import register_encoder
 from dynamo.common.configuration.arg_group import ArgGroup
+from dynamo.common.configuration.groups.aic_perf_args import (
+    AicPerfArgGroup,
+    AicPerfConfigBase,
+)
 from dynamo.common.configuration.groups.kv_router_args import (
    KvRouterArgGroup,
    KvRouterConfigBase,
@@ -39,7 +43,7 @@ def validate_model_path(value: str) -> str:
    return value


-class FrontendConfig(KvRouterConfigBase):
+class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase):
    """Configuration for the Dynamo frontend."""

    interactive: bool
@@ -98,6 +102,34 @@ class FrontendConfig(KvRouterConfigBase):
                f"--tokenizer: invalid value '{self.tokenizer_backend}' "
                f"(choose from {sorted(self._VALID_TOKENIZER_BACKENDS)})"
            )
+        if self.router_prefill_load_model == "aic":
+            if self.router_mode != "kv":
+                raise ValueError(
+                    "--router-prefill-load-model=aic requires --router-mode=kv"
+                )
+            if self.chat_processor != "dynamo":
+                raise ValueError(
+                    "--router-prefill-load-model=aic currently requires "
+                    "--dyn-chat-processor=dynamo"
+                )
+            missing = [
+                flag
+                for flag, value in (
+                    ("--aic-backend", self.aic_backend),
+                    ("--aic-system", self.aic_system),
+                    ("--aic-model-path", self.aic_model_path),
+                )
+                if not value
+            ]
+            if missing:
+                raise ValueError(
+                    "--router-prefill-load-model=aic requires " + ", ".join(missing)
+                )
+            if not self.router_track_prefill_tokens:
+                raise ValueError(
+                    "--router-prefill-load-model=aic requires "
+                    "--router-track-prefill-tokens"
+                )


 @register_encoder(FrontendConfig)
@@ -214,6 +246,7 @@ class FrontendArgGroup(ArgGroup):

        # KV router options (shared with dynamo.router)
        KvRouterArgGroup().add_arguments(parser)
+        AicPerfArgGroup().add_arguments(parser)

        add_argument(
            g,

--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -29,6 +29,7 @@ import uvloop

 from dynamo.common.config_dump import dump_config
 from dynamo.llm import (
+    AicPerfConfig,
    EngineType,
    EntrypointArgs,
    KvRouterConfig,
@@ -302,6 +303,9 @@ async def async_main():
        ).chat_engine_factory
        kwargs["chat_engine_factory"] = chat_engine_factory

+    if config.router_prefill_load_model == "aic":
+        kwargs["aic_perf_config"] = AicPerfConfig(**config.aic_perf_kwargs())
+
    e = EntrypointArgs(EngineType.Dynamic, **kwargs)
    engine = await make_engine(runtime, e)


--- a/components/src/dynamo/mocker/aic_session.py
+++ b/components/src/dynamo/mocker/aic_session.py
-#  SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#  SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

-"""
-AIC (AI Configurator) direct session wrapper for mocker perf model.
+"""Backward-compatible mocker wrapper around the shared internal AIC bridge."""

-Provides a Python class that wraps the AIC InferenceSession and exposes
-predict_prefill() and predict_decode() methods callable from Rust via PyO3.
-"""
+from dynamo._internal.aic import AicSession, create_session

-import logging
-
-from aiconfigurator.sdk import config
-from aiconfigurator.sdk.backends.factory import get_backend
-from aiconfigurator.sdk.inference_session import InferenceSession
-from aiconfigurator.sdk.models import get_model
-from aiconfigurator.sdk.perf_database import get_database, get_supported_databases
-
-logger = logging.getLogger(__name__)
-
-DEFAULT_BACKEND_VERSIONS = {
-    "vllm": "0.12.0",
-    "sglang": "0.5.6.post2",
-}
-DEFAULT_STATIC_STRIDE = 32
-
-
-class AicSession:
-    """Wraps AIC InferenceSession with predict_prefill/predict_decode methods."""
-
-    def __init__(
-        self,
-        backend_name: str,
-        system: str,
-        model_path: str,
-        tp_size: int,
-        backend_version: str | None = None,
-        moe_tp_size: int | None = None,
-        moe_ep_size: int | None = None,
-        attention_dp_size: int | None = None,
-    ):
-        version = backend_version or DEFAULT_BACKEND_VERSIONS.get(
-            backend_name, DEFAULT_BACKEND_VERSIONS["vllm"]
-        )
-
-        database = get_database(system=system, backend=backend_name, version=version)
-        if database is None:
-            supported = get_supported_databases().get(system, {}).get(backend_name, [])
-            supported_versions = ", ".join(supported) if supported else "<none>"
-            raise RuntimeError(
-                "AIC perf database not found for "
-                f"system={system!r}, backend={backend_name!r}, version={version!r}. "
-                f"Supported versions for this system/backend: {supported_versions}"
-            )
-        # Build ModelConfig. For MoE models, aic_moe_tp_size, aic_moe_ep_size, and
-        # aic_attention_dp_size must be set to satisfy AIC's constraint:
-        #   tp_size * attention_dp_size == moe_tp_size * moe_ep_size
-        # AIC SDK validates this internally and raises a clear AssertionError if violated.
-        effective_dp = attention_dp_size or 1
-        model_config = config.ModelConfig(
-            tp_size=tp_size,
-            moe_tp_size=moe_tp_size,
-            moe_ep_size=moe_ep_size,
-            attention_dp_size=effective_dp,
-        )
-        model = get_model(
-            model_path=model_path,
-            model_config=model_config,
-            backend_name=backend_name,
-        )
-        backend = get_backend(backend_name)
-        self._session = InferenceSession(
-            model=model, database=database, backend=backend
-        )
-        self._database = database
-        self._model = model
-        # AIC models consistently expose model_path, but some do not surface model_name.
-        self._model_name = getattr(model, "model_name", None) or model_path
-        self._config = config
-        logger.info(
-            "AIC session initialized: backend=%s, system=%s, model=%s, tp=%d",
-            backend_name,
-            system,
-            model_path,
-            tp_size,
-        )
-
-    def _predict_context_latency(self, batch_size: int, isl: int, prefix: int) -> float:
-        effective_isl = isl - prefix
-        if effective_isl <= 0:
-            raise ValueError(
-                f"isl must be greater than prefix, got isl={isl}, prefix={prefix}"
-            )
-
-        total_latency = 0.0
-        for op in self._model.context_ops:
-            # AIC operations identify kernels via Operation._name; there is no public name accessor.
-            op_name = getattr(op, "_name", "")
-            x = batch_size if "logits_gemm" in op_name else batch_size * effective_isl
-            result = op.query(
-                self._database,
-                x=x,
-                batch_size=batch_size,
-                beam_width=1,
-                s=effective_isl,
-                prefix=prefix,
-                model_name=self._model_name,
-                seq_imbalance_correction_scale=1.0,
-            )
-            total_latency += float(result)
-
-        return total_latency
-
-    def _predict_generation_latency(self, batch_size: int, isl: int, osl: int) -> float:
-        if osl <= 1:
-            return 0.0
-
-        # BaseModel stores speculative decode width on _nextn, which generation_ops scale by.
-        effective_batch_size = batch_size * (self._model._nextn + 1)
-        total_latency = 0.0
-
-        for step in range(0, osl - 1, DEFAULT_STATIC_STRIDE):
-            step_latency = 0.0
-            for op in self._model.generation_ops:
-                result = op.query(
-                    self._database,
-                    x=effective_batch_size,
-                    batch_size=effective_batch_size,
-                    beam_width=1,
-                    s=isl + step + 1,
-                    model_name=self._model_name,
-                    gen_seq_imbalance_correction_scale=1.0,
-                )
-                step_latency += float(result)
-
-            repeat_count = min(DEFAULT_STATIC_STRIDE, osl - 1 - step)
-            total_latency += step_latency * repeat_count
-
-        return total_latency
-
-    def predict_prefill(
-        self, batch_size: int, isl: int, prefix: int, osl: int
-    ) -> float:
-        """Predict prefill latency in ms. Parameters match AIC RuntimeConfig."""
-        # AIC requires at least 1 new token (isl > prefix)
-        actual_prefix = min(prefix, isl - 1) if isl > 0 else 0
-        return self._predict_context_latency(batch_size, isl, actual_prefix)
-
-    def predict_decode(self, batch_size: int, isl: int, osl: int) -> float:
-        """Predict decode (generation) latency in ms."""
-        return self._predict_generation_latency(batch_size, isl, osl)
-
-
-def create_session(
-    backend_name: str,
-    system: str,
-    model_path: str,
-    tp_size: int,
-    backend_version: str | None = None,
-    moe_tp_size: int | None = None,
-    moe_ep_size: int | None = None,
-    attention_dp_size: int | None = None,
-) -> AicSession:
-    """Factory function called from Rust via PyO3."""
-    return AicSession(
-        backend_name,
-        system,
-        model_path,
-        tp_size,
-        backend_version,
-        moe_tp_size,
-        moe_ep_size,
-        attention_dp_size,
-    )
+__all__ = ["AicSession", "create_session"]
--- a/components/src/dynamo/router/__main__.py
+++ b/components/src/dynamo/router/__main__.py
@@ -18,8 +18,12 @@ from typing import Optional

 import uvloop

-from dynamo.llm import KvRouter, KvRouterConfig
-from dynamo.router.args import DynamoRouterConfig, build_kv_router_config
+from dynamo.llm import AicPerfConfig, KvRouter, KvRouterConfig
+from dynamo.router.args import (
+    DynamoRouterConfig,
+    build_aic_perf_config,
+    build_kv_router_config,
+)
 from dynamo.router.args import parse_args as parse_router_args
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -37,11 +41,13 @@ class StandaloneRouterHandler:
        worker_endpoint_path: str,
        block_size: int,
        kv_router_config: KvRouterConfig,
+        aic_perf_config: Optional[AicPerfConfig],
    ):
        self.runtime = runtime
        self.worker_endpoint_path = worker_endpoint_path
        self.block_size = block_size
        self.kv_router_config = kv_router_config
+        self.aic_perf_config = aic_perf_config
        self.kv_router: Optional[KvRouter] = None
        self.worker_client: Optional[Client] = None

@@ -67,6 +73,7 @@ class StandaloneRouterHandler:
                endpoint=worker_endpoint,
                block_size=self.block_size,
                kv_router_config=self.kv_router_config,
+                aic_perf_config=self.aic_perf_config,
            )

        except Exception as e:
@@ -178,10 +185,15 @@ async def worker(runtime: DistributedRuntime):
    )

    kv_router_config = build_kv_router_config(config)
+    aic_perf_config = build_aic_perf_config(config)

    # Create handler
    handler = StandaloneRouterHandler(
-        runtime, config.endpoint, config.router_block_size, kv_router_config
+        runtime,
+        config.endpoint,
+        config.router_block_size,
+        kv_router_config,
+        aic_perf_config,
    )
    await handler.initialize()


--- a/components/src/dynamo/router/args.py
+++ b/components/src/dynamo/router/args.py
@@ -7,15 +7,19 @@ import argparse
 from typing import Optional

 from dynamo.common.configuration.arg_group import ArgGroup
+from dynamo.common.configuration.groups.aic_perf_args import (
+    AicPerfArgGroup,
+    AicPerfConfigBase,
+)
 from dynamo.common.configuration.groups.kv_router_args import (
    KvRouterArgGroup,
    KvRouterConfigBase,
 )
 from dynamo.common.configuration.utils import add_argument
-from dynamo.llm import KvRouterConfig
+from dynamo.llm import AicPerfConfig, KvRouterConfig


-class DynamoRouterConfig(KvRouterConfigBase):
+class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase):
    """Typed configuration for the standalone KV router (router-owned options only)."""

    namespace: str
@@ -36,6 +40,25 @@ class DynamoRouterConfig(KvRouterConfigBase):
                "Expected format: namespace.component.endpoint"
            )
        self.namespace = parts[0]
+        if self.router_prefill_load_model == "aic":
+            missing = [
+                flag
+                for flag, value in (
+                    ("--aic-backend", self.aic_backend),
+                    ("--aic-system", self.aic_system),
+                    ("--aic-model-path", self.aic_model_path),
+                )
+                if not value
+            ]
+            if missing:
+                raise ValueError(
+                    "--router-prefill-load-model=aic requires " + ", ".join(missing)
+                )
+            if not self.router_track_prefill_tokens:
+                raise ValueError(
+                    "--router-prefill-load-model=aic requires "
+                    "--router-track-prefill-tokens"
+                )


 class DynamoRouterArgGroup(ArgGroup):
@@ -68,6 +91,7 @@ class DynamoRouterArgGroup(ArgGroup):

        # KV router options (shared with dynamo.frontend)
        KvRouterArgGroup().add_arguments(parser)
+        AicPerfArgGroup().add_arguments(parser)


 def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
@@ -75,6 +99,14 @@ def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
    return KvRouterConfig(**router_config.kv_router_kwargs())


+def build_aic_perf_config(
+    router_config: DynamoRouterConfig,
+) -> AicPerfConfig | None:
+    if router_config.router_prefill_load_model != "aic":
+        return None  # AIC prefill-load model not selected, so no AicPerfConfig is built
+    return AicPerfConfig(**router_config.aic_perf_kwargs())  # built from the aic_* settings
+
+
 def parse_args(argv: Optional[list[str]] = None) -> DynamoRouterConfig:
    """Parse command-line arguments for the standalone router.


--- a/docs/benchmarks/kv-router-ab-testing.md
+++ b/docs/benchmarks/kv-router-ab-testing.md
@@ -392,6 +392,13 @@ For this A/B comparison, we use the [**Mooncake FAST'25 Toolagent Trace**](https

 These two requests share blocks 46–57 (12 blocks × 512 tokens = ~6,144 tokens of shared prefix) — a tool agent continuing the same session with accumulated context. Each hash ID represents a **512-token block**, and the hash includes both the current block and all preceding blocks, preserving the pattern of prefix reuse while protecting user privacy. The **KV Smart Router** routes requests with matching hash IDs to the same worker, maximizing cache hits.

+If you reproduce this benchmark with `python -m dynamo.replay`, keep that dataset fact separate from
+the replay engine configuration:
+
+- use `--trace-block-size 512` for the Mooncake/toolagent trace itself
+- keep engine `block_size` in `--extra-engine-args` aligned with the runtime you want to mimic
+  (for the published vLLM deployment, that is typically `64`)
+
 **Key Dataset Properties:**
 - ✅ **Realistic timing:** Request arrival patterns from production tool-agent workloads
 - ✅ **High prefix overlap:** 59% cache ratio ([Mooncake FAST'25 paper](https://github.com/kvcache-ai/Mooncake/blob/main/FAST25-release/Mooncake-FAST25.pdf)); iterative tool calls within sessions produce natural prefix reuse

--- a/docs/benchmarks/mocker-trace-replay.md
+++ b/docs/benchmarks/mocker-trace-replay.md
@@ -28,7 +28,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --num-workers 4 \
    --replay-mode offline \
    --router-mode round_robin \
-    --extra-engine-args '{"block_size":512}' \
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}' \
    --report-json /tmp/replay-report.json
 ```

@@ -99,13 +100,17 @@ Example:
 {"session_id":"session-b","delay_ms":50,"input_length":1536,"output_length":64,"hash_ids":[9,10,11]}
 ```

-The mocker synthesizes token blocks from `hash_ids` using the configured mocker `block_size`, so the
-replay block size must match the block size used when the trace was generated. Public Mooncake
-traces are commonly block-level hashes at `512` tokens per hash ID, so replaying them with the
-default mocker `block_size=64` will fail once `input_length > len(hash_ids) * 64`. Set that
-through `--extra-engine-args '{"block_size":512}'`. For `engine_type=sglang`, replay still uses
-canonical `block_size` internally; `sglang.page_size` is accepted as a compatibility alias and is
-normalized into `block_size` before replay starts.
+Replay uses two different block-size concepts for trace files:
+
+- `--trace-block-size`: how many tokens each `hash_id` in the dataset represents
+- engine `block_size`: the block size used by the replay engine and router when they re-chunk the
+  synthesized tokens into sequence hashes
+
+Public Mooncake/toolagent traces use `512` tokens per `hash_id`, so replaying them should normally
+use `--trace-block-size 512`. The engine `block_size` can still be smaller; for example, the live
+vLLM benchmark setup uses `block_size=64`. For `engine_type=sglang`, replay still uses canonical
+`block_size` internally; `sglang.page_size` is accepted as a compatibility alias and is normalized
+into `block_size` before replay starts.

 ## Replay Surfaces

@@ -122,6 +127,7 @@ The dedicated replay CLI exposes:
 - `--replay-concurrency`
 - `--arrival-interval-ms`
 - `--arrival-speedup-ratio`
+- `--trace-block-size`
 - `--turns-per-session`
 - `--shared-prefix-ratio`
 - `--num-prefix-groups`
@@ -130,6 +136,11 @@ The dedicated replay CLI exposes:
 - `--prefill-engine-args` (JSON string)
 - `--decode-engine-args` (JSON string)
 - `--router-config` (JSON string)
+- `--aic-backend`
+- `--aic-system`
+- `--aic-backend-version`
+- `--aic-tp-size`
+- `--aic-model-path`
 - `--report-json`

 Defaults:
@@ -145,7 +156,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --router-mode kv_router \
    --num-workers 4 \
    --arrival-speedup-ratio 10 \
-    --extra-engine-args '{"block_size":512}' \
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}' \
    --router-config '{"router_queue_policy":"fcfs","router_temperature":0.0}' \
    --report-json /tmp/replay-report.json
 ```
@@ -165,8 +177,15 @@ SGLang replay uses the same CLI surface. A minimal extra-engine-args file can us

 Both `--extra-engine-args` and `--router-config` accept partial JSON objects. Engine settings such
 as `block_size`, `engine_type`, `dp_size`, `speedup_ratio`, and `decode_speedup_ratio` belong in
-`--extra-engine-args`, not as top-level replay CLI flags. Unspecified fields fall back to the same
-defaults used by `MockEngineArgs::default()` and `KvRouterConfig::default()`.
+`--extra-engine-args`, not as top-level replay CLI flags. `--trace-block-size` is separate and is
+used only for trace-file replay. Unspecified fields fall back to the same defaults used by
+`MockEngineArgs::default()` and `KvRouterConfig::default()`.
+
+Replay has two independent AIC surfaces:
+
+- engine timing AIC via `--extra-engine-args` / staged engine JSON
+- router-side prompt-load AIC via top-level `--aic-*` flags together with
+  `router_prefill_load_model: "aic"` in `--router-config`

 Offline disagg replay uses staged engine args instead of `--extra-engine-args`:

@@ -179,7 +198,8 @@ For offline disagg replay, the staged JSON must set `worker_type` explicitly:
 - `--prefill-engine-args` must use `worker_type: "prefill"`
 - `--decode-engine-args` must use `worker_type: "decode"`

-The staged configs must also use the same `block_size`.
+The staged configs must also use the same engine `block_size`. `--trace-block-size` remains a
+separate trace-file input knob.

 ### Synthetic Replay

@@ -223,7 +243,8 @@ those timestamps:
 python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --replay-mode offline \
    --num-workers 4 \
-    --extra-engine-args '{"block_size":512}'
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}'
 ```

 This is the right mode when you want deterministic replay of the original arrival pattern.
@@ -261,7 +282,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --router-mode kv_router \
    --num-workers 4 \
    --arrival-speedup-ratio 10 \
-    --extra-engine-args '{"block_size":512}'
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}'
 ```

 ### Arrival Speedup
@@ -274,7 +296,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --replay-mode offline \
    --num-workers 4 \
    --arrival-speedup-ratio 5 \
-    --extra-engine-args '{"block_size":512}'
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}'
 ```

 ### Router Modes
@@ -292,6 +315,8 @@ provided through `--router-config`, not a dedicated top-level replay flag. In of
 - KV visibility is delayed slightly relative to request lifecycle events
 - queue admission is driven by router lifecycle edges (`add_request`, `mark_prefill_completed`, and `free`)
 - transient in-pass prefill occupancy is still approximated at the router level rather than modeled exactly
+- when `router_prefill_load_model` is `"aic"`, replay predicts one expected prefill duration per
+  admitted request and decays only the oldest active prefill request on each worker

 To compare queue policies manually, keep the same trace and engine args fixed and swap only
 `router_queue_policy` inside `--router-config`:
@@ -301,20 +326,41 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --replay-mode offline \
    --router-mode kv_router \
    --num-workers 4 \
-    --extra-engine-args '{"block_size":512}' \
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}' \
    --router-config '{"router_queue_policy":"fcfs"}'

 python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --replay-mode offline \
    --router-mode kv_router \
    --num-workers 4 \
-    --extra-engine-args '{"block_size":512}' \
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}' \
    --router-config '{"router_queue_policy":"lcfs"}'
 ```

 `lcfs` is intentionally a worse comparison policy under saturation; use it for experiments, not as
 an expected production default.

+To enable router-side AIC prefill-load modeling during replay:
+
+```bash
+python -m dynamo.replay /path/to/mooncake_trace.jsonl \
+    --replay-mode offline \
+    --router-mode kv_router \
+    --num-workers 4 \
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}' \
+    --router-config '{"router_track_prefill_tokens":true,"router_prefill_load_model":"aic"}' \
+    --aic-backend vllm \
+    --aic-system h200_sxm \
+    --aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8 \
+    --aic-tp-size 1
+```
+
+For offline disagg replay, the same top-level `--aic-*` flags are supported, but the estimator is
+applied only to the prefill-stage router.
+
 ## Output

 The report contains:
@@ -366,14 +412,18 @@ If you violate those constraints, replay fails immediately with a validation err
 - mocker compute-speed knobs such as `speedup_ratio` still affect simulated timing when passed via
  the engine-args JSON for the chosen replay mode
 - `--arrival-speedup-ratio` affects trace timestamps, not worker compute speed
+- `--trace-block-size` affects only how trace `hash_ids` expand into tokens
 - `--arrival-interval-ms` only applies to synthetic replay
 - `--turns-per-session`, `--shared-prefix-ratio`, `--num-prefix-groups`, and
  `--inter-turn-delay-ms` only apply to synthetic replay
 - `--extra-engine-args`, `--prefill-engine-args`, `--decode-engine-args`, and `--router-config`
  are JSON strings on the standalone replay CLI
+- top-level `--aic-*` flags are used only for router-side prompt-load modeling; engine timing AIC
+  still belongs in the engine-args JSON
 - offline replay does not need planner runtime setup, router registration, or external event transport
-- the replay block size should match the trace block size, because token synthesis expands `hash_ids`
-  using the configured block size
+- trace-file replay can use different values for `--trace-block-size` and engine `block_size`
+- Mooncake/toolagent traces typically use `--trace-block-size 512`, while engine `block_size`
+  often stays `64`

 ## When To Use This vs AIPerf


--- a/docs/components/frontend/configuration.md
+++ b/docs/components/frontend/configuration.md
@@ -42,11 +42,27 @@ The Rust HTTP server also reads these environment variables (not exposed as CLI
 | `--router-track-active-blocks` / `--no-router-track-active-blocks` | `DYN_ROUTER_TRACK_ACTIVE_BLOCKS` | `true` | Track blocks used by in-progress requests for load balancing |
 | `--router-assume-kv-reuse` / `--no-router-assume-kv-reuse` | `DYN_ROUTER_ASSUME_KV_REUSE` | `true` | Assume KV cache reuse when tracking active blocks |
 | `--router-track-output-blocks` / `--no-router-track-output-blocks` | `DYN_ROUTER_TRACK_OUTPUT_BLOCKS` | `false` | Track output blocks with fractional decay during generation |
+| `--router-track-prefill-tokens` / `--no-router-track-prefill-tokens` | `DYN_ROUTER_TRACK_PREFILL_TOKENS` | `true` | Track prompt-side prefill load in worker load accounting |
+| `--router-prefill-load-model` | `DYN_ROUTER_PREFILL_LOAD_MODEL` | `none` | Prompt-side load model: `none` for static load, `aic` for oldest-prefill decay using an AIC prediction |
 | `--router-event-threads` | `DYN_ROUTER_EVENT_THREADS` | `4` | Event processing threads. >1 enables concurrent radix tree |
 | `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `4.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling |
 | `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
 | `--decode-fallback` / `--no-decode-fallback` | `DYN_DECODE_FALLBACK` | `false` | Fall back to aggregated mode when prefill workers unavailable |

+## AIC Prefill Load Model
+
+These options are used only when `--router-mode kv` and `--router-prefill-load-model aic` are enabled.
+
+| CLI Argument | Env Var | Default | Description |
+|-------------|---------|---------|-------------|
+| `--aic-backend` | `DYN_AIC_BACKEND` | — | Backend family to model in AIC, for example `vllm` or `sglang` |
+| `--aic-system` | `DYN_AIC_SYSTEM` | — | AIC hardware/system identifier, for example `h200_sxm` |
+| `--aic-model-path` | `DYN_AIC_MODEL_PATH` | — | Model path or model identifier used for AIC perf lookup |
+| `--aic-backend-version` | `DYN_AIC_BACKEND_VERSION` | backend-specific | Pinned AIC database version. If omitted, Dynamo uses the backend default |
+| `--aic-tp-size` | `DYN_AIC_TP_SIZE` | `1` | Tensor-parallel size to model in AIC |
+
+When enabled, the frontend's embedded KV router predicts one expected prefill duration per admitted request, using the selected worker's overlap-derived cached prefix. The router then decays only the oldest active prefill request on each worker for prompt-side load accounting.
+
 ## Fault Tolerance

 | CLI Argument | Env Var | Default | Description |

--- a/docs/components/router/router-guide.md
+++ b/docs/components/router/router-guide.md
@@ -89,6 +89,8 @@ Backend workers register themselves using the `register_model` API, after which
 | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
 | `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
 | `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
+| `--router-track-prefill-tokens` / `--no-router-track-prefill-tokens` | `--router-track-prefill-tokens` | Include prompt-side load in active worker load accounting |
+| `--router-prefill-load-model <none\|aic>` | `none` | Prompt-side load model. `aic` decays only the oldest active prefill using an AIC-predicted duration |
 | `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` |
 | `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |

@@ -96,6 +98,52 @@ For all available options: `python -m dynamo.frontend --help`

 For detailed configuration options and tuning parameters, see [Advanced Router Usage](#advanced-router-usage).

+#### AIC Prefill Load Model
+
+The KV router can use AIC to estimate the expected duration of the selected worker's prompt-side prefill work. When enabled, the router:
+
+- computes `prefix = overlap_blocks * block_size` for the chosen worker
+- computes `effective_isl = input_tokens - prefix`
+- stores one prompt-load hint for the admitted request
+- decays only the **oldest** active prefill request on each worker over time
+
+This affects router-side prompt load accounting only. It does not change backend execution or decode-side accounting.
+
+Enable it on the frontend like this:
+
+```bash
+python -m dynamo.frontend \
+    --router-mode kv \
+    --router-prefill-load-model aic \
+    --aic-backend vllm \
+    --aic-system h200_sxm \
+    --aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8
+```
+
+The standalone router uses the same AIC flags:
+
+```bash
+python -m dynamo.router \
+    --endpoint dynamo.prefill.generate \
+    --router-prefill-load-model aic \
+    --aic-backend vllm \
+    --aic-system h200_sxm \
+    --aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8
+```
+
+Required when `--router-prefill-load-model=aic` is enabled:
+
+- `--router-mode kv` on the frontend
+- `--router-track-prefill-tokens` (enabled by default; do not pass `--no-router-track-prefill-tokens`)
+- `--aic-backend`
+- `--aic-system`
+- `--aic-model-path`
+
+Optional AIC knobs:
+
+- `--aic-backend-version`: pinned AIC database version; if omitted, Dynamo uses a backend-specific default
+- `--aic-tp-size`: tensor-parallel size for the modeled backend; defaults to `1`
+
 ### Kubernetes Deployment

 To enable the KV Router in Kubernetes, add the `DYN_ROUTER_MODE` environment variable to your frontend service:
@@ -235,6 +283,10 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na

 - `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.

+- `--router-track-prefill-tokens`: Enables prompt-side load accounting in the worker cost model. This should stay enabled if you want queue thresholds, `active_prefill_tokens`, and AIC prefill load decay to reflect prompt work.
+
+- `--router-prefill-load-model`: Selects the router's prompt-side load model. `none` keeps the existing static prompt load accounting. `aic` predicts one expected prefill duration per admitted request and lazily decays only the oldest active prefill request on each worker.
+
 - `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 4.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `priority` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately).

 - `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Three policies are available:
@@ -292,6 +344,8 @@ Use `--router-track-output-blocks` **(experimental)** when your workload is outp

 The `--router-queue-threshold` (default: 4.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.

+Use `--router-prefill-load-model aic` **(experimental)** when you want prompt-side load tracking to decay the oldest active prefill request using an AIC-predicted duration instead of keeping prompt load static until first token. This requires `--router-track-prefill-tokens` and the shared `--aic-*` config (`--aic-backend`, `--aic-system`, and `--aic-model-path`; `--aic-tp-size` defaults to `1`, and `--aic-backend-version` is optional). This path is still experimental because the decay model is based on expected prefill duration rather than observed worker-side progress.
+
 Use `--router-queue-policy wspt` when your workload has a mix of short and long requests and you want to minimize **average** TTFT. WSPT (Smith's rule) schedules short or high-priority requests first, reducing mean latency across the batch. Use the default `fcfs` when you want to minimize **tail** TTFT — no request waits longer than necessary, since ordering is purely by (adjusted) arrival time.

 ### Prometheus Metrics

--- a/docs/mocker/mocker.md
+++ b/docs/mocker/mocker.md
@@ -139,7 +139,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
    --replay-mode offline \
    --router-mode kv_router \
    --arrival-speedup-ratio 5 \
-    --extra-engine-args '{"block_size":512}' \
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}' \
    --router-config '{"router_queue_policy":"fcfs"}' \
    --report-json /tmp/replay-report.json
 ```
@@ -184,6 +185,11 @@ first turn uses `timestamp`/`created_time`; later turns can use `delay` or `dela
 {"session_id":"session-a","delay":250,"input_length":2560,"output_length":128,"hash_ids":[1,2,3,4,5]}
 ```

+For trace-file replay, `--trace-block-size` controls how many tokens each `hash_id` represents in
+the dataset, while engine `block_size` still controls the replay engine and router hashing. Public
+Mooncake/toolagent traces use `--trace-block-size 512`; engine `block_size` can still stay at `64`
+to match the live runtime configuration.
+
 The standalone replay CLI prints an AIPerf-style summary table to stdout and writes the full replay
 report JSON to disk.

@@ -264,12 +270,19 @@ The AIC model automatically uses `--model-path` and `--engine-type` to select th
 Important notes:

 - AIC is opt-in. If you do not pass `--aic-perf-model`, `python -m dynamo.mocker` does not use AIC.
- `python -m dynamo.replay` also does not use AIC unless you explicitly put AIC fields in the engine-args JSON.
+- `python -m dynamo.replay` has two separate AIC surfaces:
+  - engine timing AIC through `--extra-engine-args` / staged engine JSON
+  - router-side prefill-load AIC through top-level `--aic-*` flags plus `router_prefill_load_model="aic"` in `--router-config`
+- The Python AIC session bridge is now shared with the live KV router path via the internal `dynamo._internal.aic` module. Mocker CLI behavior is unchanged; this just removes duplicate AIC session code.
 - `aiconfigurator` must be able to load the requested performance database for the selected `system/backend/version`. If the SDK is installed but the backing systems data is missing or unreadable, mocker now fails fast at startup with a clear error instead of failing later on first request.
 - In development environments, this may require pointing Python at a source checkout of `aiconfigurator` with real Git LFS payloads materialized in its `systems/` directory.

-When using `python -m dynamo.replay`, there are no dedicated AIC flags. For aggregated replay,
-pass the equivalent fields via `--extra-engine-args`:
+This mocker AIC path is separate from the router-side prefill-load estimator. The live router and
+frontend (`--router-prefill-load-model aic`) and replay (`router_prefill_load_model="aic"` in `--router-config`) all pair that setting with top-level `--aic-*` flags for
+oldest-prefill prompt-load decay. Replay still uses engine-args AIC separately when you want the
+mocked worker timing model itself to come from AIC.
+
+For aggregated replay, engine timing AIC still comes from `--extra-engine-args`:

 ```bash
 python -m dynamo.replay /path/to/trace.jsonl \
@@ -290,6 +303,25 @@ python -m dynamo.replay /path/to/trace.jsonl \

 The `aic_backend` field enables the AIC perf model and should match `engine_type` (`"vllm"` or `"sglang"`). The `aic_model_path` field is the equivalent of `--model-path` in `dynamo.mocker`.

+Replay router-side AIC prompt-load modeling is configured separately with top-level flags:
+
+```bash
+python -m dynamo.replay /path/to/trace.jsonl \
+    --replay-mode offline \
+    --router-mode kv_router \
+    --num-workers 4 \
+    --trace-block-size 512 \
+    --extra-engine-args '{"block_size":64}' \
+    --router-config '{"router_track_prefill_tokens":true,"router_prefill_load_model":"aic"}' \
+    --aic-backend vllm \
+    --aic-system h200_sxm \
+    --aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8 \
+    --aic-tp-size 1
+```
+
+For offline disaggregated replay, the same top-level `--aic-*` flags drive the prefill-stage router only;
+the decode-stage router keeps prompt tracking disabled.
+
 Example `--reasoning` configuration:

 ```bash

--- a/lib/bench/kv_router/active_sequences_bench.rs
+++ b/lib/bench/kv_router/active_sequences_bench.rs
@@ -366,6 +366,7 @@ async fn apply_entry(
    worker: WorkerWithDpRank,
    entry: SequenceTraceEntry,
 ) {
+    let decay_now = tokio::time::Instant::now();
    match entry {
        SequenceTraceEntry::Add {
            request_id,
@@ -377,23 +378,28 @@ async fn apply_entry(
                Some(&block_hashes),
                isl,
                OverlapScores::default(),
+                decay_now,
+            );
+            let _ = multi.add_request(
+                SequenceRequest {
+                    request_id,
+                    token_sequence: Some(block_hashes),
+                    isl,
+                    overlap: 0,
+                    track_prefill_tokens: true,
+                    expected_output_tokens: Some(output_length as u32),
+                    prefill_load_hint: None,
+                    worker,
+                    lora_name: None,
+                },
+                decay_now,
            );
-            let _ = multi.add_request(SequenceRequest {
-                request_id,
-                token_sequence: Some(block_hashes),
-                isl,
-                overlap: 0,
-                track_prefill_tokens: true,
-                expected_output_tokens: Some(output_length as u32),
-                worker,
-                lora_name: None,
-            });
        }
        SequenceTraceEntry::PrefillComplete { request_id } => {
-            let _ = multi.mark_prefill_completed(&request_id);
+            let _ = multi.mark_prefill_completed(&request_id, decay_now);
        }
        SequenceTraceEntry::Free { request_id } => {
-            let _ = multi.free(&request_id);
+            let _ = multi.free(&request_id, decay_now);
        }
    }
 }

--- a/lib/bench/src/bin/offline_replay_bench.rs
+++ b/lib/bench/src/bin/offline_replay_bench.rs
@@ -49,8 +49,12 @@ struct Args {
    #[arg(long, default_value_t = 4.0)]
    arrival_speedup_ratio: f64,

-    /// Mocker block size; defaults to 512 for Mooncake traces
+    /// Trace hash block size used to expand hash_ids into tokens
    #[arg(long, default_value_t = 512)]
+    trace_block_size: usize,
+
+    /// Engine/router block size used for replay hashing and mock execution
+    #[arg(long, default_value_t = 64)]
    block_size: usize,

    /// Override max running requests per worker
@@ -115,7 +119,9 @@ fn main() -> Result<()> {
        last_report = Some(simulate_trace_file_with_router_mode(
            engine_args.clone(),
            None,
+            None,
            &args.trace_file,
+            args.trace_block_size,
            args.num_workers,
            args.arrival_speedup_ratio,
            args.router_mode.into(),

--- a/lib/bindings/c/Cargo.toml
+++ b/lib/bindings/c/Cargo.toml
@@ -30,7 +30,6 @@ crate-type = ["cdylib", "staticlib"]
 cbindgen = "0.27"

 [dependencies]
-dynamo-llm = { path = "../../llm" }
 dynamo-kv-router = { path = "../../kv-router" }
 dynamo-runtime = { path = "../../runtime" }

@@ -43,3 +42,9 @@ async-once-cell = { version = "0.5.4" }
 libc = { version = "0.2" }
 once_cell = { version = "1" }
 tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
+
+[target.'cfg(target_os = "linux")'.dependencies]
+dynamo-llm = { path = "../../llm" }
+
+[target.'cfg(not(target_os = "linux"))'.dependencies]
+dynamo-llm = { path = "../../llm", default-features = false }
--- a/lib/bindings/c/src/lib.rs
+++ b/lib/bindings/c/src/lib.rs
@@ -716,6 +716,7 @@ pub unsafe extern "C" fn create_routers(
                &endpoint,
                block_size,
                Some(kv_router_config.clone()),
+                None,
                WORKER_TYPE_DECODE,
                Some(model_name.clone()),
                enable_eagle,
@@ -781,6 +782,7 @@ pub unsafe extern "C" fn create_routers(
            RouterMode::KV,
            block_size,
            Some(prefill_config),
+            None,
            enforce_disagg,
            model_name.clone(),
            actual_namespace.clone(),

--- a/lib/bindings/kvbm/Cargo.lock
+++ b/lib/bindings/kvbm/Cargo.lock
@@ -131,9 +131,9 @@ dependencies = [

 [[package]]
 name = "arc-swap"
-version = "1.9.0"
+version = "1.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6"
+checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
 dependencies = [
 "rustversion",
 ]
@@ -920,6 +920,18 @@ dependencies = [
 "windows-sys 0.59.0",
 ]

+[[package]]
+name = "console"
+version = "0.16.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "unicode-width",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "const-oid"
 version = "0.9.6"
@@ -1410,7 +1422,7 @@ version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de"
 dependencies = [
- "console",
+ "console 0.15.11",
 "shell-words",
 "tempfile",
 "thiserror 1.0.69",
@@ -1655,6 +1667,7 @@ dependencies = [
 "derive_builder",
 "dynamo-kv-router",
 "dynamo-tokens",
+ "indicatif 0.18.4",
 "ndarray",
 "ndarray-interp",
 "ndarray-npy",
@@ -2037,11 +2050,11 @@ dependencies = [

 [[package]]
 name = "fastrand"
-version = "2.3.0"
+version = "2.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
 dependencies = [
- "getrandom 0.2.17",
+ "getrandom 0.3.4",
 ]

 [[package]]
@@ -2489,7 +2502,7 @@ dependencies = [
 "dirs",
 "futures",
 "http",
- "indicatif",
+ "indicatif 0.17.11",
 "libc",
 "log",
 "num_cpus",
@@ -2865,13 +2878,26 @@ version = "0.17.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
 dependencies = [
- "console",
+ "console 0.15.11",
 "number_prefix",
 "portable-atomic",
 "unicode-width",
 "web-time",
 ]

+[[package]]
+name = "indicatif"
+version = "0.18.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
+dependencies = [
+ "console 0.16.3",
+ "portable-atomic",
+ "unicode-width",
+ "unit-prefix",
+ "web-time",
+]
+
 [[package]]
 name = "indoc"
 version = "2.0.7"
@@ -7239,6 +7265,12 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"

+[[package]]
+name = "unit-prefix"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
+
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"

--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -938,6 +938,18 @@ dependencies = [
 "windows-sys 0.59.0",
 ]

+[[package]]
+name = "console"
+version = "0.16.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
+dependencies = [
+ "encode_unicode",
+ "libc",
+ "unicode-width",
+ "windows-sys 0.61.2",
+]
+
 [[package]]
 name = "const-oid"
 version = "0.9.6"
@@ -1428,7 +1440,7 @@ version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de"
 dependencies = [
- "console",
+ "console 0.15.11",
 "shell-words",
 "tempfile",
 "thiserror 1.0.69",
@@ -1670,6 +1682,7 @@ dependencies = [
 "derive_builder",
 "dynamo-kv-router",
 "dynamo-tokens",
+ "indicatif 0.18.4",
 "ndarray",
 "ndarray-interp",
 "ndarray-npy",
@@ -2561,7 +2574,7 @@ dependencies = [
 "dirs",
 "futures",
 "http",
- "indicatif",
+ "indicatif 0.17.11",
 "libc",
 "log",
 "num_cpus",
@@ -2937,13 +2950,26 @@ version = "0.17.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
 dependencies = [
- "console",
+ "console 0.15.11",
 "number_prefix",
 "portable-atomic",
 "unicode-width",
 "web-time",
 ]

+[[package]]
+name = "indicatif"
+version = "0.18.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
+dependencies = [
+ "console 0.16.3",
+ "portable-atomic",
+ "unicode-width",
+ "unit-prefix",
+ "web-time",
+]
+
 [[package]]
 name = "indoc"
 version = "2.0.7"
@@ -7309,6 +7335,12 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"

+[[package]]
+name = "unit-prefix"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
+
 [[package]]
 name = "unsafe-libyaml"
 version = "0.2.11"

--- a/lib/bindings/python/rust/lib.rs
+++ b/lib/bindings/python/rust/lib.rs
@@ -169,6 +169,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<llm::entrypoint::EntrypointArgs>()?;
    m.add_class::<llm::entrypoint::EngineConfig>()?;
    m.add_class::<llm::entrypoint::EngineType>()?;
+    m.add_class::<llm::entrypoint::AicPerfConfig>()?;
    m.add_class::<llm::entrypoint::RouterConfig>()?;
    m.add_class::<llm::entrypoint::KvRouterConfig>()?;
    m.add_class::<llm::replay::ReasoningConfig>()?;