"docs/vscode:/vscode.git/clone" did not exist on "28546bad2602e67eb9436de6198776855ba08040"
Unverified Commit 95a750f4 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

chore(replay): refactor offline components into cleaner lanes (#7866)


Signed-off-by: default avatarPeaBrane <yanrpei@gmail.com>
parent 210bbf5d
......@@ -2018,6 +2018,7 @@ dependencies = [
"derive_builder",
"dynamo-kv-router",
"dynamo-tokens",
"indicatif 0.18.4",
"ndarray 0.16.1",
"ndarray-interp",
"ndarray-npy",
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Shared AIC perf-model configuration ArgGroup."""
from typing import Optional
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase
from dynamo.common.configuration.utils import add_argument
# Attribute names that AicPerfConfigBase.aic_perf_kwargs() reads off a config
# instance and uses as the keys of the dict it returns.
_AIC_PERF_FIELDS: tuple[str, ...] = (
    "aic_backend",
    "aic_system",
    "aic_backend_version",
    "aic_tp_size",
    "aic_model_path",
)
class AicPerfConfigBase(ConfigBase):
    """Config base declaring the shared ``aic_*`` perf-model fields."""

    aic_backend: Optional[str]
    aic_system: Optional[str]
    aic_backend_version: Optional[str]
    aic_tp_size: int
    aic_model_path: Optional[str]

    def aic_perf_kwargs(self) -> dict:
        """Return a dict mapping each name in ``_AIC_PERF_FIELDS`` to its value on ``self``."""
        collected = {}
        for name in _AIC_PERF_FIELDS:
            collected[name] = getattr(self, name)
        return collected
class AicPerfArgGroup(ArgGroup):
    """Argument group that registers the ``--aic-*`` perf-model options."""

    def add_arguments(self, parser) -> None:
        """Add the "AIC Perf Model Options" group and its five flags to ``parser``."""
        group = parser.add_argument_group("AIC Perf Model Options")

        # One entry per flag; each dict is forwarded verbatim as keyword
        # arguments to add_argument, in declaration order.
        option_specs = (
            dict(
                flag_name="--aic-backend",
                env_var="DYN_AIC_BACKEND",
                default=None,
                help=(
                    "[EXPERIMENTAL] AIC backend family to model "
                    "(for example: vllm or sglang)."
                ),
            ),
            dict(
                flag_name="--aic-system",
                env_var="DYN_AIC_SYSTEM",
                default=None,
                help=(
                    "[EXPERIMENTAL] AIC hardware/system identifier "
                    "(for example: h200_sxm)."
                ),
            ),
            dict(
                flag_name="--aic-backend-version",
                env_var="DYN_AIC_BACKEND_VERSION",
                default=None,
                help="[EXPERIMENTAL] Pinned backend version for AIC database lookup.",
            ),
            dict(
                flag_name="--aic-tp-size",
                env_var="DYN_AIC_TP_SIZE",
                default=1,
                help="[EXPERIMENTAL] Tensor parallel size to model in AIC.",
                arg_type=int,
            ),
            dict(
                flag_name="--aic-model-path",
                env_var="DYN_AIC_MODEL_PATH",
                default=None,
                help=(
                    "[EXPERIMENTAL] Model path or model identifier to use for "
                    "AIC perf lookup."
                ),
            ),
        )
        for spec in option_specs:
            add_argument(group, **spec)
......@@ -27,6 +27,7 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
"router_track_output_blocks",
"router_assume_kv_reuse",
"router_track_prefill_tokens",
"router_prefill_load_model",
"router_snapshot_threshold",
"router_reset_states",
"router_ttl_secs",
......@@ -51,6 +52,7 @@ class KvRouterConfigBase(ConfigBase):
router_track_output_blocks: bool
router_assume_kv_reuse: bool
router_track_prefill_tokens: bool
router_prefill_load_model: str
router_snapshot_threshold: int
router_reset_states: bool
router_ttl_secs: float
......@@ -183,6 +185,18 @@ class KvRouterArgGroup(ArgGroup):
"prefill-token load, queue pressure, and active_prefill_tokens metrics."
),
)
add_argument(
g,
flag_name="--router-prefill-load-model",
env_var="DYN_ROUTER_PREFILL_LOAD_MODEL",
default="none",
choices=["none", "aic"],
help=(
"[EXPERIMENTAL] KV Router: Prompt-side prefill load model. "
"'none' keeps static prompt load accounting. "
"'aic' decays the oldest active prefill request using AIC-predicted duration."
),
)
add_argument(
g,
flag_name="--router-snapshot-threshold",
......
......@@ -8,6 +8,10 @@ from typing import Any, Dict, Optional
from dynamo.common.config_dump import register_encoder
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.groups.aic_perf_args import (
AicPerfArgGroup,
AicPerfConfigBase,
)
from dynamo.common.configuration.groups.kv_router_args import (
KvRouterArgGroup,
KvRouterConfigBase,
......@@ -39,7 +43,7 @@ def validate_model_path(value: str) -> str:
return value
class FrontendConfig(KvRouterConfigBase):
class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase):
"""Configuration for the Dynamo frontend."""
interactive: bool
......@@ -98,6 +102,34 @@ class FrontendConfig(KvRouterConfigBase):
f"--tokenizer: invalid value '{self.tokenizer_backend}' "
f"(choose from {sorted(self._VALID_TOKENIZER_BACKENDS)})"
)
if self.router_prefill_load_model == "aic":
if self.router_mode != "kv":
raise ValueError(
"--router-prefill-load-model=aic requires --router-mode=kv"
)
if self.chat_processor != "dynamo":
raise ValueError(
"--router-prefill-load-model=aic currently requires "
"--dyn-chat-processor=dynamo"
)
missing = [
flag
for flag, value in (
("--aic-backend", self.aic_backend),
("--aic-system", self.aic_system),
("--aic-model-path", self.aic_model_path),
)
if not value
]
if missing:
raise ValueError(
"--router-prefill-load-model=aic requires " + ", ".join(missing)
)
if not self.router_track_prefill_tokens:
raise ValueError(
"--router-prefill-load-model=aic requires "
"--router-track-prefill-tokens"
)
@register_encoder(FrontendConfig)
......@@ -214,6 +246,7 @@ class FrontendArgGroup(ArgGroup):
# KV router options (shared with dynamo.router)
KvRouterArgGroup().add_arguments(parser)
AicPerfArgGroup().add_arguments(parser)
add_argument(
g,
......
......@@ -29,6 +29,7 @@ import uvloop
from dynamo.common.config_dump import dump_config
from dynamo.llm import (
AicPerfConfig,
EngineType,
EntrypointArgs,
KvRouterConfig,
......@@ -302,6 +303,9 @@ async def async_main():
).chat_engine_factory
kwargs["chat_engine_factory"] = chat_engine_factory
if config.router_prefill_load_model == "aic":
kwargs["aic_perf_config"] = AicPerfConfig(**config.aic_perf_kwargs())
e = EntrypointArgs(EngineType.Dynamic, **kwargs)
engine = await make_engine(runtime, e)
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
AIC (AI Configurator) direct session wrapper for mocker perf model.
"""Backward-compatible mocker wrapper around the shared internal AIC bridge."""
Provides a Python class that wraps the AIC InferenceSession and exposes
predict_prefill() and predict_decode() methods callable from Rust via PyO3.
"""
from dynamo._internal.aic import AicSession, create_session
import logging
from aiconfigurator.sdk import config
from aiconfigurator.sdk.backends.factory import get_backend
from aiconfigurator.sdk.inference_session import InferenceSession
from aiconfigurator.sdk.models import get_model
from aiconfigurator.sdk.perf_database import get_database, get_supported_databases
logger = logging.getLogger(__name__)
# Backend version used for the AIC database lookup when the caller does not pin
# one; backends missing from this table fall back to the "vllm" entry.
DEFAULT_BACKEND_VERSIONS = {
    "vllm": "0.12.0",
    "sglang": "0.5.6.post2",
}
# Stride of the decode-step loop: one queried generation step stands in for up
# to this many consecutive steps when accumulating decode latency.
DEFAULT_STATIC_STRIDE = 32
class AicSession:
"""Wraps AIC InferenceSession with predict_prefill/predict_decode methods."""
def __init__(
self,
backend_name: str,
system: str,
model_path: str,
tp_size: int,
backend_version: str | None = None,
moe_tp_size: int | None = None,
moe_ep_size: int | None = None,
attention_dp_size: int | None = None,
):
version = backend_version or DEFAULT_BACKEND_VERSIONS.get(
backend_name, DEFAULT_BACKEND_VERSIONS["vllm"]
)
database = get_database(system=system, backend=backend_name, version=version)
if database is None:
supported = get_supported_databases().get(system, {}).get(backend_name, [])
supported_versions = ", ".join(supported) if supported else "<none>"
raise RuntimeError(
"AIC perf database not found for "
f"system={system!r}, backend={backend_name!r}, version={version!r}. "
f"Supported versions for this system/backend: {supported_versions}"
)
# Build ModelConfig. For MoE models, aic_moe_tp_size, aic_moe_ep_size, and
# aic_attention_dp_size must be set to satisfy AIC's constraint:
# tp_size * attention_dp_size == moe_tp_size * moe_ep_size
# AIC SDK validates this internally and raises a clear AssertionError if violated.
effective_dp = attention_dp_size or 1
model_config = config.ModelConfig(
tp_size=tp_size,
moe_tp_size=moe_tp_size,
moe_ep_size=moe_ep_size,
attention_dp_size=effective_dp,
)
model = get_model(
model_path=model_path,
model_config=model_config,
backend_name=backend_name,
)
backend = get_backend(backend_name)
self._session = InferenceSession(
model=model, database=database, backend=backend
)
self._database = database
self._model = model
# AIC models consistently expose model_path, but some do not surface model_name.
self._model_name = getattr(model, "model_name", None) or model_path
self._config = config
logger.info(
"AIC session initialized: backend=%s, system=%s, model=%s, tp=%d",
backend_name,
system,
model_path,
tp_size,
)
def _predict_context_latency(self, batch_size: int, isl: int, prefix: int) -> float:
effective_isl = isl - prefix
if effective_isl <= 0:
raise ValueError(
f"isl must be greater than prefix, got isl={isl}, prefix={prefix}"
)
total_latency = 0.0
for op in self._model.context_ops:
# AIC operations identify kernels via Operation._name; there is no public name accessor.
op_name = getattr(op, "_name", "")
x = batch_size if "logits_gemm" in op_name else batch_size * effective_isl
result = op.query(
self._database,
x=x,
batch_size=batch_size,
beam_width=1,
s=effective_isl,
prefix=prefix,
model_name=self._model_name,
seq_imbalance_correction_scale=1.0,
)
total_latency += float(result)
return total_latency
def _predict_generation_latency(self, batch_size: int, isl: int, osl: int) -> float:
if osl <= 1:
return 0.0
# BaseModel stores speculative decode width on _nextn, which generation_ops scale by.
effective_batch_size = batch_size * (self._model._nextn + 1)
total_latency = 0.0
for step in range(0, osl - 1, DEFAULT_STATIC_STRIDE):
step_latency = 0.0
for op in self._model.generation_ops:
result = op.query(
self._database,
x=effective_batch_size,
batch_size=effective_batch_size,
beam_width=1,
s=isl + step + 1,
model_name=self._model_name,
gen_seq_imbalance_correction_scale=1.0,
)
step_latency += float(result)
repeat_count = min(DEFAULT_STATIC_STRIDE, osl - 1 - step)
total_latency += step_latency * repeat_count
return total_latency
def predict_prefill(
self, batch_size: int, isl: int, prefix: int, osl: int
) -> float:
"""Predict prefill latency in ms. Parameters match AIC RuntimeConfig."""
# AIC requires at least 1 new token (isl > prefix)
actual_prefix = min(prefix, isl - 1) if isl > 0 else 0
return self._predict_context_latency(batch_size, isl, actual_prefix)
def predict_decode(self, batch_size: int, isl: int, osl: int) -> float:
"""Predict decode (generation) latency in ms."""
return self._predict_generation_latency(batch_size, isl, osl)
def create_session(
    backend_name: str,
    system: str,
    model_path: str,
    tp_size: int,
    backend_version: str | None = None,
    moe_tp_size: int | None = None,
    moe_ep_size: int | None = None,
    attention_dp_size: int | None = None,
) -> AicSession:
    """Factory function called from Rust via PyO3.

    Forwards every argument unchanged to the ``AicSession`` constructor and
    returns the new session.
    """
    session = AicSession(
        backend_name=backend_name,
        system=system,
        model_path=model_path,
        tp_size=tp_size,
        backend_version=backend_version,
        moe_tp_size=moe_tp_size,
        moe_ep_size=moe_ep_size,
        attention_dp_size=attention_dp_size,
    )
    return session
__all__ = ["AicSession", "create_session"]
......@@ -18,8 +18,12 @@ from typing import Optional
import uvloop
from dynamo.llm import KvRouter, KvRouterConfig
from dynamo.router.args import DynamoRouterConfig, build_kv_router_config
from dynamo.llm import AicPerfConfig, KvRouter, KvRouterConfig
from dynamo.router.args import (
DynamoRouterConfig,
build_aic_perf_config,
build_kv_router_config,
)
from dynamo.router.args import parse_args as parse_router_args
from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -37,11 +41,13 @@ class StandaloneRouterHandler:
worker_endpoint_path: str,
block_size: int,
kv_router_config: KvRouterConfig,
aic_perf_config: Optional[AicPerfConfig],
):
self.runtime = runtime
self.worker_endpoint_path = worker_endpoint_path
self.block_size = block_size
self.kv_router_config = kv_router_config
self.aic_perf_config = aic_perf_config
self.kv_router: Optional[KvRouter] = None
self.worker_client: Optional[Client] = None
......@@ -67,6 +73,7 @@ class StandaloneRouterHandler:
endpoint=worker_endpoint,
block_size=self.block_size,
kv_router_config=self.kv_router_config,
aic_perf_config=self.aic_perf_config,
)
except Exception as e:
......@@ -178,10 +185,15 @@ async def worker(runtime: DistributedRuntime):
)
kv_router_config = build_kv_router_config(config)
aic_perf_config = build_aic_perf_config(config)
# Create handler
handler = StandaloneRouterHandler(
runtime, config.endpoint, config.router_block_size, kv_router_config
runtime,
config.endpoint,
config.router_block_size,
kv_router_config,
aic_perf_config,
)
await handler.initialize()
......
......@@ -7,15 +7,19 @@ import argparse
from typing import Optional
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.groups.aic_perf_args import (
AicPerfArgGroup,
AicPerfConfigBase,
)
from dynamo.common.configuration.groups.kv_router_args import (
KvRouterArgGroup,
KvRouterConfigBase,
)
from dynamo.common.configuration.utils import add_argument
from dynamo.llm import KvRouterConfig
from dynamo.llm import AicPerfConfig, KvRouterConfig
class DynamoRouterConfig(KvRouterConfigBase):
class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase):
"""Typed configuration for the standalone KV router (router-owned options only)."""
namespace: str
......@@ -36,6 +40,25 @@ class DynamoRouterConfig(KvRouterConfigBase):
"Expected format: namespace.component.endpoint"
)
self.namespace = parts[0]
if self.router_prefill_load_model == "aic":
missing = [
flag
for flag, value in (
("--aic-backend", self.aic_backend),
("--aic-system", self.aic_system),
("--aic-model-path", self.aic_model_path),
)
if not value
]
if missing:
raise ValueError(
"--router-prefill-load-model=aic requires " + ", ".join(missing)
)
if not self.router_track_prefill_tokens:
raise ValueError(
"--router-prefill-load-model=aic requires "
"--router-track-prefill-tokens"
)
class DynamoRouterArgGroup(ArgGroup):
......@@ -68,6 +91,7 @@ class DynamoRouterArgGroup(ArgGroup):
# KV router options (shared with dynamo.frontend)
KvRouterArgGroup().add_arguments(parser)
AicPerfArgGroup().add_arguments(parser)
def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
......@@ -75,6 +99,14 @@ def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
return KvRouterConfig(**router_config.kv_router_kwargs())
def build_aic_perf_config(
    router_config: DynamoRouterConfig,
) -> AicPerfConfig | None:
    """Build an ``AicPerfConfig`` from the router config's ``aic_*`` fields.

    Returns ``None`` unless ``router_prefill_load_model`` is ``"aic"``.
    """
    aic_enabled = router_config.router_prefill_load_model == "aic"
    if aic_enabled:
        perf_kwargs = router_config.aic_perf_kwargs()
        return AicPerfConfig(**perf_kwargs)
    return None
def parse_args(argv: Optional[list[str]] = None) -> DynamoRouterConfig:
"""Parse command-line arguments for the standalone router.
......
......@@ -392,6 +392,13 @@ For this A/B comparison, we use the [**Mooncake FAST'25 Toolagent Trace**](https
These two requests share blocks 46–57 (12 blocks × 512 tokens = ~6,144 tokens of shared prefix) — a tool agent continuing the same session with accumulated context. Each hash ID represents a **512-token block**, and the hash includes both the current block and all preceding blocks, preserving the pattern of prefix reuse while protecting user privacy. The **KV Smart Router** routes requests with matching hash IDs to the same worker, maximizing cache hits.
If you reproduce this benchmark with `python -m dynamo.replay`, keep that dataset fact separate from
the replay engine configuration:
- use `--trace-block-size 512` for the Mooncake/toolagent trace itself
- keep engine `block_size` in `--extra-engine-args` aligned with the runtime you want to mimic
(for the published vLLM deployment, that is typically `64`)
**Key Dataset Properties:**
- **Realistic timing:** Request arrival patterns from production tool-agent workloads
- **High prefix overlap:** 59% cache ratio ([Mooncake FAST'25 paper](https://github.com/kvcache-ai/Mooncake/blob/main/FAST25-release/Mooncake-FAST25.pdf)); iterative tool calls within sessions produce natural prefix reuse
......
......@@ -28,7 +28,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--num-workers 4 \
--replay-mode offline \
--router-mode round_robin \
--extra-engine-args '{"block_size":512}' \
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}' \
--report-json /tmp/replay-report.json
```
......@@ -99,13 +100,17 @@ Example:
{"session_id":"session-b","delay_ms":50,"input_length":1536,"output_length":64,"hash_ids":[9,10,11]}
```
The mocker synthesizes token blocks from `hash_ids` using the configured mocker `block_size`, so the
replay block size must match the block size used when the trace was generated. Public Mooncake
traces are commonly block-level hashes at `512` tokens per hash ID, so replaying them with the
default mocker `block_size=64` will fail once `input_length > len(hash_ids) * 64`. Set that
through `--extra-engine-args '{"block_size":512}'`. For `engine_type=sglang`, replay still uses
canonical `block_size` internally; `sglang.page_size` is accepted as a compatibility alias and is
normalized into `block_size` before replay starts.
Replay uses two different block-size concepts for trace files:
- `--trace-block-size`: how many tokens each `hash_id` in the dataset represents
- engine `block_size`: the block size used by the replay engine and router when they re-chunk the
synthesized tokens into sequence hashes
Public Mooncake/toolagent traces use `512` tokens per `hash_id`, so replaying them should normally
use `--trace-block-size 512`. The engine `block_size` can still be smaller; for example, the live
vLLM benchmark setup uses `block_size=64`. For `engine_type=sglang`, replay still uses canonical
`block_size` internally; `sglang.page_size` is accepted as a compatibility alias and is normalized
into `block_size` before replay starts.
## Replay Surfaces
......@@ -122,6 +127,7 @@ The dedicated replay CLI exposes:
- `--replay-concurrency`
- `--arrival-interval-ms`
- `--arrival-speedup-ratio`
- `--trace-block-size`
- `--turns-per-session`
- `--shared-prefix-ratio`
- `--num-prefix-groups`
......@@ -130,6 +136,11 @@ The dedicated replay CLI exposes:
- `--prefill-engine-args` (JSON string)
- `--decode-engine-args` (JSON string)
- `--router-config` (JSON string)
- `--aic-backend`
- `--aic-system`
- `--aic-backend-version`
- `--aic-tp-size`
- `--aic-model-path`
- `--report-json`
Defaults:
......@@ -145,7 +156,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--router-mode kv_router \
--num-workers 4 \
--arrival-speedup-ratio 10 \
--extra-engine-args '{"block_size":512}' \
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}' \
--router-config '{"router_queue_policy":"fcfs","router_temperature":0.0}' \
--report-json /tmp/replay-report.json
```
......@@ -165,8 +177,15 @@ SGLang replay uses the same CLI surface. A minimal extra-engine-args file can us
Both `--extra-engine-args` and `--router-config` accept partial JSON objects. Engine settings such
as `block_size`, `engine_type`, `dp_size`, `speedup_ratio`, and `decode_speedup_ratio` belong in
`--extra-engine-args`, not as top-level replay CLI flags. Unspecified fields fall back to the same
defaults used by `MockEngineArgs::default()` and `KvRouterConfig::default()`.
`--extra-engine-args`, not as top-level replay CLI flags. `--trace-block-size` is separate and is
used only for trace-file replay. Unspecified fields fall back to the same defaults used by
`MockEngineArgs::default()` and `KvRouterConfig::default()`.
Replay has two independent AIC surfaces:
- engine timing AIC via `--extra-engine-args` / staged engine JSON
- router-side prompt-load AIC via top-level `--aic-*` flags together with
`router_prefill_load_model: "aic"` in `--router-config`
Offline disagg replay uses staged engine args instead of `--extra-engine-args`:
......@@ -179,7 +198,8 @@ For offline disagg replay, the staged JSON must set `worker_type` explicitly:
- `--prefill-engine-args` must use `worker_type: "prefill"`
- `--decode-engine-args` must use `worker_type: "decode"`
The staged configs must also use the same `block_size`.
The staged configs must also use the same engine `block_size`. `--trace-block-size` remains a
separate trace-file input knob.
### Synthetic Replay
......@@ -223,7 +243,8 @@ those timestamps:
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--num-workers 4 \
--extra-engine-args '{"block_size":512}'
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}'
```
This is the right mode when you want deterministic replay of the original arrival pattern.
......@@ -261,7 +282,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--router-mode kv_router \
--num-workers 4 \
--arrival-speedup-ratio 10 \
--extra-engine-args '{"block_size":512}'
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}'
```
### Arrival Speedup
......@@ -274,7 +296,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--num-workers 4 \
--arrival-speedup-ratio 5 \
--extra-engine-args '{"block_size":512}'
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}'
```
### Router Modes
......@@ -292,6 +315,8 @@ provided through `--router-config`, not a dedicated top-level replay flag. In of
- KV visibility is delayed slightly relative to request lifecycle events
- queue admission is driven by router lifecycle edges (`add_request`, `mark_prefill_completed`, and `free`)
- transient in-pass prefill occupancy is still approximated at the router level rather than modeled exactly
- when `router_prefill_load_model` is `"aic"`, replay predicts one expected prefill duration per
admitted request and decays only the oldest active prefill request on each worker
To compare queue policies manually, keep the same trace and engine args fixed and swap only
`router_queue_policy` inside `--router-config`:
......@@ -301,20 +326,41 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--num-workers 4 \
--extra-engine-args '{"block_size":512}' \
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}' \
--router-config '{"router_queue_policy":"fcfs"}'
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--num-workers 4 \
--extra-engine-args '{"block_size":512}' \
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}' \
--router-config '{"router_queue_policy":"lcfs"}'
```
`lcfs` is intentionally a worse comparison policy under saturation; use it for experiments, not as
an expected production default.
To enable router-side AIC prefill-load modeling during replay:
```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--num-workers 4 \
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}' \
--router-config '{"router_track_prefill_tokens":true,"router_prefill_load_model":"aic"}' \
--aic-backend vllm \
--aic-system h200_sxm \
--aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8 \
--aic-tp-size 1
```
For offline disagg replay, the same top-level `--aic-*` flags are supported, but the estimator is
applied only to the prefill-stage router.
## Output
The report contains:
......@@ -366,14 +412,18 @@ If you violate those constraints, replay fails immediately with a validation err
- mocker compute-speed knobs such as `speedup_ratio` still affect simulated timing when passed via
the engine-args JSON for the chosen replay mode
- `--arrival-speedup-ratio` affects trace timestamps, not worker compute speed
- `--trace-block-size` affects only how trace `hash_ids` expand into tokens
- `--arrival-interval-ms` only applies to synthetic replay
- `--turns-per-session`, `--shared-prefix-ratio`, `--num-prefix-groups`, and
`--inter-turn-delay-ms` only apply to synthetic replay
- `--extra-engine-args`, `--prefill-engine-args`, `--decode-engine-args`, and `--router-config`
are JSON strings on the standalone replay CLI
- top-level `--aic-*` flags are used only for router-side prompt-load modeling; engine timing AIC
still belongs in the engine-args JSON
- offline replay does not need planner runtime setup, router registration, or external event transport
- the replay block size should match the trace block size, because token synthesis expands `hash_ids`
using the configured block size
- trace-file replay can use different values for `--trace-block-size` and engine `block_size`
- Mooncake/toolagent traces typically use `--trace-block-size 512`, while engine `block_size`
often stays `64`
## When To Use This vs AIPerf
......
......@@ -42,11 +42,27 @@ The Rust HTTP server also reads these environment variables (not exposed as CLI
| `--router-track-active-blocks` / `--no-router-track-active-blocks` | `DYN_ROUTER_TRACK_ACTIVE_BLOCKS` | `true` | Track blocks used by in-progress requests for load balancing |
| `--router-assume-kv-reuse` / `--no-router-assume-kv-reuse` | `DYN_ROUTER_ASSUME_KV_REUSE` | `true` | Assume KV cache reuse when tracking active blocks |
| `--router-track-output-blocks` / `--no-router-track-output-blocks` | `DYN_ROUTER_TRACK_OUTPUT_BLOCKS` | `false` | Track output blocks with fractional decay during generation |
| `--router-track-prefill-tokens` / `--no-router-track-prefill-tokens` | `DYN_ROUTER_TRACK_PREFILL_TOKENS` | `true` | Track prompt-side prefill load in worker load accounting |
| `--router-prefill-load-model` | `DYN_ROUTER_PREFILL_LOAD_MODEL` | `none` | Prompt-side load model: `none` for static load, `aic` for oldest-prefill decay using an AIC prediction |
| `--router-event-threads` | `DYN_ROUTER_EVENT_THREADS` | `4` | Event processing threads. >1 enables concurrent radix tree |
| `--router-queue-threshold` | `DYN_ROUTER_QUEUE_THRESHOLD` | `4.0` | Queue threshold fraction of prefill capacity. Enables priority scheduling |
| `--router-queue-policy` | `DYN_ROUTER_QUEUE_POLICY` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
| `--decode-fallback` / `--no-decode-fallback` | `DYN_DECODE_FALLBACK` | `false` | Fall back to aggregated mode when prefill workers unavailable |
## AIC Prefill Load Model
These options are used only when `--router-mode kv` and `--router-prefill-load-model aic` are enabled.
| CLI Argument | Env Var | Default | Description |
|-------------|---------|---------|-------------|
| `--aic-backend` | `DYN_AIC_BACKEND` | — | Backend family to model in AIC, for example `vllm` or `sglang` |
| `--aic-system` | `DYN_AIC_SYSTEM` | — | AIC hardware/system identifier, for example `h200_sxm` |
| `--aic-model-path` | `DYN_AIC_MODEL_PATH` | — | Model path or model identifier used for AIC perf lookup |
| `--aic-backend-version` | `DYN_AIC_BACKEND_VERSION` | backend-specific | Pinned AIC database version. If omitted, Dynamo uses the backend default |
| `--aic-tp-size` | `DYN_AIC_TP_SIZE` | `1` | Tensor-parallel size to model in AIC |
When enabled, the frontend's embedded KV router predicts one expected prefill duration per admitted request, using the selected worker's overlap-derived cached prefix. The router then decays only the oldest active prefill request on each worker for prompt-side load accounting.
## Fault Tolerance
| CLI Argument | Env Var | Default | Description |
......
......@@ -89,6 +89,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--router-track-prefill-tokens` / `--no-router-track-prefill-tokens` | `--router-track-prefill-tokens` | Include prompt-side load in active worker load accounting |
| `--router-prefill-load-model <none\|aic>` | `none` | Prompt-side load model. `aic` decays only the oldest active prefill using an AIC-predicted duration |
| `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` |
| `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
......@@ -96,6 +98,52 @@ For all available options: `python -m dynamo.frontend --help`
For detailed configuration options and tuning parameters, see [Advanced Router Usage](#advanced-router-usage).
#### AIC Prefill Load Model
The KV router can use AIC to estimate the expected duration of the selected worker's prompt-side prefill work. When enabled, the router:
- computes `prefix = overlap_blocks * block_size` for the chosen worker
- computes `effective_isl = input_tokens - prefix`
- stores one prompt-load hint for the admitted request
- decays only the **oldest** active prefill request on each worker over time
This affects router-side prompt load accounting only. It does not change backend execution or decode-side accounting.
Enable it on the frontend like this:
```bash
python -m dynamo.frontend \
--router-mode kv \
--router-prefill-load-model aic \
--aic-backend vllm \
--aic-system h200_sxm \
--aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8
```
The standalone router uses the same AIC flags:
```bash
python -m dynamo.router \
--endpoint dynamo.prefill.generate \
--router-prefill-load-model aic \
--aic-backend vllm \
--aic-system h200_sxm \
--aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8
```
Required when `--router-prefill-load-model=aic` is enabled:
- `--router-mode kv` on the frontend
- `--router-track-prefill-tokens`
- `--aic-backend`
- `--aic-system`
- `--aic-model-path`
Optional AIC knobs:
- `--aic-backend-version`: pinned AIC database version; if omitted, Dynamo uses a backend-specific default
- `--aic-tp-size`: tensor-parallel size for the modeled backend; defaults to `1`
### Kubernetes Deployment
To enable the KV Router in Kubernetes, add the `DYN_ROUTER_MODE` environment variable to your frontend service:
......@@ -235,6 +283,10 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
- `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.
- `--router-track-prefill-tokens`: Enables prompt-side load accounting in the worker cost model. This should stay enabled if you want queue thresholds, `active_prefill_tokens`, and AIC prefill load decay to reflect prompt work.
- `--router-prefill-load-model`: Selects the router's prompt-side load model. `none` keeps the existing static prompt load accounting. `aic` predicts one expected prefill duration per admitted request and lazily decays only the oldest active prefill request on each worker.
- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 4.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `priority` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately).
- `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Three policies are available:
......@@ -292,6 +344,8 @@ Use `--router-track-output-blocks` **(experimental)** when your workload is outp
The `--router-queue-threshold` (default: 4.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.
Use `--router-prefill-load-model aic` **(experimental)** when you want prompt-side load tracking to decay the oldest active prefill request using an AIC-predicted duration instead of keeping prompt load static until first token. This requires `--router-track-prefill-tokens` and the shared `--aic-*` config (`--aic-backend`, `--aic-system`, and `--aic-model-path`; `--aic-tp-size` defaults to `1`, and `--aic-backend-version` is optional). This path is still experimental because the decay model is based on expected prefill duration rather than observed worker-side progress.
Use `--router-queue-policy wspt` when your workload has a mix of short and long requests and you want to minimize **average** TTFT. WSPT (Smith's rule) schedules short or high-priority requests first, reducing mean latency across the batch. Use the default `fcfs` when you want to minimize **tail** TTFT — no request waits longer than necessary, since ordering is purely by (adjusted) arrival time.
### Prometheus Metrics
......
......@@ -139,7 +139,8 @@ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--arrival-speedup-ratio 5 \
--extra-engine-args '{"block_size":512}' \
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}' \
--router-config '{"router_queue_policy":"fcfs"}' \
--report-json /tmp/replay-report.json
```
......@@ -184,6 +185,11 @@ first turn uses `timestamp`/`created_time`; later turns can use `delay` or `dela
{"session_id":"session-a","delay":250,"input_length":2560,"output_length":128,"hash_ids":[1,2,3,4,5]}
```
For trace-file replay, `--trace-block-size` controls how many tokens each `hash_id` represents in
the dataset, while engine `block_size` still controls the replay engine and router hashing. Public
Mooncake/toolagent traces use `--trace-block-size 512`; engine `block_size` can still stay at `64`
to match the live runtime configuration.
The standalone replay CLI prints an AIPerf-style summary table to stdout and writes the full replay
report JSON to disk.
......@@ -264,12 +270,19 @@ The AIC model automatically uses `--model-path` and `--engine-type` to select th
Important notes:
- AIC is opt-in. If you do not pass `--aic-perf-model`, `python -m dynamo.mocker` does not use AIC.
- `python -m dynamo.replay` also does not use AIC unless you explicitly put AIC fields in the engine-args JSON.
- `python -m dynamo.replay` has two separate AIC surfaces:
  - engine timing AIC through `--extra-engine-args` / staged engine JSON
  - router-side prefill-load AIC through top-level `--aic-*` flags plus `router_prefill_load_model="aic"` in `--router-config`
- The Python AIC session bridge is now shared with the live KV router path via the internal `dynamo._internal.aic` module. Mocker CLI behavior is unchanged; this just removes duplicate AIC session code.
- `aiconfigurator` must be able to load the requested performance database for the selected `system/backend/version`. If the SDK is installed but the backing systems data is missing or unreadable, mocker now fails fast at startup with a clear error instead of failing later on first request.
- In development environments, this may require pointing Python at a source checkout of `aiconfigurator` with real Git LFS payloads materialized in its `systems/` directory.
When using `python -m dynamo.replay`, there are no dedicated AIC flags. For aggregated replay,
pass the equivalent fields via `--extra-engine-args`:
This mocker AIC path is separate from the router-side prefill-load estimator. Live router,
frontend, and replay all use `router_prefill_load_model="aic"` plus top-level `--aic-*` flags for
oldest-prefill prompt-load decay. Replay still uses engine-args AIC separately when you want the
mocked worker timing model itself to come from AIC.
For aggregated replay, engine timing AIC still comes from `--extra-engine-args`:
```bash
python -m dynamo.replay /path/to/trace.jsonl \
......@@ -290,6 +303,25 @@ python -m dynamo.replay /path/to/trace.jsonl \
The `aic_backend` field enables the AIC perf model and should match `engine_type` (`"vllm"` or `"sglang"`). The `aic_model_path` field is the equivalent of `--model-path` in `dynamo.mocker`.
Replay router-side AIC prompt-load modeling is configured separately with top-level flags:
```bash
python -m dynamo.replay /path/to/trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--num-workers 4 \
--trace-block-size 512 \
--extra-engine-args '{"block_size":64}' \
--router-config '{"router_track_prefill_tokens":true,"router_prefill_load_model":"aic"}' \
--aic-backend vllm \
--aic-system h200_sxm \
--aic-model-path nvidia/Llama-3.1-8B-Instruct-FP8 \
--aic-tp-size 1
```
For offline disagg replay, the same top-level `--aic-*` flags drive the prefill-stage router only;
the decode-stage router keeps prompt tracking disabled.
Example `--reasoning` configuration:
```bash
......
......@@ -366,6 +366,7 @@ async fn apply_entry(
worker: WorkerWithDpRank,
entry: SequenceTraceEntry,
) {
let decay_now = tokio::time::Instant::now();
match entry {
SequenceTraceEntry::Add {
request_id,
......@@ -377,23 +378,28 @@ async fn apply_entry(
Some(&block_hashes),
isl,
OverlapScores::default(),
decay_now,
);
let _ = multi.add_request(SequenceRequest {
let _ = multi.add_request(
SequenceRequest {
request_id,
token_sequence: Some(block_hashes),
isl,
overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: Some(output_length as u32),
prefill_load_hint: None,
worker,
lora_name: None,
});
},
decay_now,
);
}
SequenceTraceEntry::PrefillComplete { request_id } => {
let _ = multi.mark_prefill_completed(&request_id);
let _ = multi.mark_prefill_completed(&request_id, decay_now);
}
SequenceTraceEntry::Free { request_id } => {
let _ = multi.free(&request_id);
let _ = multi.free(&request_id, decay_now);
}
}
}
......
......@@ -49,8 +49,12 @@ struct Args {
#[arg(long, default_value_t = 4.0)]
arrival_speedup_ratio: f64,
/// Mocker block size; defaults to 512 for Mooncake traces
/// Trace hash block size used to expand hash_ids into tokens
#[arg(long, default_value_t = 512)]
trace_block_size: usize,
/// Engine/router block size used for replay hashing and mock execution
#[arg(long, default_value_t = 64)]
block_size: usize,
/// Override max running requests per worker
......@@ -115,7 +119,9 @@ fn main() -> Result<()> {
last_report = Some(simulate_trace_file_with_router_mode(
engine_args.clone(),
None,
None,
&args.trace_file,
args.trace_block_size,
args.num_workers,
args.arrival_speedup_ratio,
args.router_mode.into(),
......
......@@ -30,7 +30,6 @@ crate-type = ["cdylib", "staticlib"]
cbindgen = "0.27"
[dependencies]
dynamo-llm = { path = "../../llm" }
dynamo-kv-router = { path = "../../kv-router" }
dynamo-runtime = { path = "../../runtime" }
......@@ -43,3 +42,9 @@ async-once-cell = { version = "0.5.4" }
libc = { version = "0.2" }
once_cell = { version = "1" }
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
[target.'cfg(target_os = "linux")'.dependencies]
dynamo-llm = { path = "../../llm" }
[target.'cfg(not(target_os = "linux"))'.dependencies]
dynamo-llm = { path = "../../llm", default-features = false }
......@@ -716,6 +716,7 @@ pub unsafe extern "C" fn create_routers(
&endpoint,
block_size,
Some(kv_router_config.clone()),
None,
WORKER_TYPE_DECODE,
Some(model_name.clone()),
enable_eagle,
......@@ -781,6 +782,7 @@ pub unsafe extern "C" fn create_routers(
RouterMode::KV,
block_size,
Some(prefill_config),
None,
enforce_disagg,
model_name.clone(),
actual_namespace.clone(),
......
......@@ -131,9 +131,9 @@ dependencies = [
[[package]]
name = "arc-swap"
version = "1.9.0"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6"
checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
dependencies = [
"rustversion",
]
......@@ -920,6 +920,18 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "console"
version = "0.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
dependencies = [
"encode_unicode",
"libc",
"unicode-width",
"windows-sys 0.61.2",
]
[[package]]
name = "const-oid"
version = "0.9.6"
......@@ -1410,7 +1422,7 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de"
dependencies = [
"console",
"console 0.15.11",
"shell-words",
"tempfile",
"thiserror 1.0.69",
......@@ -1655,6 +1667,7 @@ dependencies = [
"derive_builder",
"dynamo-kv-router",
"dynamo-tokens",
"indicatif 0.18.4",
"ndarray",
"ndarray-interp",
"ndarray-npy",
......@@ -2037,11 +2050,11 @@ dependencies = [
[[package]]
name = "fastrand"
version = "2.3.0"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
dependencies = [
"getrandom 0.2.17",
"getrandom 0.3.4",
]
[[package]]
......@@ -2489,7 +2502,7 @@ dependencies = [
"dirs",
"futures",
"http",
"indicatif",
"indicatif 0.17.11",
"libc",
"log",
"num_cpus",
......@@ -2865,13 +2878,26 @@ version = "0.17.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
dependencies = [
"console",
"console 0.15.11",
"number_prefix",
"portable-atomic",
"unicode-width",
"web-time",
]
[[package]]
name = "indicatif"
version = "0.18.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
dependencies = [
"console 0.16.3",
"portable-atomic",
"unicode-width",
"unit-prefix",
"web-time",
]
[[package]]
name = "indoc"
version = "2.0.7"
......@@ -7239,6 +7265,12 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
[[package]]
name = "unit-prefix"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
......
......@@ -938,6 +938,18 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "console"
version = "0.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87"
dependencies = [
"encode_unicode",
"libc",
"unicode-width",
"windows-sys 0.61.2",
]
[[package]]
name = "const-oid"
version = "0.9.6"
......@@ -1428,7 +1440,7 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "658bce805d770f407bc62102fca7c2c64ceef2fbcb2b8bd19d2765ce093980de"
dependencies = [
"console",
"console 0.15.11",
"shell-words",
"tempfile",
"thiserror 1.0.69",
......@@ -1670,6 +1682,7 @@ dependencies = [
"derive_builder",
"dynamo-kv-router",
"dynamo-tokens",
"indicatif 0.18.4",
"ndarray",
"ndarray-interp",
"ndarray-npy",
......@@ -2561,7 +2574,7 @@ dependencies = [
"dirs",
"futures",
"http",
"indicatif",
"indicatif 0.17.11",
"libc",
"log",
"num_cpus",
......@@ -2937,13 +2950,26 @@ version = "0.17.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
dependencies = [
"console",
"console 0.15.11",
"number_prefix",
"portable-atomic",
"unicode-width",
"web-time",
]
[[package]]
name = "indicatif"
version = "0.18.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
dependencies = [
"console 0.16.3",
"portable-atomic",
"unicode-width",
"unit-prefix",
"web-time",
]
[[package]]
name = "indoc"
version = "2.0.7"
......@@ -7309,6 +7335,12 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
[[package]]
name = "unit-prefix"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
[[package]]
name = "unsafe-libyaml"
version = "0.2.11"
......
......@@ -169,6 +169,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<llm::entrypoint::EntrypointArgs>()?;
m.add_class::<llm::entrypoint::EngineConfig>()?;
m.add_class::<llm::entrypoint::EngineType>()?;
m.add_class::<llm::entrypoint::AicPerfConfig>()?;
m.add_class::<llm::entrypoint::RouterConfig>()?;
m.add_class::<llm::entrypoint::KvRouterConfig>()?;
m.add_class::<llm::replay::ReasoningConfig>()?;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment