Unverified Commit 02b1c58a authored by Yan Ru Pei, committed by GitHub
Browse files

feat(mocker): add offline disagg replay (#7617)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 4b8826b3
...@@ -26,6 +26,7 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = ( ...@@ -26,6 +26,7 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
"router_track_active_blocks", "router_track_active_blocks",
"router_track_output_blocks", "router_track_output_blocks",
"router_assume_kv_reuse", "router_assume_kv_reuse",
"router_track_prefill_tokens",
"router_snapshot_threshold", "router_snapshot_threshold",
"router_reset_states", "router_reset_states",
"router_ttl_secs", "router_ttl_secs",
...@@ -51,6 +52,7 @@ class KvRouterConfigBase(ConfigBase): ...@@ -51,6 +52,7 @@ class KvRouterConfigBase(ConfigBase):
router_track_active_blocks: bool router_track_active_blocks: bool
router_track_output_blocks: bool router_track_output_blocks: bool
router_assume_kv_reuse: bool router_assume_kv_reuse: bool
router_track_prefill_tokens: bool
router_snapshot_threshold: int router_snapshot_threshold: int
router_reset_states: bool router_reset_states: bool
router_ttl_secs: float router_ttl_secs: float
...@@ -173,6 +175,18 @@ class KvRouterArgGroup(ArgGroup): ...@@ -173,6 +175,18 @@ class KvRouterArgGroup(ArgGroup):
), ),
obsolete_flag="--assume-kv-reuse", obsolete_flag="--assume-kv-reuse",
) )
add_negatable_bool_argument(
g,
flag_name="--router-track-prefill-tokens",
env_var="DYN_ROUTER_TRACK_PREFILL_TOKENS",
default=True,
dest="router_track_prefill_tokens",
help=(
"KV Router: Include prompt-side prefill tokens in active load accounting. "
"Use --no-router-track-prefill-tokens to ignore prompt tokens in router "
"prefill-token load, queue pressure, and active_prefill_tokens metrics."
),
)
add_argument( add_argument(
g, g,
flag_name="--router-snapshot-threshold", flag_name="--router-snapshot-threshold",
......
...@@ -14,7 +14,7 @@ from aiconfigurator.sdk import config ...@@ -14,7 +14,7 @@ from aiconfigurator.sdk import config
from aiconfigurator.sdk.backends.factory import get_backend from aiconfigurator.sdk.backends.factory import get_backend
from aiconfigurator.sdk.inference_session import InferenceSession from aiconfigurator.sdk.inference_session import InferenceSession
from aiconfigurator.sdk.models import get_model from aiconfigurator.sdk.models import get_model
from aiconfigurator.sdk.perf_database import get_database from aiconfigurator.sdk.perf_database import get_database, get_supported_databases
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -40,6 +40,14 @@ class AicSession: ...@@ -40,6 +40,14 @@ class AicSession:
) )
database = get_database(system=system, backend=backend_name, version=version) database = get_database(system=system, backend=backend_name, version=version)
if database is None:
supported = get_supported_databases().get(system, {}).get(backend_name, [])
supported_versions = ", ".join(supported) if supported else "<none>"
raise RuntimeError(
"AIC perf database not found for "
f"system={system!r}, backend={backend_name!r}, version={version!r}. "
f"Supported versions for this system/backend: {supported_versions}"
)
model_config = config.ModelConfig(tp_size=tp_size) model_config = config.ModelConfig(tp_size=tp_size)
model = get_model( model = get_model(
model_path=model_path, model_path=model_path,
......
...@@ -42,6 +42,13 @@ def _build_sglang_args(args: argparse.Namespace) -> SglangArgs | None: ...@@ -42,6 +42,13 @@ def _build_sglang_args(args: argparse.Namespace) -> SglangArgs | None:
def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs: def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
worker_type = (
"prefill"
if getattr(args, "is_prefill_worker", False)
else "decode"
if getattr(args, "is_decode_worker", False)
else "aggregated"
)
aic_backend = None aic_backend = None
aic_system = None aic_system = None
aic_backend_version = None aic_backend_version = None
...@@ -53,7 +60,6 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs: ...@@ -53,7 +60,6 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
aic_backend_version = getattr(args, "aic_backend_version", None) aic_backend_version = getattr(args, "aic_backend_version", None)
aic_tp_size = getattr(args, "aic_tp_size", None) aic_tp_size = getattr(args, "aic_tp_size", None)
aic_model_path = getattr(args, "model_path", None) aic_model_path = getattr(args, "model_path", None)
return MockEngineArgs( return MockEngineArgs(
engine_type=getattr(args, "engine_type", None) or "vllm", engine_type=getattr(args, "engine_type", None) or "vllm",
num_gpu_blocks=getattr(args, "num_gpu_blocks", _DEFAULT_NUM_GPU_BLOCKS), num_gpu_blocks=getattr(args, "num_gpu_blocks", _DEFAULT_NUM_GPU_BLOCKS),
...@@ -64,18 +70,12 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs: ...@@ -64,18 +70,12 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
), ),
enable_prefix_caching=getattr(args, "enable_prefix_caching", True), enable_prefix_caching=getattr(args, "enable_prefix_caching", True),
enable_chunked_prefill=getattr(args, "enable_chunked_prefill", True), enable_chunked_prefill=getattr(args, "enable_chunked_prefill", True),
preemption_mode=getattr(args, "preemption_mode", "lifo"),
speedup_ratio=getattr(args, "speedup_ratio", 1.0), speedup_ratio=getattr(args, "speedup_ratio", 1.0),
decode_speedup_ratio=getattr(args, "decode_speedup_ratio", 1.0), decode_speedup_ratio=getattr(args, "decode_speedup_ratio", 1.0),
dp_size=getattr(args, "dp_size", 1), dp_size=getattr(args, "dp_size", 1),
startup_time=getattr(args, "startup_time", None), startup_time=getattr(args, "startup_time", None),
worker_type=( worker_type=worker_type,
"prefill" planner_profile_data=getattr(args, "planner_profile_data", None),
if getattr(args, "is_prefill_worker", False)
else "decode"
if getattr(args, "is_decode_worker", False)
else "aggregated"
),
aic_backend=aic_backend, aic_backend=aic_backend,
aic_system=aic_system, aic_system=aic_system,
aic_backend_version=aic_backend_version, aic_backend_version=aic_backend_version,
...@@ -85,6 +85,7 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs: ...@@ -85,6 +85,7 @@ def build_mocker_engine_args(args: argparse.Namespace) -> MockEngineArgs:
kv_transfer_bandwidth=getattr(args, "kv_transfer_bandwidth", None), kv_transfer_bandwidth=getattr(args, "kv_transfer_bandwidth", None),
reasoning=_parse_reasoning_config(getattr(args, "reasoning", None)), reasoning=_parse_reasoning_config(getattr(args, "reasoning", None)),
sglang=_build_sglang_args(args), sglang=_build_sglang_args(args),
preemption_mode=getattr(args, "preemption_mode", "lifo"),
) )
......
...@@ -29,7 +29,7 @@ python -m dynamo.router \ ...@@ -29,7 +29,7 @@ python -m dynamo.router \
- `--endpoint`: Full endpoint path for workers in the format `namespace.component.endpoint` (e.g., `dynamo.prefill.generate`) - `--endpoint`: Full endpoint path for workers in the format `namespace.component.endpoint` (e.g., `dynamo.prefill.generate`)
**Router Configuration:** **Router Configuration:**
All router options use the `--router-*` prefix (e.g., `--router-block-size`, `--router-kv-overlap-score-weight`, `--router-temperature`, `--router-kv-events` / `--no-router-kv-events`, `--router-replica-sync`, `--router-snapshot-threshold`, `--router-reset-states`, `--router-track-active-blocks` / `--no-router-track-active-blocks`). Legacy names without the prefix (e.g., `--block-size`, `--kv-events`) are still accepted but deprecated. For detailed descriptions, see the [Router Guide](/docs/components/router/router-guide.md). All router options use the `--router-*` prefix (e.g., `--router-block-size`, `--router-kv-overlap-score-weight`, `--router-temperature`, `--router-kv-events` / `--no-router-kv-events`, `--router-replica-sync`, `--router-snapshot-threshold`, `--router-reset-states`, `--router-track-active-blocks` / `--no-router-track-active-blocks`, `--router-track-prefill-tokens` / `--no-router-track-prefill-tokens`). Legacy names without the prefix (e.g., `--block-size`, `--kv-events`) are still accepted but deprecated. For detailed descriptions, see the [Router Guide](/docs/components/router/router-guide.md).
## Architecture ## Architecture
...@@ -74,6 +74,9 @@ python -m dynamo.vllm --model MODEL_NAME --block-size 64 --disaggregation-mode p ...@@ -74,6 +74,9 @@ python -m dynamo.vllm --model MODEL_NAME --block-size 64 --disaggregation-mode p
> **Why `--no-router-track-active-blocks` for prefill routing?** > **Why `--no-router-track-active-blocks` for prefill routing?**
> Active block tracking is used for load balancing across decode (generation) phases. For prefill-only routing, decode load is not relevant, so disabling this reduces overhead and simplifies the router state. > Active block tracking is used for load balancing across decode (generation) phases. For prefill-only routing, decode load is not relevant, so disabling this reduces overhead and simplifies the router state.
> >
> **When should I use `--no-router-track-prefill-tokens`?**
> Use it on decode-only routers that should ignore already-completed prompt work. This keeps `active_prefill_tokens`, queue pressure, and load estimates focused on decode-side work after a prefill-to-decode handoff.
>
> **Why `--router-block-size` is required for standalone routers:** > **Why `--router-block-size` is required for standalone routers:**
> Unlike the frontend router which can infer block size from the ModelDeploymentCard (MDC) during worker registration, standalone routers cannot access the MDC and must have the block size explicitly specified. This is a work in progress to enable automatic inference. > Unlike the frontend router which can infer block size from the ModelDeploymentCard (MDC) during worker registration, standalone routers cannot access the MDC and must have the block size explicitly specified. This is a work in progress to enable automatic inference.
......
...@@ -171,6 +171,7 @@ async def worker(runtime: DistributedRuntime): ...@@ -171,6 +171,7 @@ async def worker(runtime: DistributedRuntime):
f"router_track_active_blocks={config.router_track_active_blocks}, " f"router_track_active_blocks={config.router_track_active_blocks}, "
f"router_track_output_blocks={config.router_track_output_blocks}, " f"router_track_output_blocks={config.router_track_output_blocks}, "
f"router_assume_kv_reuse={config.router_assume_kv_reuse}, " f"router_assume_kv_reuse={config.router_assume_kv_reuse}, "
f"router_track_prefill_tokens={config.router_track_prefill_tokens}, "
f"router_ttl_secs={config.router_ttl_secs}, " f"router_ttl_secs={config.router_ttl_secs}, "
f"router_max_tree_size={config.router_max_tree_size}, " f"router_max_tree_size={config.router_max_tree_size}, "
f"router_prune_target_ratio={config.router_prune_target_ratio}" f"router_prune_target_ratio={config.router_prune_target_ratio}"
......
...@@ -117,6 +117,8 @@ The dedicated replay CLI exposes: ...@@ -117,6 +117,8 @@ The dedicated replay CLI exposes:
- `--replay-mode offline|online` - `--replay-mode offline|online`
- `--router-mode round_robin|kv_router` - `--router-mode round_robin|kv_router`
- `--num-workers` - `--num-workers`
- `--num-prefill-workers`
- `--num-decode-workers`
- `--replay-concurrency` - `--replay-concurrency`
- `--arrival-interval-ms` - `--arrival-interval-ms`
- `--arrival-speedup-ratio` - `--arrival-speedup-ratio`
...@@ -125,6 +127,8 @@ The dedicated replay CLI exposes: ...@@ -125,6 +127,8 @@ The dedicated replay CLI exposes:
- `--num-prefix-groups` - `--num-prefix-groups`
- `--inter-turn-delay-ms` - `--inter-turn-delay-ms`
- `--extra-engine-args` (JSON string) - `--extra-engine-args` (JSON string)
- `--prefill-engine-args` (JSON string)
- `--decode-engine-args` (JSON string)
- `--router-config` (JSON string) - `--router-config` (JSON string)
- `--report-json` - `--report-json`
...@@ -164,6 +168,19 @@ as `block_size`, `engine_type`, `dp_size`, `speedup_ratio`, and `decode_speedup_ ...@@ -164,6 +168,19 @@ as `block_size`, `engine_type`, `dp_size`, `speedup_ratio`, and `decode_speedup_
`--extra-engine-args`, not as top-level replay CLI flags. Unspecified fields fall back to the same `--extra-engine-args`, not as top-level replay CLI flags. Unspecified fields fall back to the same
defaults used by `MockEngineArgs::default()` and `KvRouterConfig::default()`. defaults used by `MockEngineArgs::default()` and `KvRouterConfig::default()`.
Offline disagg replay uses staged engine args instead of `--extra-engine-args`:
- `--prefill-engine-args` for the prefill worker config
- `--decode-engine-args` for the decode worker config
- `--num-prefill-workers` and `--num-decode-workers` for pool sizes
For offline disagg replay, the staged JSON must set `worker_type` explicitly:
- `--prefill-engine-args` must use `worker_type: "prefill"`
- `--decode-engine-args` must use `worker_type: "decode"`
The staged configs must also use the same `block_size`.
### Synthetic Replay ### Synthetic Replay
Synthetic replay bypasses trace loading and generates in-memory requests with fixed input/output Synthetic replay bypasses trace loading and generates in-memory requests with fixed input/output
...@@ -320,9 +337,12 @@ If `--report-json` is not provided, `python -m dynamo.replay` writes a timestamp ...@@ -320,9 +337,12 @@ If `--report-json` is not provided, `python -m dynamo.replay` writes a timestamp
Shared replay constraints: Shared replay constraints:
- aggregated mode
- `extra_engine_args.engine_type` must be `vllm` or `sglang` - `extra_engine_args.engine_type` must be `vllm` or `sglang`
- `extra_engine_args.dp_size` must be `1` - aggregated replay requires the existing aggregated args path
- disagg replay requires both `prefill_engine_args` and `decode_engine_args`
- disagg replay requires `router_mode=kv_router`
- replay `dp_size` must be `1`
- disagg replay requires matching `block_size` in `prefill_engine_args` and `decode_engine_args`
Additional offline constraints: Additional offline constraints:
...@@ -330,6 +350,7 @@ Additional offline constraints: ...@@ -330,6 +350,7 @@ Additional offline constraints:
- single-worker offline replay is still a dedicated fast path for `vllm`, but it now supports both - single-worker offline replay is still a dedicated fast path for `vllm`, but it now supports both
flat request replay and workload-driven multi-turn replay flat request replay and workload-driven multi-turn replay
- `sglang` still goes through the shared multi-worker replay runtime even when `num_workers=1` - `sglang` still goes through the shared multi-worker replay runtime even when `num_workers=1`
- offline disagg replay is a separate two-stage runtime with prefill and decode worker pools
Additional online constraints: Additional online constraints:
...@@ -343,12 +364,13 @@ If you violate those constraints, replay fails immediately with a validation err ...@@ -343,12 +364,13 @@ If you violate those constraints, replay fails immediately with a validation err
either a trace file, or all of `--input-tokens`, `--output-tokens`, and `--request-count` either a trace file, or all of `--input-tokens`, `--output-tokens`, and `--request-count`
- `--replay-concurrency` works with both trace replay and synthetic replay - `--replay-concurrency` works with both trace replay and synthetic replay
- mocker compute-speed knobs such as `speedup_ratio` still affect simulated timing when passed via - mocker compute-speed knobs such as `speedup_ratio` still affect simulated timing when passed via
`--extra-engine-args` the engine-args JSON for the chosen replay mode
- `--arrival-speedup-ratio` affects trace timestamps, not worker compute speed - `--arrival-speedup-ratio` affects trace timestamps, not worker compute speed
- `--arrival-interval-ms` only applies to synthetic replay - `--arrival-interval-ms` only applies to synthetic replay
- `--turns-per-session`, `--shared-prefix-ratio`, `--num-prefix-groups`, and - `--turns-per-session`, `--shared-prefix-ratio`, `--num-prefix-groups`, and
`--inter-turn-delay-ms` only apply to synthetic replay `--inter-turn-delay-ms` only apply to synthetic replay
- `--extra-engine-args` and `--router-config` are JSON strings on the standalone replay CLI - `--extra-engine-args`, `--prefill-engine-args`, `--decode-engine-args`, and `--router-config`
are JSON strings on the standalone replay CLI
- offline replay does not need planner runtime setup, router registration, or external event transport - offline replay does not need planner runtime setup, router registration, or external event transport
- the replay block size should match the trace block size, because token synthesis expands `hash_ids` - the replay block size should match the trace block size, because token synthesis expands `hash_ids`
using the configured block size using the configured block size
......
...@@ -23,6 +23,7 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa ...@@ -23,6 +23,7 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa
| `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) | | `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) |
| `--router-queue-threshold` | `4.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` | | `--router-queue-threshold` | `4.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` |
| `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) | | `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
| `--no-router-track-prefill-tokens` | enabled | Ignore prompt-side prefill tokens in router load accounting; useful for decode-only routing paths |
### Standalone Router ### Standalone Router
......
...@@ -256,6 +256,8 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na ...@@ -256,6 +256,8 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
- `--no-router-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist. - `--no-router-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist.
- `--no-router-track-prefill-tokens`: Disables prompt-side prefill token accounting in the router's active load model. By default (`router_track_prefill_tokens=true`), the router counts uncached prompt tokens toward `active_prefill_tokens`, queue pressure, and potential prefill-token load. Disable this for decode-only routing paths where prompt processing has already happened elsewhere and the decode router should ignore transferred prompt load. In normal live disaggregated serving, the decode-stage override applies this behavior automatically.
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly. - `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly.
### KV Indexer / Approx KV Indexer ### KV Indexer / Approx KV Indexer
...@@ -280,6 +282,8 @@ Use `--no-router-kv-events` when you are not confident that your backend engine ...@@ -280,6 +282,8 @@ Use `--no-router-kv-events` when you are not confident that your backend engine
Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worker does not reuse transferred KV cache blocks. By default the router assumes KV blocks transferred from prefill to decode will be deduplicated on the decode side, but vLLM and SGLang decode workers currently do not support this — only TensorRT-LLM does. Without this flag, the router undercounts decode blocks when duplicates exist, leading to inaccurate load estimates. Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worker does not reuse transferred KV cache blocks. By default the router assumes KV blocks transferred from prefill to decode will be deduplicated on the decode side, but vLLM and SGLang decode workers currently do not support this — only TensorRT-LLM does. Without this flag, the router undercounts decode blocks when duplicates exist, leading to inaccurate load estimates.
Use `--no-router-track-prefill-tokens` when a router is serving decode-only traffic and prompt processing has already completed elsewhere. This keeps decode routing decisions focused on decode-side load instead of briefly charging prompt tokens to the decode worker after handoff. The built-in live disaggregated decode path applies the equivalent per-request override automatically.
Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default). Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. 
This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default).
The `--router-queue-threshold` (default: 4.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely. The `--router-queue-threshold` (default: 4.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.
...@@ -310,6 +314,11 @@ The prefill router is automatically created when: ...@@ -310,6 +314,11 @@ The prefill router is automatically created when:
- **Seamlessly integrated** into the request pipeline between preprocessing and decode routing - **Seamlessly integrated** into the request pipeline between preprocessing and decode routing
- **Falls back gracefully** to decode-only mode if prefill fails or no prefill workers are available - **Falls back gracefully** to decode-only mode if prefill fails or no prefill workers are available
**Key characteristics of the decode routing stage in disaggregated mode:**
- **Disables overlap scoring** (`overlap_score_weight=0`) because decode routing should not chase prefix reuse
- **Disables KV reuse assumption** (`assume_kv_reuse=false`) unless the backend can truly deduplicate transferred blocks
- **Disables prefill-token tracking** (`track_prefill_tokens=false`) so decode-side load reflects decode work rather than already-completed prompt work
### Setup Example ### Setup Example
When both workers are registered, requests are automatically routed. When both workers are registered, requests are automatically routed.
......
...@@ -96,7 +96,7 @@ python -m dynamo.mocker \ ...@@ -96,7 +96,7 @@ python -m dynamo.mocker \
| `--sglang-chunked-prefill-size` | 8192 | SGLang chunked-prefill chunk size | | `--sglang-chunked-prefill-size` | 8192 | SGLang chunked-prefill chunk size |
| `--sglang-clip-max-new-tokens` | 4096 | SGLang admission-budget cap for max new tokens | | `--sglang-clip-max-new-tokens` | 4096 | SGLang admission-budget cap for max new tokens |
| `--sglang-schedule-conservativeness` | 1.0 | SGLang schedule conservativeness factor | | `--sglang-schedule-conservativeness` | 1.0 | SGLang schedule conservativeness factor |
| `--aic-perf-model` | False | Use AIC SDK for latency prediction instead of interpolated/polynomial models. Requires `aiconfigurator` SDK installed (install with `pip install ai-dynamo[mocker]`) | | `--aic-perf-model` | False | Use AIC SDK for latency prediction instead of interpolated/polynomial models. Opt-in only: default mocker and replay paths do not use AIC. Requires `aiconfigurator` installed and usable AIC systems/perf data for the requested `system/backend/version` tuple |
| `--aic-system` | `h200_sxm` | AIC system name (e.g., `h200_sxm`). Used with `--aic-perf-model` | | `--aic-system` | `h200_sxm` | AIC system name (e.g., `h200_sxm`). Used with `--aic-perf-model` |
| `--aic-backend-version` | Auto | AIC backend engine version (e.g., `0.12.0` for vLLM). If not set, uses the default version for the backend | | `--aic-backend-version` | Auto | AIC backend engine version (e.g., `0.12.0` for vLLM). If not set, uses the default version for the backend |
| `--aic-tp-size` | 1 | Tensor parallel size for AIC latency prediction. Only affects AIC performance model lookups, not mocker scheduling | | `--aic-tp-size` | 1 | Tensor parallel size for AIC latency prediction. Only affects AIC performance model lookups, not mocker scheduling |
...@@ -126,10 +126,12 @@ python -m dynamo.mocker \ ...@@ -126,10 +126,12 @@ python -m dynamo.mocker \
The mocker supports replaying Mooncake-style traces through the dedicated replay CLI, which exposes The mocker supports replaying Mooncake-style traces through the dedicated replay CLI, which exposes
`offline|online`, `round_robin|kv_router`, `arrival_speedup_ratio`, closed-loop concurrency `offline|online`, `round_robin|kv_router`, `arrival_speedup_ratio`, closed-loop concurrency
admission, and synthetic workload generation directly: admission, synthetic workload generation, and offline disaggregated prefill/decode replay directly:
The replay CLI defaults to `--replay-mode offline` and `--router-mode round_robin`. Engine settings The replay CLI defaults to `--replay-mode offline` and `--router-mode round_robin`. Aggregated
such as `block_size`, `engine_type`, and compute speedups still belong in `--extra-engine-args`. replay uses `--extra-engine-args`. Offline disagg replay instead uses
`--prefill-engine-args` plus `--decode-engine-args`, together with
`--num-prefill-workers` and `--num-decode-workers`.
```bash ```bash
python -m dynamo.replay /path/to/mooncake_trace.jsonl \ python -m dynamo.replay /path/to/mooncake_trace.jsonl \
...@@ -197,6 +199,31 @@ Replay supports aggregated `vllm` and `sglang` engine configs. Internally replay ...@@ -197,6 +199,31 @@ Replay supports aggregated `vllm` and `sglang` engine configs. Internally replay
`block_size`; for `sglang`, `sglang.page_size` is still accepted as a compatibility alias as long `block_size`; for `sglang`, `sglang.page_size` is still accepted as a compatibility alias as long
as it matches `block_size` when both are provided. as it matches `block_size` when both are provided.
Offline replay also supports disaggregated `kv_router` mode. In that mode:
- `--prefill-engine-args` must describe a prefill worker
- `--decode-engine-args` must describe a decode worker
- `--router-mode` must be `kv_router`
- only offline replay is supported
Example:
```bash
python -m dynamo.replay \
--input-tokens 4096 \
--output-tokens 256 \
--request-count 100 \
--replay-mode offline \
--router-mode kv_router \
--replay-concurrency 32 \
--num-prefill-workers 2 \
--num-decode-workers 6 \
--prefill-engine-args '{"worker_type":"prefill","block_size":512}' \
--decode-engine-args '{"worker_type":"decode","block_size":512}' \
--router-config '{"router_queue_policy":"wspt"}' \
--report-json /tmp/replay-report.json
```
## Performance Modeling Setup ## Performance Modeling Setup
By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, pass `--planner-profile-data` with either: By default, the mocker uses hardcoded polynomial formulas to estimate prefill and decode timing. For more realistic simulations, pass `--planner-profile-data` with either:
...@@ -223,7 +250,7 @@ python -m dynamo.mocker \ ...@@ -223,7 +250,7 @@ python -m dynamo.mocker \
To use the AIC SDK for latency prediction: To use the AIC SDK for latency prediction:
```bash ```bash
pip install ai-dynamo[mocker] uv pip install '.[mocker]'
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path nvidia/Llama-3.1-8B-Instruct-FP8 \ --model-path nvidia/Llama-3.1-8B-Instruct-FP8 \
...@@ -234,13 +261,33 @@ python -m dynamo.mocker \ ...@@ -234,13 +261,33 @@ python -m dynamo.mocker \
The AIC model automatically uses `--model-path` and `--engine-type` to select the appropriate performance data. Available systems include `h200_sxm`, `h100_sxm`, etc. (see AIC SDK documentation for the full list). The AIC model automatically uses `--model-path` and `--engine-type` to select the appropriate performance data. Available systems include `h200_sxm`, `h100_sxm`, etc. (see AIC SDK documentation for the full list).
When using `python -m dynamo.replay`, there are no dedicated AIC flags. Pass the equivalent fields directly via `--extra-engine-args`: Important notes:
- AIC is opt-in. If you do not pass `--aic-perf-model`, `python -m dynamo.mocker` does not use AIC.
- `python -m dynamo.replay` also does not use AIC unless you explicitly put AIC fields in the engine-args JSON.
- `aiconfigurator` must be able to load the requested performance database for the selected `system/backend/version`. If the SDK is installed but the backing systems data is missing or unreadable, mocker now fails fast at startup with a clear error instead of failing later on first request.
- In development environments, this may require pointing Python at a source checkout of `aiconfigurator` with real Git LFS payloads materialized in its `systems/` directory.
When using `python -m dynamo.replay`, there are no dedicated AIC flags. For aggregated replay,
pass the equivalent fields via `--extra-engine-args`:
```bash ```bash
python -m dynamo.replay /path/to/trace.jsonl \ python -m dynamo.replay /path/to/trace.jsonl \
--extra-engine-args '{"aic_backend":"vllm","aic_system":"h200_sxm","aic_model_path":"nvidia/Llama-3.1-8B-Instruct-FP8","aic_tp_size":1}' --extra-engine-args '{"aic_backend":"vllm","aic_system":"h200_sxm","aic_model_path":"nvidia/Llama-3.1-8B-Instruct-FP8","aic_tp_size":1}'
``` ```
For offline disagg replay, pass the staged engine configs instead:
```bash
python -m dynamo.replay /path/to/trace.jsonl \
--replay-mode offline \
--router-mode kv_router \
--prefill-engine-args '{"worker_type":"prefill","aic_backend":"vllm","aic_system":"h200_sxm","aic_model_path":"nvidia/Llama-3.1-8B-Instruct-FP8","aic_tp_size":1,"block_size":512}' \
--decode-engine-args '{"worker_type":"decode","aic_backend":"vllm","aic_system":"h200_sxm","aic_model_path":"nvidia/Llama-3.1-8B-Instruct-FP8","aic_tp_size":1,"block_size":512}' \
--num-prefill-workers 2 \
--num-decode-workers 6
```
The `aic_backend` field enables the AIC perf model and should match `engine_type` (`"vllm"` or `"sglang"`). The `aic_model_path` field is the equivalent of `--model-path` in `dynamo.mocker`. The `aic_backend` field enables the AIC perf model and should match `engine_type` (`"vllm"` or `"sglang"`). The `aic_model_path` field is the equivalent of `--model-path` in `dynamo.mocker`.
Example `--reasoning` configuration: Example `--reasoning` configuration:
...@@ -355,7 +402,7 @@ The mocker supports three timing prediction modes: ...@@ -355,7 +402,7 @@ The mocker supports three timing prediction modes:
**Interpolated Model:** Loads actual profiling data from an NPZ file containing measured prefill and decode latencies. The mocker interpolates between data points to predict timing for any input size. This enables high-fidelity simulation matching a specific hardware configuration. **Interpolated Model:** Loads actual profiling data from an NPZ file containing measured prefill and decode latencies. The mocker interpolates between data points to predict timing for any input size. This enables high-fidelity simulation matching a specific hardware configuration.
**AIC Model (`--aic-perf-model`):** Uses the NVIDIA AI Configurator (AIC) SDK for latency prediction. AIC provides calibrated performance models for specific GPU/model/engine combinations, predicting prefill and decode latency as a function of batch size, sequence length, and prefix cache hits. The model path is automatically derived from `--model-path`, and the engine type from `--engine-type`. This mode requires the `aiconfigurator` SDK, installable via `pip install ai-dynamo[mocker]`. **AIC Model (`--aic-perf-model`):** Uses the NVIDIA AI Configurator (AIC) SDK for latency prediction. AIC provides calibrated performance models for specific GPU/model/engine combinations, predicting prefill and decode latency as a function of batch size, sequence length, and prefix cache hits. The model path is automatically derived from `--model-path`, and the engine type from `--engine-type`. This mode is opt-in and requires both the `aiconfigurator` SDK and loadable systems/perf data for the requested tuple.
### Bootstrap Rendezvous (Disaggregated Serving) ### Bootstrap Rendezvous (Disaggregated Serving)
......
...@@ -384,6 +384,7 @@ async fn apply_entry( ...@@ -384,6 +384,7 @@ async fn apply_entry(
token_sequence: Some(block_hashes), token_sequence: Some(block_hashes),
isl, isl,
overlap: 0, overlap: 0,
track_prefill_tokens: true,
expected_output_tokens: Some(output_length as u32), expected_output_tokens: Some(output_length as u32),
worker, worker,
lora_name: None, lora_name: None,
......
...@@ -497,6 +497,8 @@ impl RouterHandles { ...@@ -497,6 +497,8 @@ impl RouterHandles {
let config_override = if is_disaggregated { let config_override = if is_disaggregated {
Some(RouterConfigOverride { Some(RouterConfigOverride {
overlap_score_weight: Some(0.0), overlap_score_weight: Some(0.0),
assume_kv_reuse: Some(false),
track_prefill_tokens: Some(false),
..Default::default() ..Default::default()
}) })
} else { } else {
...@@ -573,6 +575,9 @@ fn kv_router_config_from_env() -> KvRouterConfig { ...@@ -573,6 +575,9 @@ fn kv_router_config_from_env() -> KvRouterConfig {
if let Some(v) = env_bool("DYN_ROUTER_TRACK_OUTPUT_BLOCKS") { if let Some(v) = env_bool("DYN_ROUTER_TRACK_OUTPUT_BLOCKS") {
cfg.router_track_output_blocks = v; cfg.router_track_output_blocks = v;
} }
if let Some(v) = env_bool("DYN_ROUTER_TRACK_PREFILL_TOKENS") {
cfg.router_track_prefill_tokens = v;
}
if let Some(v) = env_f64("DYN_ROUTER_QUEUE_THRESHOLD") { if let Some(v) = env_f64("DYN_ROUTER_QUEUE_THRESHOLD") {
cfg.router_queue_threshold = Some(v); cfg.router_queue_threshold = Some(v);
} }
...@@ -584,6 +589,7 @@ fn kv_router_config_from_env() -> KvRouterConfig { ...@@ -584,6 +589,7 @@ fn kv_router_config_from_env() -> KvRouterConfig {
router_replica_sync = cfg.router_replica_sync, router_replica_sync = cfg.router_replica_sync,
router_track_active_blocks = cfg.router_track_active_blocks, router_track_active_blocks = cfg.router_track_active_blocks,
router_track_output_blocks = cfg.router_track_output_blocks, router_track_output_blocks = cfg.router_track_output_blocks,
router_track_prefill_tokens = cfg.router_track_prefill_tokens,
router_queue_threshold = ?cfg.router_queue_threshold, router_queue_threshold = ?cfg.router_queue_threshold,
"KvRouterConfig initialized (DYN_* env overrides applied)" "KvRouterConfig initialized (DYN_* env overrides applied)"
); );
...@@ -862,6 +868,12 @@ pub unsafe extern "C" fn add_request( ...@@ -862,6 +868,12 @@ pub unsafe extern "C" fn add_request(
tokio::time::timeout(timeout_duration, async { tokio::time::timeout(timeout_duration, async {
let worker = WorkerWithDpRank::new(worker_id, dp_rank); let worker = WorkerWithDpRank::new(worker_id, dp_rank);
let router_config_override = RouterConfigOverride {
overlap_score_weight: Some(0.0),
assume_kv_reuse: Some(false),
track_prefill_tokens: Some(false),
..Default::default()
};
// Compute overlap_blocks using the public method // Compute overlap_blocks using the public method
let overlap_blocks = match decode_router let overlap_blocks = match decode_router
...@@ -884,7 +896,7 @@ pub unsafe extern "C" fn add_request( ...@@ -884,7 +896,7 @@ pub unsafe extern "C" fn add_request(
None, None,
worker, worker,
None, // lora_name None, // lora_name
None, // router_config_override Some(&router_config_override),
) )
.await; .await;
......
...@@ -58,7 +58,7 @@ impl KvRouterConfig { ...@@ -58,7 +58,7 @@ impl KvRouterConfig {
#[pymethods] #[pymethods]
impl KvRouterConfig { impl KvRouterConfig {
#[new] #[new]
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_enable_cache_control=false, min_initial_workers=1, router_queue_policy="fcfs", remote_indexer_component=None))] #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_enable_cache_control=false, min_initial_workers=1, router_queue_policy="fcfs", remote_indexer_component=None))]
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
fn new( fn new(
overlap_score_weight: f64, overlap_score_weight: f64,
...@@ -69,6 +69,7 @@ impl KvRouterConfig { ...@@ -69,6 +69,7 @@ impl KvRouterConfig {
router_track_active_blocks: bool, router_track_active_blocks: bool,
router_track_output_blocks: bool, router_track_output_blocks: bool,
router_assume_kv_reuse: bool, router_assume_kv_reuse: bool,
router_track_prefill_tokens: bool,
router_snapshot_threshold: Option<u32>, router_snapshot_threshold: Option<u32>,
router_reset_states: bool, router_reset_states: bool,
router_ttl_secs: f64, router_ttl_secs: f64,
...@@ -91,6 +92,7 @@ impl KvRouterConfig { ...@@ -91,6 +92,7 @@ impl KvRouterConfig {
router_track_active_blocks, router_track_active_blocks,
router_track_output_blocks, router_track_output_blocks,
router_assume_kv_reuse, router_assume_kv_reuse,
router_track_prefill_tokens,
router_snapshot_threshold, router_snapshot_threshold,
router_reset_states, router_reset_states,
router_ttl_secs, router_ttl_secs,
......
This diff is collapsed.
...@@ -1217,6 +1217,7 @@ class MockEngineArgs: ...@@ -1217,6 +1217,7 @@ class MockEngineArgs:
dp_size: int = 1, dp_size: int = 1,
startup_time: Optional[float] = None, startup_time: Optional[float] = None,
worker_type: str = "aggregated", worker_type: str = "aggregated",
planner_profile_data: Optional[str | os.PathLike[str]] = None,
aic_backend: Optional[str] = None, aic_backend: Optional[str] = None,
aic_system: Optional[str] = None, aic_system: Optional[str] = None,
aic_backend_version: Optional[str] = None, aic_backend_version: Optional[str] = None,
...@@ -1239,6 +1240,8 @@ class MockEngineArgs: ...@@ -1239,6 +1240,8 @@ class MockEngineArgs:
def from_json(config_json: str) -> "MockEngineArgs": def from_json(config_json: str) -> "MockEngineArgs":
... ...
def dump_json(self) -> str: ...
@property @property
def block_size(self) -> int: ... def block_size(self) -> int: ...
...@@ -1376,8 +1379,12 @@ async def run_input(runtime: DistributedRuntime, input: str, engine_config: Engi ...@@ -1376,8 +1379,12 @@ async def run_input(runtime: DistributedRuntime, input: str, engine_config: Engi
def run_mocker_trace_replay( def run_mocker_trace_replay(
trace_file: str | os.PathLike[str], trace_file: str | os.PathLike[str],
extra_engine_args: Optional[MockEngineArgs] = None, extra_engine_args: Optional[MockEngineArgs] = None,
prefill_engine_args: Optional[MockEngineArgs] = None,
decode_engine_args: Optional[MockEngineArgs] = None,
router_config: Optional[KvRouterConfig] = None, router_config: Optional[KvRouterConfig] = None,
num_workers: int = 1, num_workers: int = 1,
num_prefill_workers: int = 1,
num_decode_workers: int = 1,
replay_concurrency: Optional[int] = None, replay_concurrency: Optional[int] = None,
replay_mode: Literal["offline", "online"] = "offline", replay_mode: Literal["offline", "online"] = "offline",
router_mode: Literal["round_robin", "kv_router"] = "round_robin", router_mode: Literal["round_robin", "kv_router"] = "round_robin",
...@@ -1391,8 +1398,12 @@ def run_mocker_synthetic_trace_replay( ...@@ -1391,8 +1398,12 @@ def run_mocker_synthetic_trace_replay(
output_tokens: int, output_tokens: int,
request_count: int, request_count: int,
extra_engine_args: Optional[MockEngineArgs] = None, extra_engine_args: Optional[MockEngineArgs] = None,
prefill_engine_args: Optional[MockEngineArgs] = None,
decode_engine_args: Optional[MockEngineArgs] = None,
router_config: Optional[KvRouterConfig] = None, router_config: Optional[KvRouterConfig] = None,
num_workers: int = 1, num_workers: int = 1,
num_prefill_workers: int = 1,
num_decode_workers: int = 1,
replay_concurrency: Optional[int] = None, replay_concurrency: Optional[int] = None,
replay_mode: Literal["offline", "online"] = "offline", replay_mode: Literal["offline", "online"] = "offline",
router_mode: Literal["round_robin", "kv_router"] = "round_robin", router_mode: Literal["round_robin", "kv_router"] = "round_robin",
......
...@@ -11,8 +11,12 @@ def run_trace_replay( ...@@ -11,8 +11,12 @@ def run_trace_replay(
trace_file, trace_file,
*, *,
extra_engine_args=None, extra_engine_args=None,
prefill_engine_args=None,
decode_engine_args=None,
router_config=None, router_config=None,
num_workers=1, num_workers=1,
num_prefill_workers=1,
num_decode_workers=1,
replay_concurrency=None, replay_concurrency=None,
replay_mode="offline", replay_mode="offline",
router_mode="round_robin", router_mode="round_robin",
...@@ -21,8 +25,12 @@ def run_trace_replay( ...@@ -21,8 +25,12 @@ def run_trace_replay(
return _run_mocker_trace_replay( return _run_mocker_trace_replay(
trace_file, trace_file,
extra_engine_args=extra_engine_args, extra_engine_args=extra_engine_args,
prefill_engine_args=prefill_engine_args,
decode_engine_args=decode_engine_args,
router_config=router_config, router_config=router_config,
num_workers=num_workers, num_workers=num_workers,
num_prefill_workers=num_prefill_workers,
num_decode_workers=num_decode_workers,
replay_concurrency=replay_concurrency, replay_concurrency=replay_concurrency,
replay_mode=replay_mode, replay_mode=replay_mode,
router_mode=router_mode, router_mode=router_mode,
...@@ -36,8 +44,12 @@ def run_synthetic_trace_replay( ...@@ -36,8 +44,12 @@ def run_synthetic_trace_replay(
request_count, request_count,
*, *,
extra_engine_args=None, extra_engine_args=None,
prefill_engine_args=None,
decode_engine_args=None,
router_config=None, router_config=None,
num_workers=1, num_workers=1,
num_prefill_workers=1,
num_decode_workers=1,
replay_concurrency=None, replay_concurrency=None,
replay_mode="offline", replay_mode="offline",
router_mode="round_robin", router_mode="round_robin",
...@@ -53,8 +65,12 @@ def run_synthetic_trace_replay( ...@@ -53,8 +65,12 @@ def run_synthetic_trace_replay(
output_tokens, output_tokens,
request_count, request_count,
extra_engine_args=extra_engine_args, extra_engine_args=extra_engine_args,
prefill_engine_args=prefill_engine_args,
decode_engine_args=decode_engine_args,
router_config=router_config, router_config=router_config,
num_workers=num_workers, num_workers=num_workers,
num_prefill_workers=num_prefill_workers,
num_decode_workers=num_decode_workers,
replay_concurrency=replay_concurrency, replay_concurrency=replay_concurrency,
replay_mode=replay_mode, replay_mode=replay_mode,
router_mode=router_mode, router_mode=router_mode,
......
...@@ -4,24 +4,80 @@ ...@@ -4,24 +4,80 @@
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import importlib
import json import json
import os import os
import sys import sys
from collections.abc import Sequence from collections.abc import Sequence
from pathlib import Path from pathlib import Path
from types import SimpleNamespace
from typing import Protocol
os.environ.setdefault("DYNAMO_SKIP_PYTHON_LOG_INIT", "1") os.environ.setdefault("DYNAMO_SKIP_PYTHON_LOG_INIT", "1")
from dynamo.llm import KvRouterConfig, MockEngineArgs from dynamo.llm import KvRouterConfig, MockEngineArgs
from dynamo.mocker.args import resolve_planner_profile_data
from dynamo.replay import run_synthetic_trace_replay, run_trace_replay from dynamo.replay import run_synthetic_trace_replay, run_trace_replay
from dynamo.replay.reporting import format_report_table, write_report_json from dynamo.replay.reporting import format_report_table, write_report_json
class PlannerProfileDataResult(Protocol):
    """Structural type for the value returned by ``resolve_planner_profile_data``.

    Any object that exposes an ``npz_path`` attribute satisfies this protocol.
    """

    # Path of the NPZ file to use, or ``None`` when there is no NPZ path.
    npz_path: Path | None
def resolve_planner_profile_data(
    planner_profile_data: Path | None,
) -> PlannerProfileDataResult:
    """Resolve planner profile data, delegating to ``dynamo.mocker.args`` when importable.

    The mocker module is imported lazily. If that import raises ``ImportError``,
    a minimal fallback is used instead: the input path is returned as
    ``npz_path`` only when its suffix is ``.npz``; otherwise ``npz_path`` is
    ``None``.

    Args:
        planner_profile_data: Path supplied by the caller, or ``None``.

    Returns:
        An object exposing an ``npz_path`` attribute.
    """
    try:
        mocker_args = importlib.import_module("dynamo.mocker.args")
    except ImportError:
        mocker_args = None

    # The delegate call stays outside the try block so that an ImportError
    # raised by the real resolver itself is not swallowed by the fallback.
    if mocker_args is not None:
        return mocker_args.resolve_planner_profile_data(planner_profile_data)

    is_npz_file = (
        planner_profile_data is not None and planner_profile_data.suffix == ".npz"
    )
    return SimpleNamespace(npz_path=planner_profile_data if is_npz_file else None)
def _load_engine_args(raw_args: str | None):
if raw_args is None:
return None
raw = json.loads(raw_args)
if not isinstance(raw, dict):
raise ValueError("engine-args must be a JSON object")
worker_type = raw.pop("worker_type", None)
if worker_type is not None:
if "is_prefill" in raw or "is_decode" in raw:
raise ValueError(
"worker_type cannot be combined with is_prefill or is_decode"
)
if worker_type == "prefill":
raw["is_prefill"] = True
elif worker_type == "decode":
raw["is_decode"] = True
elif worker_type != "aggregated":
raise ValueError(
"worker_type must be one of 'aggregated', 'prefill', or 'decode'"
)
if "planner_profile_data" in raw:
profile_data_result = resolve_planner_profile_data(
Path(raw["planner_profile_data"])
)
if profile_data_result.npz_path is not None:
raw["planner_profile_data"] = str(profile_data_result.npz_path)
else:
del raw["planner_profile_data"]
return MockEngineArgs.from_json(json.dumps(raw))
def main(argv: Sequence[str] | None = None) -> int: def main(argv: Sequence[str] | None = None) -> int:
parser = argparse.ArgumentParser(prog="python -m dynamo.replay") parser = argparse.ArgumentParser(prog="python -m dynamo.replay")
parser.add_argument("trace_file", nargs="?") parser.add_argument("trace_file", nargs="?")
parser.add_argument("--extra-engine-args") parser.add_argument("--extra-engine-args")
parser.add_argument("--prefill-engine-args")
parser.add_argument("--decode-engine-args")
parser.add_argument("--router-config") parser.add_argument("--router-config")
parser.add_argument("--input-tokens", type=int) parser.add_argument("--input-tokens", type=int)
parser.add_argument("--output-tokens", type=int) parser.add_argument("--output-tokens", type=int)
...@@ -36,6 +92,8 @@ def main(argv: Sequence[str] | None = None) -> int: ...@@ -36,6 +92,8 @@ def main(argv: Sequence[str] | None = None) -> int:
parser.add_argument("--num-prefix-groups", type=int, default=0) parser.add_argument("--num-prefix-groups", type=int, default=0)
parser.add_argument("--inter-turn-delay-ms", type=float, default=0.0) parser.add_argument("--inter-turn-delay-ms", type=float, default=0.0)
parser.add_argument("--num-workers", type=int, default=1) parser.add_argument("--num-workers", type=int, default=1)
parser.add_argument("--num-prefill-workers", type=int, default=1)
parser.add_argument("--num-decode-workers", type=int, default=1)
parser.add_argument("--replay-concurrency", type=int) parser.add_argument("--replay-concurrency", type=int)
parser.add_argument( parser.add_argument(
"--replay-mode", "--replay-mode",
...@@ -74,24 +132,9 @@ def main(argv: Sequence[str] | None = None) -> int: ...@@ -74,24 +132,9 @@ def main(argv: Sequence[str] | None = None) -> int:
"synthetic replay requires --input-tokens, --output-tokens, and --request-count" "synthetic replay requires --input-tokens, --output-tokens, and --request-count"
) )
# Resolve planner_profile_data directory -> NPZ before passing to Rust. extra_engine_args = _load_engine_args(args.extra_engine_args)
# Rust only accepts NPZ files; resolve_planner_profile_data handles conversion. prefill_engine_args = _load_engine_args(args.prefill_engine_args)
profile_data_result = None decode_engine_args = _load_engine_args(args.decode_engine_args)
if args.extra_engine_args is not None:
raw = json.loads(args.extra_engine_args)
if "planner_profile_data" in raw:
profile_data_result = resolve_planner_profile_data(
Path(raw["planner_profile_data"])
)
if profile_data_result.npz_path is not None:
raw["planner_profile_data"] = str(profile_data_result.npz_path)
else:
del raw["planner_profile_data"]
extra_engine_args = MockEngineArgs.from_json(json.dumps(raw))
else:
extra_engine_args = MockEngineArgs.from_json(args.extra_engine_args)
else:
extra_engine_args = None
router_config = ( router_config = (
KvRouterConfig.from_json(args.router_config) KvRouterConfig.from_json(args.router_config)
if args.router_config is not None if args.router_config is not None
...@@ -102,8 +145,12 @@ def main(argv: Sequence[str] | None = None) -> int: ...@@ -102,8 +145,12 @@ def main(argv: Sequence[str] | None = None) -> int:
report = run_trace_replay( report = run_trace_replay(
args.trace_file, args.trace_file,
extra_engine_args=extra_engine_args, extra_engine_args=extra_engine_args,
prefill_engine_args=prefill_engine_args,
decode_engine_args=decode_engine_args,
router_config=router_config, router_config=router_config,
num_workers=args.num_workers, num_workers=args.num_workers,
num_prefill_workers=args.num_prefill_workers,
num_decode_workers=args.num_decode_workers,
replay_concurrency=args.replay_concurrency, replay_concurrency=args.replay_concurrency,
replay_mode=args.replay_mode, replay_mode=args.replay_mode,
router_mode=args.router_mode, router_mode=args.router_mode,
...@@ -115,8 +162,12 @@ def main(argv: Sequence[str] | None = None) -> int: ...@@ -115,8 +162,12 @@ def main(argv: Sequence[str] | None = None) -> int:
args.output_tokens, args.output_tokens,
args.request_count, args.request_count,
extra_engine_args=extra_engine_args, extra_engine_args=extra_engine_args,
prefill_engine_args=prefill_engine_args,
decode_engine_args=decode_engine_args,
router_config=router_config, router_config=router_config,
num_workers=args.num_workers, num_workers=args.num_workers,
num_prefill_workers=args.num_prefill_workers,
num_decode_workers=args.num_decode_workers,
replay_concurrency=args.replay_concurrency, replay_concurrency=args.replay_concurrency,
replay_mode=args.replay_mode, replay_mode=args.replay_mode,
router_mode=args.router_mode, router_mode=args.router_mode,
......
...@@ -5,12 +5,13 @@ import json ...@@ -5,12 +5,13 @@ import json
import os import os
import subprocess import subprocess
import sys import sys
from pathlib import Path
import numpy as np
import pytest import pytest
from dynamo.llm import KvRouterConfig, MockEngineArgs from dynamo.llm import KvRouterConfig, MockEngineArgs
from dynamo.replay import run_synthetic_trace_replay, run_trace_replay from dynamo.replay import run_synthetic_trace_replay, run_trace_replay
from dynamo.replay.main import main
from dynamo.replay.reporting import format_report_table, write_report_json from dynamo.replay.reporting import format_report_table, write_report_json
pytestmark = [ pytestmark = [
...@@ -75,6 +76,7 @@ def _router_config_payload(): ...@@ -75,6 +76,7 @@ def _router_config_payload():
"router_track_active_blocks": True, "router_track_active_blocks": True,
"router_track_output_blocks": False, "router_track_output_blocks": False,
"router_assume_kv_reuse": True, "router_assume_kv_reuse": True,
"router_track_prefill_tokens": True,
"router_snapshot_threshold": 1000000, "router_snapshot_threshold": 1000000,
"router_reset_states": False, "router_reset_states": False,
"router_ttl_secs": 120.0, "router_ttl_secs": 120.0,
...@@ -194,6 +196,14 @@ def _sglang_args(): ...@@ -194,6 +196,14 @@ def _sglang_args():
return MockEngineArgs.from_json(json.dumps(_sglang_args_payload())) return MockEngineArgs.from_json(json.dumps(_sglang_args_payload()))
def _prefill_args():
    """Build ``MockEngineArgs`` for a prefill worker (block_size 64, speedup_ratio 1000.0)."""
    prefill_settings = {
        "block_size": 64,
        "speedup_ratio": 1000.0,
        "worker_type": "prefill",
    }
    return MockEngineArgs(**prefill_settings)
def _decode_args():
    """Build ``MockEngineArgs`` for a decode worker (block_size 64, speedup_ratio 1000.0)."""
    decode_settings = {
        "block_size": 64,
        "speedup_ratio": 1000.0,
        "worker_type": "decode",
    }
    return MockEngineArgs(**decode_settings)
def _write_router_config(tmp_path): def _write_router_config(tmp_path):
config_path = tmp_path / "router_config.json" config_path = tmp_path / "router_config.json"
config_path.write_text( config_path.write_text(
...@@ -229,8 +239,12 @@ def _assert_basic_report_metrics(report): ...@@ -229,8 +239,12 @@ def _assert_basic_report_metrics(report):
def _replay_cli_env() -> dict[str, str]: def _replay_cli_env() -> dict[str, str]:
repo_root = Path(__file__).resolve().parents[4]
env = os.environ.copy() env = os.environ.copy()
pythonpath_entries = ["lib/bindings/python/src", "components/src"] pythonpath_entries = [
str(repo_root / "lib/bindings/python/src"),
str(repo_root / "components/src"),
]
existing_pythonpath = env.get("PYTHONPATH") existing_pythonpath = env.get("PYTHONPATH")
if existing_pythonpath: if existing_pythonpath:
pythonpath_entries.append(existing_pythonpath) pythonpath_entries.append(existing_pythonpath)
...@@ -238,6 +252,33 @@ def _replay_cli_env() -> dict[str, str]: ...@@ -238,6 +252,33 @@ def _replay_cli_env() -> dict[str, str]:
return env return env
def _planner_profile_data_npz_path() -> Path:
return (
Path(__file__).resolve().parents[4]
/ "benchmarks/results/H200_TP1P_TP1D_perf_data.npz"
)
def _planner_profile_data_dir_path() -> Path:
return (
Path(__file__).resolve().parents[4]
/ "tests/planner/profiling_results/H200_TP1P_TP1D"
)
def _write_planner_profile_data_npz(tmp_path: Path) -> Path:
planner_profile_data = tmp_path / "planner_profile_data.npz"
np.savez(
planner_profile_data,
prefill_isl=np.array([128.0, 256.0]),
prefill_ttft_ms=np.array([4.0, 8.0]),
decode_active_kv_tokens=np.array([1024.0, 2048.0]),
decode_context_length=np.array([128.0, 256.0]),
decode_itl=np.array([[1.0, 1.5], [2.0, 2.5]]),
)
return planner_profile_data
def _run_replay_cli(tmp_path, *args): def _run_replay_cli(tmp_path, *args):
return subprocess.run( return subprocess.run(
[ [
...@@ -264,11 +305,27 @@ def _assert_replay_cli_outputs(completed, report_path): ...@@ -264,11 +305,27 @@ def _assert_replay_cli_outputs(completed, report_path):
@pytest.mark.parametrize("engine_type", ["vllm", "sglang"]) @pytest.mark.parametrize("engine_type", ["vllm", "sglang"])
@pytest.mark.parametrize("replay_mode", ["offline", "online"]) @pytest.mark.parametrize("replay_mode", ["offline", "online"])
@pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"]) @pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
def test_run_trace_replay_smoke_matrix(tmp_path, engine_type, replay_mode, router_mode): @pytest.mark.parametrize("serving_mode", ["agg", "disagg"])
def test_run_trace_replay_smoke_matrix(
tmp_path, engine_type, replay_mode, router_mode, serving_mode
):
trace_path = _write_trace_and_args(tmp_path) trace_path = _write_trace_and_args(tmp_path)
if serving_mode == "disagg":
if replay_mode != "offline":
pytest.skip("disagg replay only supports offline mode")
report = run_trace_replay(
trace_path,
prefill_engine_args=_prefill_args(),
decode_engine_args=_decode_args(),
router_config=_router_config(),
num_prefill_workers=2,
num_decode_workers=2,
replay_mode=replay_mode,
router_mode=router_mode,
)
else:
args_path = _vllm_args() if engine_type == "vllm" else _sglang_args() args_path = _vllm_args() if engine_type == "vllm" else _sglang_args()
num_workers = 1 if router_mode == "round_robin" else 2 num_workers = 1 if router_mode == "round_robin" else 2
report = run_trace_replay( report = run_trace_replay(
trace_path, trace_path,
extra_engine_args=args_path, extra_engine_args=args_path,
...@@ -345,12 +402,29 @@ def test_run_trace_replay_supports_multiturn_sessions(tmp_path, replay_mode): ...@@ -345,12 +402,29 @@ def test_run_trace_replay_supports_multiturn_sessions(tmp_path, replay_mode):
@pytest.mark.parametrize("engine_type", ["vllm", "sglang"]) @pytest.mark.parametrize("engine_type", ["vllm", "sglang"])
@pytest.mark.parametrize("replay_mode", ["offline", "online"]) @pytest.mark.parametrize("replay_mode", ["offline", "online"])
@pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"]) @pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
@pytest.mark.parametrize("serving_mode", ["agg", "disagg"])
def test_run_synthetic_trace_replay_smoke_matrix( def test_run_synthetic_trace_replay_smoke_matrix(
tmp_path, engine_type, replay_mode, router_mode tmp_path, engine_type, replay_mode, router_mode, serving_mode
): ):
if serving_mode == "disagg":
if replay_mode != "offline":
pytest.skip("disagg replay only supports offline mode")
report = run_synthetic_trace_replay(
64,
2,
2,
prefill_engine_args=_prefill_args(),
decode_engine_args=_decode_args(),
router_config=_router_config(),
num_prefill_workers=2,
num_decode_workers=2,
replay_mode=replay_mode,
router_mode=router_mode,
arrival_interval_ms=5.0,
)
else:
args_path = _vllm_args() if engine_type == "vllm" else _sglang_args() args_path = _vllm_args() if engine_type == "vllm" else _sglang_args()
num_workers = 1 if router_mode == "round_robin" else 2 num_workers = 1 if router_mode == "round_robin" else 2
report = run_synthetic_trace_replay( report = run_synthetic_trace_replay(
64, 64,
2, 2,
...@@ -553,6 +627,103 @@ def test_run_trace_replay_accepts_partial_extra_engine_args_json(tmp_path, repla ...@@ -553,6 +627,103 @@ def test_run_trace_replay_accepts_partial_extra_engine_args_json(tmp_path, repla
) )
@pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
def test_run_trace_replay_supports_disagg_offline(tmp_path, router_mode):
    """Offline trace replay with separate prefill and decode engine args returns a valid report.

    Runs once per router mode with two prefill and two decode workers, then
    checks the report's request/token counts and the basic metrics.
    """
    trace_path = _write_trace_and_args(tmp_path)
    disagg_kwargs = {
        "prefill_engine_args": _prefill_args(),
        "decode_engine_args": _decode_args(),
        "router_config": _router_config(),
        "num_prefill_workers": 2,
        "num_decode_workers": 2,
        "replay_mode": "offline",
        "router_mode": router_mode,
    }

    report = run_trace_replay(trace_path, **disagg_kwargs)

    expected_counts = {"num_requests": 2, "input_tokens": 64, "output_tokens": 2}
    _assert_basic_report_counts(report, **expected_counts)
    _assert_basic_report_metrics(report)
@pytest.mark.parametrize("router_mode", ["round_robin", "kv_router"])
def test_run_synthetic_trace_replay_disagg_preserves_expected_output_tokens(
    router_mode,
):
    """Synthetic offline disagg replay reports the requested request and token counts.

    Six synthetic requests of 128 input tokens and 7 output tokens are
    replayed; the report must reflect exactly those numbers.
    """
    input_tokens, output_tokens, request_count = 128, 7, 6
    disagg_kwargs = {
        "prefill_engine_args": _prefill_args(),
        "decode_engine_args": _decode_args(),
        "router_config": _router_config(),
        "num_prefill_workers": 2,
        "num_decode_workers": 2,
        "replay_mode": "offline",
        "router_mode": router_mode,
    }

    report = run_synthetic_trace_replay(
        input_tokens, output_tokens, request_count, **disagg_kwargs
    )

    _assert_basic_report_counts(
        report,
        num_requests=request_count,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
    )
    _assert_basic_report_metrics(report)
def test_run_trace_replay_rejects_partial_disagg_args(tmp_path):
    """Passing only prefill engine args (no decode engine args) must raise an error."""
    trace_path = _write_trace_and_args(tmp_path)
    expect_failure = pytest.raises(Exception, match="must be provided together")

    with expect_failure:
        run_trace_replay(
            trace_path,
            replay_mode="offline",
            router_mode="kv_router",
            prefill_engine_args=_prefill_args(),
        )
def test_run_trace_replay_rejects_online_disagg(tmp_path):
    """Requesting disagg replay with ``replay_mode="online"`` must raise an error."""
    trace_path = _write_trace_and_args(tmp_path)
    expect_failure = pytest.raises(
        Exception, match="disagg replay only supports replay_mode='offline'"
    )

    with expect_failure:
        run_trace_replay(
            trace_path,
            prefill_engine_args=_prefill_args(),
            decode_engine_args=_decode_args(),
            router_config=_router_config(),
            num_prefill_workers=2,
            num_decode_workers=2,
            replay_mode="online",
            router_mode="kv_router",
        )
def test_run_trace_replay_rejects_disagg_worker_counts_for_aggregated_mode(tmp_path):
    """Per-role worker counts passed alongside plain extra_engine_args raise an error."""
    trace_path = _write_trace_and_args(tmp_path)
    expected_message = (
        "num_prefill_workers and num_decode_workers are only used for disagg replay"
    )

    with pytest.raises(Exception, match=expected_message):
        # No prefill/decode engine args here: only extra_engine_args and num_workers,
        # combined with the two per-role worker counts.
        aggregated_kwargs = {
            "extra_engine_args": MockEngineArgs(block_size=64, speedup_ratio=1000.0),
            "num_workers": 1,
            "num_prefill_workers": 2,
            "num_decode_workers": 2,
            "replay_mode": "offline",
        }
        run_trace_replay(trace_path, **aggregated_kwargs)
def test_format_report_table_matches_aiperf_shape(): def test_format_report_table_matches_aiperf_shape():
report = { report = {
"mean_ttft_ms": 18.26, "mean_ttft_ms": 18.26,
...@@ -616,91 +787,49 @@ def test_write_report_json_creates_file(tmp_path): ...@@ -616,91 +787,49 @@ def test_write_report_json_creates_file(tmp_path):
) )
def test_replay_cli_prints_table_and_saves_json(tmp_path, monkeypatch, capsys): @pytest.mark.timeout(30)
report = { def test_replay_cli_subprocess_synthetic_smoke(tmp_path):
"mean_ttft_ms": 10.0, report_path = tmp_path / "synthetic_report.json"
"min_ttft_ms": 9.0,
"max_ttft_ms": 12.0,
"p99_ttft_ms": 12.0,
"p90_ttft_ms": 11.0,
"p75_ttft_ms": 10.5,
"std_ttft_ms": 1.0,
"output_throughput_tok_s": 123.0,
"request_throughput_rps": 4.0,
"completed_requests": 3,
}
def fake_run(*args, **kwargs):
return report
monkeypatch.setattr("dynamo.replay.main.run_synthetic_trace_replay", fake_run)
report_path = tmp_path / "cli_report.json"
exit_code = main( completed = _run_replay_cli(
[ tmp_path,
"--input-tokens", "--input-tokens",
"16", "250",
"--output-tokens", "--output-tokens",
"8", "25",
"--request-count", "--request-count",
"3", "10",
"--num-workers",
"4",
"--replay-concurrency",
"4",
"--report-json", "--report-json",
str(report_path), str(report_path),
] "--extra-engine-args",
'{"block_size":64,"speedup_ratio":1000.0}',
) )
assert exit_code == 0 report = _assert_replay_cli_outputs(completed, report_path)
stdout = capsys.readouterr().out _assert_basic_report_counts(
assert "NVIDIA AIPerf | LLM Metrics" in stdout report,
assert "Saved full report to:" in stdout num_requests=10,
assert '"completed_requests"' not in stdout input_tokens=250,
assert json.loads(report_path.read_text(encoding="utf-8")) == report output_tokens=25,
def test_replay_cli_passes_multiturn_workload_kwargs(monkeypatch):
captured = {}
def fake_run(*args, **kwargs):
captured["args"] = args
captured["kwargs"] = kwargs
return {
"completed_requests": 4,
"request_throughput_rps": 1.0,
"output_throughput_tok_s": 1.0,
}
monkeypatch.setattr("dynamo.replay.main.run_synthetic_trace_replay", fake_run)
exit_code = main(
[
"--input-tokens",
"16",
"--output-tokens",
"8",
"--request-count",
"2",
"--turns-per-session",
"2",
"--shared-prefix-ratio",
"0.5",
"--num-prefix-groups",
"3",
"--inter-turn-delay-ms",
"7.0",
]
) )
_assert_basic_report_metrics(report)
assert exit_code == 0
assert captured["args"] == (16, 8, 2)
assert captured["kwargs"]["turns_per_session"] == 2
assert captured["kwargs"]["shared_prefix_ratio"] == 0.5
assert captured["kwargs"]["num_prefix_groups"] == 3
assert captured["kwargs"]["inter_turn_delay_ms"] == 7.0
@pytest.mark.timeout(30) @pytest.mark.timeout(30)
def test_replay_cli_subprocess_synthetic_smoke(tmp_path): @pytest.mark.parametrize("planner_profile_data_kind", ["dir", "npz"])
report_path = tmp_path / "synthetic_report.json" def test_replay_cli_subprocess_synthetic_smoke_accepts_planner_profile_data(
tmp_path, planner_profile_data_kind
):
report_path = tmp_path / f"synthetic_report_{planner_profile_data_kind}.json"
planner_profile_data = (
_planner_profile_data_dir_path()
if planner_profile_data_kind == "dir"
else _write_planner_profile_data_npz(tmp_path)
)
completed = _run_replay_cli( completed = _run_replay_cli(
tmp_path, tmp_path,
...@@ -717,7 +846,13 @@ def test_replay_cli_subprocess_synthetic_smoke(tmp_path): ...@@ -717,7 +846,13 @@ def test_replay_cli_subprocess_synthetic_smoke(tmp_path):
"--report-json", "--report-json",
str(report_path), str(report_path),
"--extra-engine-args", "--extra-engine-args",
'{"block_size":64,"speedup_ratio":1000.0}', json.dumps(
{
"block_size": 64,
"speedup_ratio": 1000.0,
"planner_profile_data": str(planner_profile_data),
}
),
) )
report = _assert_replay_cli_outputs(completed, report_path) report = _assert_replay_cli_outputs(completed, report_path)
...@@ -798,6 +933,40 @@ def test_replay_cli_subprocess_trace_smoke(tmp_path): ...@@ -798,6 +933,40 @@ def test_replay_cli_subprocess_trace_smoke(tmp_path):
_assert_basic_report_metrics(report) _assert_basic_report_metrics(report)
@pytest.mark.timeout(30)
def test_replay_cli_subprocess_trace_disagg_smoke(tmp_path):
    """Smoke-test the replay CLI on a trace with prefill and decode engine args."""
    trace_path = _write_cli_smoke_trace(tmp_path)
    report_path = tmp_path / "trace_disagg_report.json"

    # Engine args are passed to the CLI as JSON strings, one per worker role.
    prefill_json = '{"block_size":64,"speedup_ratio":1000.0,"worker_type":"prefill"}'
    decode_json = '{"block_size":64,"speedup_ratio":1000.0,"worker_type":"decode"}'

    cli_args = [
        str(trace_path),
        "--replay-mode",
        "offline",
        "--router-mode",
        "kv_router",
        "--num-prefill-workers",
        "2",
        "--num-decode-workers",
        "2",
        "--report-json",
        str(report_path),
        "--prefill-engine-args",
        prefill_json,
        "--decode-engine-args",
        decode_json,
    ]
    completed = _run_replay_cli(tmp_path, *cli_args)

    report = _assert_replay_cli_outputs(completed, report_path)
    expected_counts = {"num_requests": 10, "input_tokens": 250, "output_tokens": 25}
    _assert_basic_report_counts(report, **expected_counts)
    _assert_basic_report_metrics(report)
@pytest.mark.timeout(30) @pytest.mark.timeout(30)
def test_replay_cli_subprocess_multiturn_trace_smoke(tmp_path): def test_replay_cli_subprocess_multiturn_trace_smoke(tmp_path):
trace_path = _write_multiturn_trace(tmp_path) trace_path = _write_multiturn_trace(tmp_path)
......
...@@ -8,6 +8,10 @@ use rustc_hash::FxHashMap; ...@@ -8,6 +8,10 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use xxhash_rust::xxh3; use xxhash_rust::xxh3;
/// Default for `track_prefill_tokens` fields that name this function in
/// `#[serde(default = "...")]`: a payload that omits the field deserializes to `true`.
const fn default_track_prefill_tokens() -> bool {
    true
}
/// The event subject that workers publish KV cache events on. /// The event subject that workers publish KV cache events on.
pub const KV_EVENT_SUBJECT: &str = "kv-events"; pub const KV_EVENT_SUBJECT: &str = "kv-events";
...@@ -431,6 +435,8 @@ pub enum ActiveSequenceEventData { ...@@ -431,6 +435,8 @@ pub enum ActiveSequenceEventData {
token_sequence: Option<Vec<SequenceHash>>, token_sequence: Option<Vec<SequenceHash>>,
isl: usize, isl: usize,
overlap: u32, overlap: u32,
#[serde(default = "default_track_prefill_tokens")]
track_prefill_tokens: bool,
expected_output_tokens: Option<u32>, expected_output_tokens: Option<u32>,
}, },
Free, Free,
...@@ -990,14 +996,6 @@ mod tests { ...@@ -990,14 +996,6 @@ mod tests {
assert_ne!(lora_a[0], lora_b[0]); assert_ne!(lora_a[0], lora_b[0]);
} }
#[test]
fn test_lora_name_none_matches_legacy() {
let tokens: Vec<u32> = (0..8).collect();
let hashes_none = compute_block_hash_for_seq(&tokens, 4, BlockHashOptions::default());
let hashes_none2 = compute_block_hash_for_seq(&tokens, 4, BlockHashOptions::default());
assert_eq!(hashes_none, hashes_none2);
}
#[test] #[test]
fn test_lora_name_empty_string_normalized_to_none() { fn test_lora_name_empty_string_normalized_to_none() {
let tokens: Vec<u32> = (0..4).collect(); let tokens: Vec<u32> = (0..4).collect();
...@@ -1172,16 +1170,6 @@ mod tests { ...@@ -1172,16 +1170,6 @@ mod tests {
assert_eq!(deserialized.block_hashes[1].0, 5); assert_eq!(deserialized.block_hashes[1].0, 5);
} }
#[test]
fn test_router_request_mark_free_backwards_compatible_deserialization() {
let request: RouterRequest = serde_json::from_str(r#"{"method":"mark_free"}"#).unwrap();
assert!(matches!(
request,
RouterRequest::MarkFree { request_id: None }
));
}
#[test] #[test]
fn test_router_request_mark_free_serialization_with_request_id() { fn test_router_request_mark_free_serialization_with_request_id() {
let request = RouterRequest::MarkFree { let request = RouterRequest::MarkFree {
......
...@@ -17,6 +17,10 @@ const fn default_min_initial_workers() -> usize { ...@@ -17,6 +17,10 @@ const fn default_min_initial_workers() -> usize {
1 1
} }
/// Default value of `router_track_prefill_tokens`: prefill tokens are tracked.
///
/// Named by the field's `#[serde(default = "...")]` attribute and called from
/// `KvRouterConfig::default()`.
const fn default_track_prefill_tokens() -> bool {
    true
}
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")] #[serde(rename_all = "lowercase")]
pub enum RouterQueuePolicy { pub enum RouterQueuePolicy {
...@@ -63,6 +67,9 @@ pub struct RouterConfigOverride { ...@@ -63,6 +67,9 @@ pub struct RouterConfigOverride {
#[builder(default)] #[builder(default)]
pub assume_kv_reuse: Option<bool>, pub assume_kv_reuse: Option<bool>,
#[builder(default)]
pub track_prefill_tokens: Option<bool>,
} }
/// KV Router configuration parameters /// KV Router configuration parameters
...@@ -98,6 +105,12 @@ pub struct KvRouterConfig { ...@@ -98,6 +105,12 @@ pub struct KvRouterConfig {
/// When false, generates random hashes (assuming no KV cache reuse). /// When false, generates random hashes (assuming no KV cache reuse).
pub router_assume_kv_reuse: bool, pub router_assume_kv_reuse: bool,
/// Whether to include prompt-side prefill tokens in active load accounting (default: true).
/// When false, prompt tokens are excluded from active prefill token tracking, queue pressure,
/// and potential prefill-token load calculations.
#[serde(default = "default_track_prefill_tokens")]
pub router_track_prefill_tokens: bool,
/// Threshold for triggering snapshots. If None, no snapshots will be performed. /// Threshold for triggering snapshots. If None, no snapshots will be performed.
#[validate(range(min = 1))] #[validate(range(min = 1))]
pub router_snapshot_threshold: Option<u32>, pub router_snapshot_threshold: Option<u32>,
...@@ -171,6 +184,7 @@ impl Default for KvRouterConfig { ...@@ -171,6 +184,7 @@ impl Default for KvRouterConfig {
router_track_active_blocks: true, router_track_active_blocks: true,
router_track_output_blocks: false, router_track_output_blocks: false,
router_assume_kv_reuse: true, router_assume_kv_reuse: true,
router_track_prefill_tokens: default_track_prefill_tokens(),
router_snapshot_threshold: Some(1000000), router_snapshot_threshold: Some(1000000),
router_reset_states: false, router_reset_states: false,
router_ttl_secs: 120.0, router_ttl_secs: 120.0,
...@@ -208,6 +222,18 @@ fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationEr ...@@ -208,6 +222,18 @@ fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationEr
} }
impl KvRouterConfig { impl KvRouterConfig {
/// Resolve the effective `assume_kv_reuse` setting.
///
/// A value set on `config_override` wins; with no override, or an override that
/// leaves the field unset, `self.router_assume_kv_reuse` is returned.
pub fn assume_kv_reuse(&self, config_override: Option<&RouterConfigOverride>) -> bool {
    match config_override.and_then(|cfg| cfg.assume_kv_reuse) {
        Some(overridden) => overridden,
        None => self.router_assume_kv_reuse,
    }
}
/// Resolve the effective `track_prefill_tokens` setting.
///
/// A value set on `config_override` wins; with no override, or an override that
/// leaves the field unset, `self.router_track_prefill_tokens` is returned.
pub fn track_prefill_tokens(&self, config_override: Option<&RouterConfigOverride>) -> bool {
    match config_override.and_then(|cfg| cfg.track_prefill_tokens) {
        Some(overridden) => overridden,
        None => self.router_track_prefill_tokens,
    }
}
/// Compute sequence hashes for active block tracking based on configuration. /// Compute sequence hashes for active block tracking based on configuration.
/// ///
/// Returns: /// Returns:
...@@ -231,9 +257,7 @@ impl KvRouterConfig { ...@@ -231,9 +257,7 @@ impl KvRouterConfig {
return Some(Vec::new()); return Some(Vec::new());
} }
let assume_kv_reuse = config_override let assume_kv_reuse = self.assume_kv_reuse(config_override);
.and_then(|cfg| cfg.assume_kv_reuse)
.unwrap_or(self.router_assume_kv_reuse);
if assume_kv_reuse { if assume_kv_reuse {
let block_hashes = match precomputed_block_hashes { let block_hashes = match precomputed_block_hashes {
...@@ -290,6 +314,11 @@ mod tests { ...@@ -290,6 +314,11 @@ mod tests {
assert_eq!(KvRouterConfig::default().min_initial_workers, 1); assert_eq!(KvRouterConfig::default().min_initial_workers, 1);
} }
/// The default router config has prefill-token tracking switched on.
#[test]
fn kv_router_config_defaults_to_tracking_prefill_tokens() {
    let default_config = KvRouterConfig::default();
    assert!(default_config.router_track_prefill_tokens);
}
#[test] #[test]
fn kv_router_config_rejects_zero_initial_workers() { fn kv_router_config_rejects_zero_initial_workers() {
let cfg = KvRouterConfig { let cfg = KvRouterConfig {
...@@ -332,6 +361,17 @@ mod tests { ...@@ -332,6 +361,17 @@ mod tests {
assert_ne!(without_mm, with_mm); assert_ne!(without_mm, with_mm);
} }
/// An explicit `track_prefill_tokens: Some(false)` override survives a JSON round trip.
#[test]
fn router_config_override_serde_round_trip_preserves_track_prefill_tokens() {
    let original = RouterConfigOverride {
        track_prefill_tokens: Some(false),
        ..Default::default()
    };

    let json = serde_json::to_string(&original).unwrap();
    let restored: RouterConfigOverride = serde_json::from_str(&json).unwrap();

    assert_eq!(restored.track_prefill_tokens, Some(false));
}
#[test] #[test]
fn compute_seq_hashes_for_tracking_uses_precomputed_block_hashes() { fn compute_seq_hashes_for_tracking_uses_precomputed_block_hashes() {
let config = KvRouterConfig::default(); let config = KvRouterConfig::default();
......
...@@ -30,6 +30,7 @@ where ...@@ -30,6 +30,7 @@ where
request_tx: mpsc::Sender<SchedulingRequest>, request_tx: mpsc::Sender<SchedulingRequest>,
slots: Arc<ActiveSequencesMultiWorker<P>>, slots: Arc<ActiveSequencesMultiWorker<P>>,
queue: Arc<SchedulerQueue<P, C, S, Sel>>, queue: Arc<SchedulerQueue<P, C, S, Sel>>,
track_prefill_tokens_default: bool,
worker_type: &'static str, worker_type: &'static str,
} }
...@@ -48,6 +49,7 @@ where ...@@ -48,6 +49,7 @@ where
block_size: u32, block_size: u32,
selector: Sel, selector: Sel,
policy: S, policy: S,
track_prefill_tokens_default: bool,
cancellation_token: CancellationToken, cancellation_token: CancellationToken,
worker_type: &'static str, worker_type: &'static str,
monitor_worker_configs: bool, monitor_worker_configs: bool,
...@@ -135,6 +137,7 @@ where ...@@ -135,6 +137,7 @@ where
request_tx, request_tx,
slots, slots,
queue, queue,
track_prefill_tokens_default,
worker_type, worker_type,
} }
} }
...@@ -154,6 +157,9 @@ where ...@@ -154,6 +157,9 @@ where
allowed_worker_ids: Option<HashSet<WorkerId>>, allowed_worker_ids: Option<HashSet<WorkerId>>,
) -> Result<SchedulingResponse, KvSchedulerError> { ) -> Result<SchedulingResponse, KvSchedulerError> {
let (resp_tx, resp_rx) = tokio::sync::oneshot::channel(); let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
let track_prefill_tokens = router_config_override
.and_then(|cfg| cfg.track_prefill_tokens)
.unwrap_or(self.track_prefill_tokens_default);
let request = SchedulingRequest { let request = SchedulingRequest {
maybe_request_id, maybe_request_id,
token_seq, token_seq,
...@@ -161,6 +167,7 @@ where ...@@ -161,6 +167,7 @@ where
overlaps, overlaps,
decode_blocks: HashMap::new(), decode_blocks: HashMap::new(),
prefill_tokens: HashMap::new(), prefill_tokens: HashMap::new(),
track_prefill_tokens,
router_config_override: router_config_override.cloned(), router_config_override: router_config_override.cloned(),
update_states, update_states,
lora_name, lora_name,
...@@ -224,10 +231,16 @@ where ...@@ -224,10 +231,16 @@ where
token_seq: Option<Vec<SequenceHash>>, token_seq: Option<Vec<SequenceHash>>,
isl_tokens: usize, isl_tokens: usize,
overlaps: OverlapScores, overlaps: OverlapScores,
track_prefill_tokens: bool,
) -> Vec<PotentialLoad> { ) -> Vec<PotentialLoad> {
let (decode_blocks, prefill_tokens) = let (decode_blocks, prefill_tokens) = self
self.slots .slots
.potential_blocks_and_tokens(token_seq.as_deref(), isl_tokens, overlaps); .potential_blocks_and_tokens_with_prefill_tracking(
token_seq.as_deref(),
isl_tokens,
overlaps,
track_prefill_tokens,
);
let mut workers: HashSet<WorkerWithDpRank> = HashSet::new(); let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
workers.extend(decode_blocks.keys().copied()); workers.extend(decode_blocks.keys().copied());
...@@ -300,6 +313,7 @@ mod tests { ...@@ -300,6 +313,7 @@ mod tests {
64, 64,
DefaultWorkerSelector::new(None, "test"), DefaultWorkerSelector::new(None, "test"),
FcfsPolicy, FcfsPolicy,
true,
cancel_token.clone(), cancel_token.clone(),
"test", "test",
monitor_worker_configs, monitor_worker_configs,
...@@ -344,6 +358,48 @@ mod tests { ...@@ -344,6 +358,48 @@ mod tests {
cancel_token.cancel(); cancel_token.cancel();
} }
/// Scheduling with a per-request `track_prefill_tokens: Some(false)` override leaves
/// the chosen worker's active token count at zero.
#[tokio::test]
async fn test_schedule_override_can_disable_prefill_tracking() {
    // One worker (id 0) whose config sets max_num_batched_tokens to 64.
    let worker_config = SimpleWorkerConfig {
        max_num_batched_tokens: Some(64),
        ..Default::default()
    };
    let mut worker_configs = HashMap::new();
    worker_configs.insert(0, worker_config);

    let (scheduler, slots, _cfg_tx, cancel_token) = make_scheduler(worker_configs, None, true);

    // Per-request override that turns prefill-token tracking off.
    let no_prefill_tracking = crate::config::RouterConfigOverride {
        track_prefill_tokens: Some(false),
        ..Default::default()
    };
    let scheduled = scheduler
        .schedule(
            Some(String::from("req-1")),
            64,
            Some(vec![1, 2, 3, 4]),
            OverlapScores::default(),
            Some(&no_prefill_tracking),
            true,
            None,
            0.0,
            None,
            None,
        )
        .await;
    scheduled.unwrap();

    let worker = WorkerWithDpRank::new(0, 0);
    let tracked_tokens = slots.active_tokens().get(&worker).copied();
    assert_eq!(tracked_tokens, Some(0));

    cancel_token.cancel();
}
#[tokio::test] #[tokio::test]
async fn test_mark_prefill_completed_drains_pending_queue() { async fn test_mark_prefill_completed_drains_pending_queue() {
let mut workers = HashMap::new(); let mut workers = HashMap::new();
...@@ -474,7 +530,7 @@ mod tests { ...@@ -474,7 +530,7 @@ mod tests {
.collect(); .collect();
expected.sort_by_key(|load| (load.worker_id, load.dp_rank)); expected.sort_by_key(|load| (load.worker_id, load.dp_rank));
let mut actual = scheduler.get_potential_loads(Some(token_seq), 128, overlaps); let mut actual = scheduler.get_potential_loads(Some(token_seq), 128, overlaps, true);
actual.sort_by_key(|load| (load.worker_id, load.dp_rank)); actual.sort_by_key(|load| (load.worker_id, load.dp_rank));
assert_eq!(actual.len(), expected.len()); assert_eq!(actual.len(), expected.len());
...@@ -500,7 +556,7 @@ mod tests { ...@@ -500,7 +556,7 @@ mod tests {
make_scheduler(HashMap::new(), None, false); make_scheduler(HashMap::new(), None, false);
scheduler.register_workers(&HashSet::from([42])); scheduler.register_workers(&HashSet::from([42]));
let loads = scheduler.get_potential_loads(None, 64, OverlapScores::default()); let loads = scheduler.get_potential_loads(None, 64, OverlapScores::default(), true);
assert_eq!(loads.len(), 1); assert_eq!(loads.len(), 1);
assert_eq!(loads[0].worker_id, 42); assert_eq!(loads[0].worker_id, 42);
...@@ -517,7 +573,7 @@ mod tests { ...@@ -517,7 +573,7 @@ mod tests {
assert_eq!( assert_eq!(
scheduler scheduler
.get_potential_loads(None, 64, OverlapScores::default()) .get_potential_loads(None, 64, OverlapScores::default(), true)
.len(), .len(),
1 1
); );
...@@ -536,7 +592,7 @@ mod tests { ...@@ -536,7 +592,7 @@ mod tests {
tokio::time::timeout(Duration::from_secs(1), async { tokio::time::timeout(Duration::from_secs(1), async {
loop { loop {
if scheduler if scheduler
.get_potential_loads(None, 64, OverlapScores::default()) .get_potential_loads(None, 64, OverlapScores::default(), true)
.len() .len()
== 3 == 3
{ {
...@@ -550,4 +606,39 @@ mod tests { ...@@ -550,4 +606,39 @@ mod tests {
cancel_token.cancel(); cancel_token.cancel();
} }
/// With one request already scheduled without any override, querying potential loads
/// with `track_prefill_tokens = false` reports 64 potential prefill tokens for the
/// single worker.
#[tokio::test]
async fn test_get_potential_loads_can_ignore_prefill_tokens() {
    // One worker (id 0) whose config sets max_num_batched_tokens to 256.
    let worker_config = SimpleWorkerConfig {
        max_num_batched_tokens: Some(256),
        ..Default::default()
    };
    let mut worker_configs = HashMap::new();
    worker_configs.insert(0, worker_config);

    let (scheduler, _slots, _cfg_tx, cancel_token) = make_scheduler(worker_configs, None, true);

    // Schedule a first request with no router-config override.
    let scheduled = scheduler
        .schedule(
            Some(String::from("req-1")),
            64,
            Some(vec![11, 22]),
            OverlapScores::default(),
            None,
            true,
            None,
            0.0,
            None,
            None,
        )
        .await;
    scheduled.unwrap();

    // Final argument `false`: ask for loads with prefill-token tracking disabled.
    let potential_loads =
        scheduler.get_potential_loads(None, 64, OverlapScores::default(), false);
    assert_eq!(potential_loads.len(), 1);
    assert_eq!(potential_loads[0].potential_prefill_tokens, 64);

    cancel_token.cancel();
}
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment