"vscode:/vscode.git/clone" did not exist on "3a4252095a3b5db2f9b28a882f01fede523fbbb0"
Unverified Commit 44a76f96 authored by jh-nv's avatar jh-nv Committed by GitHub
Browse files

refactor: update frontend kv-router flags to be consistent with router (#6361)

parent b2075619
...@@ -121,6 +121,18 @@ class FrontendArgGroup(ArgGroup): ...@@ -121,6 +121,18 @@ class FrontendArgGroup(ArgGroup):
help="Interactive text chat.\nenv var: DYN_INTERACTIVE", help="Interactive text chat.\nenv var: DYN_INTERACTIVE",
) )
add_argument(
g,
flag_name="--namespace",
env_var="DYN_NAMESPACE",
default=None,
help=(
"Dynamo namespace for model discovery scoping. If specified, models will "
"only be discovered from this namespace. If not specified, discovers models "
"from all namespaces (global discovery)."
),
)
add_argument( add_argument(
g, g,
flag_name="--kv-cache-block-size", flag_name="--kv-cache-block-size",
...@@ -172,14 +184,16 @@ class FrontendArgGroup(ArgGroup): ...@@ -172,14 +184,16 @@ class FrontendArgGroup(ArgGroup):
) )
add_argument( add_argument(
g, g,
flag_name="--kv-overlap-score-weight", flag_name="--router-kv-overlap-score-weight",
env_var="DYN_KV_OVERLAP_SCORE_WEIGHT", env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
default=1.0, default=1.0,
help=( help=(
"KV Router: Weight for overlap score in worker selection. " "KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse." "Higher values prioritize KV cache reuse."
), ),
arg_type=float, arg_type=float,
dest="kv_overlap_score_weight",
obsolete_flag="--kv-overlap-score-weight",
) )
add_argument( add_argument(
g, g,
...@@ -194,26 +208,29 @@ class FrontendArgGroup(ArgGroup): ...@@ -194,26 +208,29 @@ class FrontendArgGroup(ArgGroup):
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--kv-events", flag_name="--router-kv-events",
env_var="DYN_KV_EVENTS", env_var="DYN_ROUTER_USE_KV_EVENTS",
default=True, default=True,
help=( help=(
"KV Router: Enable/disable KV events. Use --kv-events to enable " "KV Router: Enable/disable KV events. Use --router-kv-events to enable "
"(default, router receives cache state events from workers) or --no-kv-events " "(default, router receives cache state events from workers) or --no-router-kv-events "
"to disable (router predicts cache state based on routing decisions)." "to disable (router predicts cache state based on routing decisions)."
), ),
dest="use_kv_events", dest="use_kv_events",
obsolete_flag="--kv-events",
) )
add_argument( add_argument(
g, g,
flag_name="--router-ttl", flag_name="--router-ttl-secs",
env_var="DYN_ROUTER_TTL", env_var="DYN_ROUTER_TTL_SECS",
default=120.0, default=120.0,
help=( help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. " "KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-kv-events is set." "Only used when --no-router-kv-events is set."
), ),
arg_type=float, arg_type=float,
dest="router_ttl",
obsolete_flag="--router-ttl",
) )
add_argument( add_argument(
g, g,
...@@ -222,7 +239,7 @@ class FrontendArgGroup(ArgGroup): ...@@ -222,7 +239,7 @@ class FrontendArgGroup(ArgGroup):
default=2**20, default=2**20,
help=( help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. " "KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-kv-events is set." "Only used when --no-router-kv-events is set."
), ),
arg_type=int, arg_type=int,
) )
...@@ -233,23 +250,11 @@ class FrontendArgGroup(ArgGroup): ...@@ -233,23 +250,11 @@ class FrontendArgGroup(ArgGroup):
default=0.8, default=0.8,
help=( help=(
"KV Router: Target size ratio after pruning when KV events are disabled. " "KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-kv-events is set." "Only used when --no-router-kv-events is set."
), ),
arg_type=float, arg_type=float,
) )
add_argument(
g,
flag_name="--namespace",
env_var="DYN_NAMESPACE",
default=None,
help=(
"Dynamo namespace for model discovery scoping. If specified, models will "
"only be discovered from this namespace. If not specified, discovers models "
"from all namespaces (global discovery)."
),
)
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-replica-sync", flag_name="--router-replica-sync",
...@@ -284,8 +289,8 @@ class FrontendArgGroup(ArgGroup): ...@@ -284,8 +289,8 @@ class FrontendArgGroup(ArgGroup):
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--durable-kv-events", flag_name="--router-durable-kv-events",
env_var="DYN_DURABLE_KV_EVENTS", env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
default=False, default=False,
help=( help=(
"KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. " "KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. "
...@@ -293,32 +298,36 @@ class FrontendArgGroup(ArgGroup): ...@@ -293,32 +298,36 @@ class FrontendArgGroup(ArgGroup):
"local_indexer mode. Use this flag when you need durability and multi-replica " "local_indexer mode. Use this flag when you need durability and multi-replica "
"consistency. Requires NATS with JetStream enabled." "consistency. Requires NATS with JetStream enabled."
), ),
dest="durable_kv_events",
obsolete_flag="--durable-kv-events",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--track-active-blocks", flag_name="--router-track-active-blocks",
env_var="DYN_TRACK_ACTIVE_BLOCKS", env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
default=True, default=True,
dest="router_track_active_blocks", dest="router_track_active_blocks",
help=( help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). " "KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing. " "By default, active blocks are tracked for load balancing. "
), ),
obsolete_flag="--track-active-blocks",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--assume-kv-reuse", flag_name="--router-assume-kv-reuse",
env_var="DYN_ASSUME_KV_REUSE", env_var="DYN_ROUTER_ASSUME_KV_REUSE",
default=True, default=True,
dest="router_assume_kv_reuse", dest="router_assume_kv_reuse",
help=( help=(
"KV Router: When tracking active blocks, assume KV cache reuse. " "KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)." "Use --no-router-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
), ),
obsolete_flag="--assume-kv-reuse",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--track-output-blocks", flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS", env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False, default=False,
dest="router_track_output_blocks", dest="router_track_output_blocks",
...@@ -327,6 +336,7 @@ class FrontendArgGroup(ArgGroup): ...@@ -327,6 +336,7 @@ class FrontendArgGroup(ArgGroup):
"placeholder blocks as tokens are generated and applies fractional decay based on " "placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected_output_tokens." "progress toward expected_output_tokens."
), ),
obsolete_flag="--track-output-blocks",
) )
add_argument( add_argument(
g, g,
......
...@@ -81,7 +81,7 @@ When `--router-queue-threshold` is set and the queue is active, this value shift ...@@ -81,7 +81,7 @@ When `--router-queue-threshold` is set and the queue is active, this value shift
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways: Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When `--track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. 1. **Output block tracking**: When `--router-track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions. 2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
```json ```json
......
...@@ -32,8 +32,8 @@ Backend workers register themselves using the `register_model` API, after which ...@@ -32,8 +32,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing | | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) | | `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) |
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) | | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--kv-events` / `--no-kv-events` | `--kv-events` | Enable/disable real-time KV event tracking | | `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) | | `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
For all available options: `python -m dynamo.frontend --help` For all available options: `python -m dynamo.frontend --help`
...@@ -71,8 +71,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p ...@@ -71,8 +71,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p
| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` | | `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
| `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` | | `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
| `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific | | `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
| `--no-kv-events` | `DYN_KV_EVENTS=false` | `true` | | `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
| `--kv-overlap-score-weight` | `DYN_KV_OVERLAP_SCORE_WEIGHT` | `1.0` | | `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples). For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).
......
...@@ -49,11 +49,11 @@ A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if ...@@ -49,11 +49,11 @@ A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways: Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When output block tracking is enabled (frontend: `--track-output-blocks`; standalone router: `--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests. 1. **Output block tracking**: When output block tracking is enabled (`--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions. 2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
- **Type**: `u32` (optional) - **Type**: `u32` (optional)
- **Requires**: `--track-output-blocks` (frontend) or `--router-track-output-blocks` (standalone router) for output block tracking behavior - **Requires**: `--router-track-output-blocks` (frontend or standalone router) for output block tracking behavior
### Example ### Example
......
...@@ -135,7 +135,7 @@ spec: ...@@ -135,7 +135,7 @@ spec:
value: kv value: kv
- name: DYN_ROUTER_TEMPERATURE - name: DYN_ROUTER_TEMPERATURE
value: "0.5" # Add some randomness to prevent worker saturation value: "0.5" # Add some randomness to prevent worker saturation
- name: DYN_KV_OVERLAP_SCORE_WEIGHT - name: DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT
value: "1.5" # Prioritize TTFT over ITL value: "1.5" # Prioritize TTFT over ITL
- name: DYN_KV_CACHE_BLOCK_SIZE - name: DYN_KV_CACHE_BLOCK_SIZE
value: "16" value: "16"
......
...@@ -36,8 +36,8 @@ Backend workers register themselves using the `register_model` API, after which ...@@ -36,8 +36,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing | | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) | | `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) |
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) | | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--kv-events` / `--no-kv-events` | `--kv-events` | Enable/disable real-time KV event tracking | | `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) | | `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--router-queue-threshold <float>` | None (disabled) | Queue threshold fraction; enables priority scheduling via `latency_sensitivity` | | `--router-queue-threshold <float>` | None (disabled) | Queue threshold fraction; enables priority scheduling via `latency_sensitivity` |
For all available options: `python -m dynamo.frontend --help` For all available options: `python -m dynamo.frontend --help`
...@@ -78,8 +78,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p ...@@ -78,8 +78,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p
| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` | | `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
| `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` | | `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
| `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific | | `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
| `--no-kv-events` | `DYN_KV_EVENTS=false` | `true` | | `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
| `--kv-overlap-score-weight` | `DYN_KV_OVERLAP_SCORE_WEIGHT` | `1.0` | | `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples). For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).
For A/B testing and advanced K8s setup, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md). For A/B testing and advanced K8s setup, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
...@@ -142,27 +142,27 @@ When KV blocks are created or removed, the engine notifies the Dynamo router, wh ...@@ -142,27 +142,27 @@ When KV blocks are created or removed, the engine notifies the Dynamo router, wh
To evaluate the benefits of KV-aware routing, compare your workload's performance using `--router-mode random|round-robin` against KV-aware routing. To evaluate the benefits of KV-aware routing, compare your workload's performance using `--router-mode random|round-robin` against KV-aware routing.
The main KV-aware routing arguments: The main KV-aware routing arguments (frontend uses the same `--router-*` flag names as the standalone router; legacy names without the prefix are obsolete):
- `--kv-overlap-score-weight`: Controls the importance of prefix cache overlaps in prefill cost calculations. Higher values improve Time To First Token (TTFT) at the cost of Inter-Token Latency (ITL). When set to 0, the router ignores prefix caches and uses pure load balancing. Defaults to 1. - `--router-kv-overlap-score-weight`: Controls the importance of prefix cache overlaps in prefill cost calculations. Higher values improve Time To First Token (TTFT) at the cost of Inter-Token Latency (ITL). When set to 0, the router ignores prefix caches and uses pure load balancing. Defaults to 1.
- `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness. - `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.
- `--no-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events). - `--no-router-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events).
- `--durable-kv-events`: Enables JetStream mode for KV event transport. Must be specified on **both** the frontend **and** all workers. When enabled, workers publish to JetStream instead of the local indexer, and the frontend consumes from JetStream as a durable consumer. Without this flag (default), workers use the local indexer with NATS Core or ZMQ event plane. - `--router-durable-kv-events`: Enables JetStream mode for KV event transport. Must be specified on **both** the frontend **and** all workers. When enabled, workers publish to JetStream instead of the local indexer, and the frontend consumes from JetStream as a durable consumer. Without this flag (default), workers use the local indexer with NATS Core or ZMQ event plane.
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly. - `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly.
- `--router-reset-states`: Only applies in JetStream mode (`--durable-kv-events`). When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate. - `--router-reset-states`: Only applies in JetStream mode (`--router-durable-kv-events`). When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate.
- `--router-snapshot-threshold`: Only applies in JetStream mode (`--durable-kv-events`). Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart. - `--router-snapshot-threshold`: Only applies in JetStream mode (`--router-durable-kv-events`). Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart.
- `--no-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management. - `--no-router-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management.
- `--track-output-blocks`: Enables tracking of output blocks during generation (default: disabled). When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward the expected output sequence length (`agent_hints.osl` in nvext). This improves load balancing accuracy for long-running generation requests by accounting for output-side KV cache growth. - `--router-track-output-blocks`: Enables tracking of output blocks during generation (default: disabled). When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward the expected output sequence length (`agent_hints.osl` in nvext). This improves load balancing accuracy for long-running generation requests by accounting for output-side KV cache growth.
- `--no-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist. - `--no-router-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist.
- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity. When set, the router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `latency_sensitivity` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. If not set (default), queueing is disabled and requests are dispatched immediately. - `--router-queue-threshold`: Queue threshold fraction for prefill token capacity. When set, the router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `latency_sensitivity` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. If not set (default), queueing is disabled and requests are dispatched immediately.
...@@ -172,31 +172,31 @@ The main KV-aware routing arguments: ...@@ -172,31 +172,31 @@ The main KV-aware routing arguments:
- `--active-prefill-tokens-threshold-frac`: Fraction of `max_num_batched_tokens` for busy detection. A worker is marked busy when `active_prefill_tokens > frac * max_num_batched_tokens`. Uses OR logic with `--active-prefill-tokens-threshold` (worker is busy if either threshold is exceeded). If not set, fractional busy detection is disabled. - `--active-prefill-tokens-threshold-frac`: Fraction of `max_num_batched_tokens` for busy detection. A worker is marked busy when `active_prefill_tokens > frac * max_num_batched_tokens`. Uses OR logic with `--active-prefill-tokens-threshold` (worker is busy if either threshold is exceeded). If not set, fractional busy detection is disabled.
- `--router-ttl`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate. - `--router-ttl-secs`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-router-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
- `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1048576 (2^20 blocks) when `--no-kv-events` is used. This prevents unbounded memory growth in long-running deployments. - `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1048576 (2^20 blocks) when `--no-router-kv-events` is used. This prevents unbounded memory growth in long-running deployments.
- `--router-prune-target-ratio`: Target size ratio to prune down to when `--router-max-tree-size` is exceeded. For example, with a value of 0.8 (default) and max tree size of 1048576, the router will prune down to approximately 838860 blocks when the threshold is exceeded. Defaults to 0.8 when `--no-kv-events` is used. This creates headroom before the next pruning cycle. - `--router-prune-target-ratio`: Target size ratio to prune down to when `--router-max-tree-size` is exceeded. For example, with a value of 0.8 (default) and max tree size of 1048576, the router will prune down to approximately 838860 blocks when the threshold is exceeded. Defaults to 0.8 when `--no-router-kv-events` is used. This creates headroom before the next pruning cycle.
- `--router-event-threads`: Number of event processing threads for the KV indexer. When set to 1 (default), the router uses a single-threaded radix tree with channel-based event processing, which supports TTL-based expiration and pruning. When set to a value greater than 1, the router uses a concurrent radix tree with a thread pool of the specified size for higher event throughput. Note: the concurrent indexer does not support TTL/pruning (`--router-ttl`, `--router-max-tree-size`, `--router-prune-target-ratio` are ignored when `--router-event-threads > 1`). Can be set via `DYN_ROUTER_EVENT_THREADS` env var. For details on the underlying index data structures (`RadixTree`, `ConcurrentRadixTree`, `PositionalIndexer`) and their concurrency model (inline reads, sticky-routed writes via thread pool), see the [KV Router Index documentation](../../../../lib/kv-router/README.md). - `--router-event-threads`: Number of event processing threads for the KV indexer. When set to 1 (default), the router uses a single-threaded radix tree with channel-based event processing, which supports TTL-based expiration and pruning. When set to a value greater than 1, the router uses a concurrent radix tree with a thread pool of the specified size for higher event throughput. Note: the concurrent indexer does not support TTL/pruning (`--router-ttl-secs`, `--router-max-tree-size`, `--router-prune-target-ratio` are ignored when `--router-event-threads > 1`). Can be set via `DYN_ROUTER_EVENT_THREADS` env var. For details on the underlying index data structures (`RadixTree`, `ConcurrentRadixTree`, `PositionalIndexer`) and their concurrency model (inline reads, sticky-routed writes via thread pool), see the [KV Router Index documentation](../../../../lib/kv-router/README.md).
>[!Note] >[!Note]
> **State persistence** depends on the event transport mode: > **State persistence** depends on the event transport mode:
> - **NATS Core / Event Plane mode** (default): State persists on workers—router rebuilds state by querying workers on startup. This is the default when workers have `local_indexer` enabled (which is the default). Works with both NATS Core and ZMQ event planes. > - **NATS Core / Event Plane mode** (default): State persists on workers—router rebuilds state by querying workers on startup. This is the default when workers have `local_indexer` enabled (which is the default). Works with both NATS Core and ZMQ event planes.
> - **JetStream mode** (`--durable-kv-events` on **both** frontend **and** workers): State persists across router restarts via JetStream and NATS object store snapshots. > - **JetStream mode** (`--router-durable-kv-events` on **both** frontend **and** workers): State persists across router restarts via JetStream and NATS object store snapshots.
> - **No KV events** (`--no-kv-events`): State persistence is not supported. > - **No KV events** (`--no-router-kv-events`): State persistence is not supported.
> >
> **Request plane is independent of KV event transport.** > **Request plane is independent of KV event transport.**
> The request plane (`DYN_REQUEST_PLANE` / `--request-plane`) controls how requests reach workers (TCP/HTTP/NATS), while KV events travel over a separate path. KV events use NATS in JetStream or NATS Core modes, or ZMQ when `--event-plane zmq` is set. With `--event-plane zmq` and `--discovery-backend file` or `mem`, the router can run entirely without etcd or NATS. When using a NATS-based event plane (the default), NATS is initialized automatically; set `NATS_SERVER=nats://...` to override the default `localhost:4222`. Use `--no-kv-events` to disable KV event transport entirely. > The request plane (`DYN_REQUEST_PLANE` / `--request-plane`) controls how requests reach workers (TCP/HTTP/NATS), while KV events travel over a separate path. KV events use NATS in JetStream or NATS Core modes, or ZMQ when `--event-plane zmq` is set. With `--event-plane zmq` and `--discovery-backend file` or `mem`, the router can run entirely without etcd or NATS. When using a NATS-based event plane (the default), NATS is initialized automatically; set `NATS_SERVER=nats://...` to override the default `localhost:4222`. Use `--no-router-kv-events` to disable KV event transport entirely.
> >
> When `--kv-overlap-score-weight` is set to 0, no KVIndexer is created and prefix matching is disabled (pure load balancing). When `--no-kv-events` is set, a KVIndexer is still created but no event subscriber is launched to consume KV events from workers. Instead, the router predicts cache state based on its own routing decisions with TTL-based expiration and pruning. > When `--router-kv-overlap-score-weight` is set to 0, no KVIndexer is created and prefix matching is disabled (pure load balancing). When `--no-router-kv-events` is set, a KVIndexer is still created but no event subscriber is launched to consume KV events from workers. Instead, the router predicts cache state based on its own routing decisions with TTL-based expiration and pruning.
> >
> **Backend Configuration:** When using `--no-kv-events`, configure your backend workers to disable KV event publishing: > **Backend Configuration:** When using `--no-router-kv-events`, configure your backend workers to disable KV event publishing:
> - **vLLM**: Use `--kv-events-config '{"enable_kv_cache_events": false}'` > - **vLLM**: Use `--kv-events-config '{"enable_kv_cache_events": false}'`
> - **SGLang**: Do not use `--kv-events-config` > - **SGLang**: Do not use `--kv-events-config`
> - **TRT-LLM**: Do not use `--publish-events-and-metrics` > - **TRT-LLM**: Do not use `--publish-events-and-metrics`
> >
> The CLI args `--router-ttl`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. When KV events are enabled (default), the router relies on worker-side eviction events and these parameters are ignored. > The CLI args `--router-ttl-secs`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. When KV events are enabled (default), the router relies on worker-side eviction events and these parameters are ignored.
> >
> **Queue threshold vs. busy rejection thresholds:** `--router-queue-threshold` and the busy thresholds (`--active-decode-blocks-threshold`, `--active-prefill-tokens-threshold`, `--active-prefill-tokens-threshold-frac`) serve different purposes. The busy thresholds **reject** a worker entirely from the candidate set when it exceeds a utilization limit — no traffic is sent until it drops below the threshold. In contrast, `--router-queue-threshold` does not reject workers; it **defers the entire routing decision** until at least one worker has capacity, so the request is routed with the freshest load metrics. The queue also enables priority scheduling via `nvext.agent_hints.latency_sensitivity`. > **Queue threshold vs. busy rejection thresholds:** `--router-queue-threshold` and the busy thresholds (`--active-decode-blocks-threshold`, `--active-prefill-tokens-threshold`, `--active-prefill-tokens-threshold-frac`) serve different purposes. The busy thresholds **reject** a worker entirely from the candidate set when it exceeds a utilization limit — no traffic is sent until it drops below the threshold. In contrast, `--router-queue-threshold` does not reject workers; it **defers the entire routing decision** until at least one worker has capacity, so the request is routed with the freshest load metrics. The queue also enables priority scheduling via `nvext.agent_hints.latency_sensitivity`.
...@@ -230,8 +230,8 @@ For custom routing logic and advanced patterns, see [Routing Patterns](router-ex ...@@ -230,8 +230,8 @@ For custom routing logic and advanced patterns, see [Routing Patterns](router-ex
### 1. Understand Your Workload Characteristics ### 1. Understand Your Workload Characteristics
- **Prefill-heavy workloads** (long prompts, short generations): Increase `kv-overlap-score-weight` - **Prefill-heavy workloads** (long prompts, short generations): Increase `--router-kv-overlap-score-weight`
- **Decode-heavy workloads** (short prompts, long generations): Decrease `kv-overlap-score-weight` - **Decode-heavy workloads** (short prompts, long generations): Decrease `--router-kv-overlap-score-weight`
### 2. Monitor Key Metrics ### 2. Monitor Key Metrics
...@@ -257,7 +257,7 @@ The `router_temperature` parameter controls routing randomness: ...@@ -257,7 +257,7 @@ The `router_temperature` parameter controls routing randomness:
1. Begin with default settings 1. Begin with default settings
2. Monitor TTFT and ITL metrics 2. Monitor TTFT and ITL metrics
3. Adjust `kv-overlap-score-weight` to meet your performance goals: 3. Adjust `--router-kv-overlap-score-weight` to meet your performance goals:
- To reduce TTFT: Increase the weight - To reduce TTFT: Increase the weight
- To reduce ITL: Decrease the weight - To reduce ITL: Decrease the weight
4. If you observe severe load imbalance, increase the temperature setting 4. If you observe severe load imbalance, increase the temperature setting
...@@ -349,7 +349,7 @@ For improved fault tolerance, you can launch multiple frontend + router replicas ...@@ -349,7 +349,7 @@ For improved fault tolerance, you can launch multiple frontend + router replicas
The KV Router tracks two types of state (see [Router Design](../../design-docs/router-design.md) for details): The KV Router tracks two types of state (see [Router Design](../../design-docs/router-design.md) for details):
1. **Prefix blocks (cached KV blocks)**: Maintained in a radix tree, tracking which blocks are cached on each worker. This state is **persistent** - in local indexer mode (default) state is rebuilt from workers on startup; in JetStream mode (`--durable-kv-events`) it is backed by JetStream events and object store snapshots. 1. **Prefix blocks (cached KV blocks)**: Maintained in a radix tree, tracking which blocks are cached on each worker. This state is **persistent** - in local indexer mode (default) state is rebuilt from workers on startup; in JetStream mode (`--router-durable-kv-events`) it is backed by JetStream events and object store snapshots.
2. **Active blocks (decoding blocks)**: Tracks blocks currently being used for active generation requests. This state is **ephemeral** - when a new router replica starts, it begins with zero active block knowledge but becomes eventually consistent as it handles requests. 2. **Active blocks (decoding blocks)**: Tracks blocks currently being used for active generation requests. This state is **ephemeral** - when a new router replica starts, it begins with zero active block knowledge but becomes eventually consistent as it handles requests.
...@@ -380,7 +380,7 @@ Persistence behavior depends on which event transport mode is active: ...@@ -380,7 +380,7 @@ Persistence behavior depends on which event transport mode is active:
- Recovery depends on workers being available; if a worker is down, its blocks cannot be recovered - Recovery depends on workers being available; if a worker is down, its blocks cannot be recovered
- Simpler infrastructure (no JetStream required) - Simpler infrastructure (no JetStream required)
**JetStream Mode** (`--durable-kv-events` on **both** frontend **and** workers)**:** **JetStream Mode** (`--router-durable-kv-events` on **both** frontend **and** workers)**:**
- Prefix blocks are stored in NATS JetStream with 1-hour retention - Prefix blocks are stored in NATS JetStream with 1-hour retention
- Snapshots saved to NATS object store at configurable thresholds - Snapshots saved to NATS object store at configurable thresholds
- New replicas automatically restore this state on startup - New replicas automatically restore this state on startup
......
...@@ -27,8 +27,8 @@ python -m dynamo.global_router \ ...@@ -27,8 +27,8 @@ python -m dynamo.global_router \
# ============================================================================ # ============================================================================
DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \ DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \
--endpoint prefill_pool_0.worker.generate \ --endpoint prefill_pool_0.worker.generate \
--block-size 16 \ --router-block-size 16 \
--no-track-active-blocks & # prefill router does not need to track active blocks --no-router-track-active-blocks & # prefill router does not need to track active blocks
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
...@@ -41,8 +41,8 @@ python -m dynamo.mocker \ ...@@ -41,8 +41,8 @@ python -m dynamo.mocker \
# ============================================================================ # ============================================================================
DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \ DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \
--endpoint prefill_pool_1.worker.generate \ --endpoint prefill_pool_1.worker.generate \
--block-size 16 \ --router-block-size 16 \
--no-track-active-blocks & # prefill router does not need to track active blocks --no-router-track-active-blocks & # prefill router does not need to track active blocks
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
...@@ -55,8 +55,8 @@ python -m dynamo.mocker \ ...@@ -55,8 +55,8 @@ python -m dynamo.mocker \
# ============================================================================ # ============================================================================
DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \ DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \
--endpoint decode_pool_0.worker.generate \ --endpoint decode_pool_0.worker.generate \
--block-size 16 \ --router-block-size 16 \
--kv-overlap-score-weight 0 & --router-kv-overlap-score-weight 0 &
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
......
...@@ -141,9 +141,9 @@ spec: ...@@ -141,9 +141,9 @@ spec:
args: args:
- --endpoint - --endpoint
- ${K8S_NAMESPACE}-prefill-pool-0.prefill.generate - ${K8S_NAMESPACE}-prefill-pool-0.prefill.generate
- --block-size - --router-block-size
- "16" - "16"
- --no-track-active-blocks - --no-router-track-active-blocks
command: command:
- python - python
- -m - -m
...@@ -206,9 +206,9 @@ spec: ...@@ -206,9 +206,9 @@ spec:
args: args:
- --endpoint - --endpoint
- ${K8S_NAMESPACE}-prefill-pool-1.prefill.generate - ${K8S_NAMESPACE}-prefill-pool-1.prefill.generate
- --block-size - --router-block-size
- "16" - "16"
- --no-track-active-blocks - --no-router-track-active-blocks
command: command:
- python - python
- -m - -m
...@@ -271,9 +271,9 @@ spec: ...@@ -271,9 +271,9 @@ spec:
args: args:
- --endpoint - --endpoint
- ${K8S_NAMESPACE}-decode-pool-0.backend.generate - ${K8S_NAMESPACE}-decode-pool-0.backend.generate
- --block-size - --router-block-size
- "16" - "16"
- --kv-overlap-score-weight - --router-kv-overlap-score-weight
- "0" - "0"
command: command:
- python - python
......
...@@ -46,7 +46,7 @@ spec: ...@@ -46,7 +46,7 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-kv-events" - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --router-kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-router-kv-events"
Planner: Planner:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: planner componentType: planner
......
...@@ -83,7 +83,7 @@ class KVRouterProcess(ManagedProcess): ...@@ -83,7 +83,7 @@ class KVRouterProcess(ManagedProcess):
) )
if durable_kv_events: if durable_kv_events:
command.append("--durable-kv-events") command.append("--router-durable-kv-events")
env = os.environ.copy() env = os.environ.copy()
env["DYN_REQUEST_PLANE"] = request_plane env["DYN_REQUEST_PLANE"] = request_plane
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment