Unverified commit 44a76f96, authored by jh-nv, committed by GitHub
Browse files

refactor: update frontend kv-router flags to be consistent with router (#6361)

parent b2075619
......@@ -121,6 +121,18 @@ class FrontendArgGroup(ArgGroup):
help="Interactive text chat.\nenv var: DYN_INTERACTIVE",
)
add_argument(
g,
flag_name="--namespace",
env_var="DYN_NAMESPACE",
default=None,
help=(
"Dynamo namespace for model discovery scoping. If specified, models will "
"only be discovered from this namespace. If not specified, discovers models "
"from all namespaces (global discovery)."
),
)
add_argument(
g,
flag_name="--kv-cache-block-size",
......@@ -172,14 +184,16 @@ class FrontendArgGroup(ArgGroup):
)
add_argument(
g,
flag_name="--kv-overlap-score-weight",
env_var="DYN_KV_OVERLAP_SCORE_WEIGHT",
flag_name="--router-kv-overlap-score-weight",
env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
default=1.0,
help=(
"KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse."
),
arg_type=float,
dest="kv_overlap_score_weight",
obsolete_flag="--kv-overlap-score-weight",
)
add_argument(
g,
......@@ -194,26 +208,29 @@ class FrontendArgGroup(ArgGroup):
)
add_negatable_bool_argument(
g,
flag_name="--kv-events",
env_var="DYN_KV_EVENTS",
flag_name="--router-kv-events",
env_var="DYN_ROUTER_USE_KV_EVENTS",
default=True,
help=(
"KV Router: Enable/disable KV events. Use --kv-events to enable "
"(default, router receives cache state events from workers) or --no-kv-events "
"KV Router: Enable/disable KV events. Use --router-kv-events to enable "
"(default, router receives cache state events from workers) or --no-router-kv-events "
"to disable (router predicts cache state based on routing decisions)."
),
dest="use_kv_events",
obsolete_flag="--kv-events",
)
add_argument(
g,
flag_name="--router-ttl",
env_var="DYN_ROUTER_TTL",
flag_name="--router-ttl-secs",
env_var="DYN_ROUTER_TTL_SECS",
default=120.0,
help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-kv-events is set."
"Only used when --no-router-kv-events is set."
),
arg_type=float,
dest="router_ttl",
obsolete_flag="--router-ttl",
)
add_argument(
g,
......@@ -222,7 +239,7 @@ class FrontendArgGroup(ArgGroup):
default=2**20,
help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-kv-events is set."
"Only used when --no-router-kv-events is set."
),
arg_type=int,
)
......@@ -233,23 +250,11 @@ class FrontendArgGroup(ArgGroup):
default=0.8,
help=(
"KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-kv-events is set."
"Only used when --no-router-kv-events is set."
),
arg_type=float,
)
add_argument(
g,
flag_name="--namespace",
env_var="DYN_NAMESPACE",
default=None,
help=(
"Dynamo namespace for model discovery scoping. If specified, models will "
"only be discovered from this namespace. If not specified, discovers models "
"from all namespaces (global discovery)."
),
)
add_negatable_bool_argument(
g,
flag_name="--router-replica-sync",
......@@ -284,8 +289,8 @@ class FrontendArgGroup(ArgGroup):
)
add_negatable_bool_argument(
g,
flag_name="--durable-kv-events",
env_var="DYN_DURABLE_KV_EVENTS",
flag_name="--router-durable-kv-events",
env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
default=False,
help=(
"KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. "
......@@ -293,32 +298,36 @@ class FrontendArgGroup(ArgGroup):
"local_indexer mode. Use this flag when you need durability and multi-replica "
"consistency. Requires NATS with JetStream enabled."
),
dest="durable_kv_events",
obsolete_flag="--durable-kv-events",
)
add_negatable_bool_argument(
g,
flag_name="--track-active-blocks",
env_var="DYN_TRACK_ACTIVE_BLOCKS",
flag_name="--router-track-active-blocks",
env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
default=True,
dest="router_track_active_blocks",
help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing. "
),
obsolete_flag="--track-active-blocks",
)
add_negatable_bool_argument(
g,
flag_name="--assume-kv-reuse",
env_var="DYN_ASSUME_KV_REUSE",
flag_name="--router-assume-kv-reuse",
env_var="DYN_ROUTER_ASSUME_KV_REUSE",
default=True,
dest="router_assume_kv_reuse",
help=(
"KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
"Use --no-router-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
),
obsolete_flag="--assume-kv-reuse",
)
add_negatable_bool_argument(
g,
flag_name="--track-output-blocks",
flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False,
dest="router_track_output_blocks",
......@@ -327,6 +336,7 @@ class FrontendArgGroup(ArgGroup):
"placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected_output_tokens."
),
obsolete_flag="--track-output-blocks",
)
add_argument(
g,
......
......@@ -81,7 +81,7 @@ When `--router-queue-threshold` is set and the queue is active, this value shift
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When `--track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`.
1. **Output block tracking**: When `--router-track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
```json
......
......@@ -32,8 +32,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) |
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--kv-events` / `--no-kv-events` | `--kv-events` | Enable/disable real-time KV event tracking |
| `--kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT). |
For all available options: `python -m dynamo.frontend --help`
......@@ -71,8 +71,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p
| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
| `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
| `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
| `--no-kv-events` | `DYN_KV_EVENTS=false` | `true` |
| `--kv-overlap-score-weight` | `DYN_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
| `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
| `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).
......
......@@ -49,11 +49,11 @@ A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When output block tracking is enabled (frontend: `--track-output-blocks`; standalone router: `--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
1. **Output block tracking**: When output block tracking is enabled (`--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
- **Type**: `u32` (optional)
- **Requires**: `--track-output-blocks` (frontend) or `--router-track-output-blocks` (standalone router) for output block tracking behavior
- **Requires**: `--router-track-output-blocks` (frontend or standalone router) for output block tracking behavior
### Example
......
......@@ -135,7 +135,7 @@ spec:
value: kv
- name: DYN_ROUTER_TEMPERATURE
value: "0.5" # Add some randomness to prevent worker saturation
- name: DYN_KV_OVERLAP_SCORE_WEIGHT
- name: DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT
value: "1.5" # Prioritize TTFT over ITL
- name: DYN_KV_CACHE_BLOCK_SIZE
value: "16"
......
......@@ -27,8 +27,8 @@ python -m dynamo.global_router \
# ============================================================================
DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \
--endpoint prefill_pool_0.worker.generate \
--block-size 16 \
--no-track-active-blocks & # prefill router does not need to track active blocks
--router-block-size 16 \
--no-router-track-active-blocks & # prefill router does not need to track active blocks
python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \
......@@ -41,8 +41,8 @@ python -m dynamo.mocker \
# ============================================================================
DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \
--endpoint prefill_pool_1.worker.generate \
--block-size 16 \
--no-track-active-blocks & # prefill router does not need to track active blocks
--router-block-size 16 \
--no-router-track-active-blocks & # prefill router does not need to track active blocks
python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \
......@@ -55,8 +55,8 @@ python -m dynamo.mocker \
# ============================================================================
DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \
--endpoint decode_pool_0.worker.generate \
--block-size 16 \
--kv-overlap-score-weight 0 &
--router-block-size 16 \
--router-kv-overlap-score-weight 0 &
python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \
......
......@@ -141,9 +141,9 @@ spec:
args:
- --endpoint
- ${K8S_NAMESPACE}-prefill-pool-0.prefill.generate
- --block-size
- --router-block-size
- "16"
- --no-track-active-blocks
- --no-router-track-active-blocks
command:
- python
- -m
......@@ -206,9 +206,9 @@ spec:
args:
- --endpoint
- ${K8S_NAMESPACE}-prefill-pool-1.prefill.generate
- --block-size
- --router-block-size
- "16"
- --no-track-active-blocks
- --no-router-track-active-blocks
command:
- python
- -m
......@@ -271,9 +271,9 @@ spec:
args:
- --endpoint
- ${K8S_NAMESPACE}-decode-pool-0.backend.generate
- --block-size
- --router-block-size
- "16"
- --kv-overlap-score-weight
- --router-kv-overlap-score-weight
- "0"
command:
- python
......
......@@ -46,7 +46,7 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-kv-events"
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --router-kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-router-kv-events"
Planner:
envFromSecret: hf-token-secret
componentType: planner
......
......@@ -83,7 +83,7 @@ class KVRouterProcess(ManagedProcess):
)
if durable_kv_events:
command.append("--durable-kv-events")
command.append("--router-durable-kv-events")
env = os.environ.copy()
env["DYN_REQUEST_PLANE"] = request_plane
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment