Unverified commit 44a76f96, authored by jh-nv and committed by GitHub
Browse files

refactor: update frontend kv-router flags to be consistent with router (#6361)

parent b2075619
...@@ -121,6 +121,18 @@ class FrontendArgGroup(ArgGroup): ...@@ -121,6 +121,18 @@ class FrontendArgGroup(ArgGroup):
help="Interactive text chat.\nenv var: DYN_INTERACTIVE", help="Interactive text chat.\nenv var: DYN_INTERACTIVE",
) )
add_argument(
g,
flag_name="--namespace",
env_var="DYN_NAMESPACE",
default=None,
help=(
"Dynamo namespace for model discovery scoping. If specified, models will "
"only be discovered from this namespace. If not specified, discovers models "
"from all namespaces (global discovery)."
),
)
add_argument( add_argument(
g, g,
flag_name="--kv-cache-block-size", flag_name="--kv-cache-block-size",
...@@ -172,14 +184,16 @@ class FrontendArgGroup(ArgGroup): ...@@ -172,14 +184,16 @@ class FrontendArgGroup(ArgGroup):
) )
add_argument( add_argument(
g, g,
flag_name="--kv-overlap-score-weight", flag_name="--router-kv-overlap-score-weight",
env_var="DYN_KV_OVERLAP_SCORE_WEIGHT", env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
default=1.0, default=1.0,
help=( help=(
"KV Router: Weight for overlap score in worker selection. " "KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse." "Higher values prioritize KV cache reuse."
), ),
arg_type=float, arg_type=float,
dest="kv_overlap_score_weight",
obsolete_flag="--kv-overlap-score-weight",
) )
add_argument( add_argument(
g, g,
...@@ -194,26 +208,29 @@ class FrontendArgGroup(ArgGroup): ...@@ -194,26 +208,29 @@ class FrontendArgGroup(ArgGroup):
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--kv-events", flag_name="--router-kv-events",
env_var="DYN_KV_EVENTS", env_var="DYN_ROUTER_USE_KV_EVENTS",
default=True, default=True,
help=( help=(
"KV Router: Enable/disable KV events. Use --kv-events to enable " "KV Router: Enable/disable KV events. Use --router-kv-events to enable "
"(default, router receives cache state events from workers) or --no-kv-events " "(default, router receives cache state events from workers) or --no-router-kv-events "
"to disable (router predicts cache state based on routing decisions)." "to disable (router predicts cache state based on routing decisions)."
), ),
dest="use_kv_events", dest="use_kv_events",
obsolete_flag="--kv-events",
) )
add_argument( add_argument(
g, g,
flag_name="--router-ttl", flag_name="--router-ttl-secs",
env_var="DYN_ROUTER_TTL", env_var="DYN_ROUTER_TTL_SECS",
default=120.0, default=120.0,
help=( help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. " "KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-kv-events is set." "Only used when --no-router-kv-events is set."
), ),
arg_type=float, arg_type=float,
dest="router_ttl",
obsolete_flag="--router-ttl",
) )
add_argument( add_argument(
g, g,
...@@ -222,7 +239,7 @@ class FrontendArgGroup(ArgGroup): ...@@ -222,7 +239,7 @@ class FrontendArgGroup(ArgGroup):
default=2**20, default=2**20,
help=( help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. " "KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-kv-events is set." "Only used when --no-router-kv-events is set."
), ),
arg_type=int, arg_type=int,
) )
...@@ -233,23 +250,11 @@ class FrontendArgGroup(ArgGroup): ...@@ -233,23 +250,11 @@ class FrontendArgGroup(ArgGroup):
default=0.8, default=0.8,
help=( help=(
"KV Router: Target size ratio after pruning when KV events are disabled. " "KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-kv-events is set." "Only used when --no-router-kv-events is set."
), ),
arg_type=float, arg_type=float,
) )
add_argument(
g,
flag_name="--namespace",
env_var="DYN_NAMESPACE",
default=None,
help=(
"Dynamo namespace for model discovery scoping. If specified, models will "
"only be discovered from this namespace. If not specified, discovers models "
"from all namespaces (global discovery)."
),
)
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-replica-sync", flag_name="--router-replica-sync",
...@@ -284,8 +289,8 @@ class FrontendArgGroup(ArgGroup): ...@@ -284,8 +289,8 @@ class FrontendArgGroup(ArgGroup):
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--durable-kv-events", flag_name="--router-durable-kv-events",
env_var="DYN_DURABLE_KV_EVENTS", env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
default=False, default=False,
help=( help=(
"KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. " "KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. "
...@@ -293,32 +298,36 @@ class FrontendArgGroup(ArgGroup): ...@@ -293,32 +298,36 @@ class FrontendArgGroup(ArgGroup):
"local_indexer mode. Use this flag when you need durability and multi-replica " "local_indexer mode. Use this flag when you need durability and multi-replica "
"consistency. Requires NATS with JetStream enabled." "consistency. Requires NATS with JetStream enabled."
), ),
dest="durable_kv_events",
obsolete_flag="--durable-kv-events",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--track-active-blocks", flag_name="--router-track-active-blocks",
env_var="DYN_TRACK_ACTIVE_BLOCKS", env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
default=True, default=True,
dest="router_track_active_blocks", dest="router_track_active_blocks",
help=( help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). " "KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing. " "By default, active blocks are tracked for load balancing. "
), ),
obsolete_flag="--track-active-blocks",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--assume-kv-reuse", flag_name="--router-assume-kv-reuse",
env_var="DYN_ASSUME_KV_REUSE", env_var="DYN_ROUTER_ASSUME_KV_REUSE",
default=True, default=True,
dest="router_assume_kv_reuse", dest="router_assume_kv_reuse",
help=( help=(
"KV Router: When tracking active blocks, assume KV cache reuse. " "KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)." "Use --no-router-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
), ),
obsolete_flag="--assume-kv-reuse",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--track-output-blocks", flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS", env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False, default=False,
dest="router_track_output_blocks", dest="router_track_output_blocks",
...@@ -327,6 +336,7 @@ class FrontendArgGroup(ArgGroup): ...@@ -327,6 +336,7 @@ class FrontendArgGroup(ArgGroup):
"placeholder blocks as tokens are generated and applies fractional decay based on " "placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected_output_tokens." "progress toward expected_output_tokens."
), ),
obsolete_flag="--track-output-blocks",
) )
add_argument( add_argument(
g, g,
......
...@@ -81,7 +81,7 @@ When `--router-queue-threshold` is set and the queue is active, this value shift ...@@ -81,7 +81,7 @@ When `--router-queue-threshold` is set and the queue is active, this value shift
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways: Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When `--track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. 1. **Output block tracking**: When `--router-track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions. 2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
```json ```json
......
...@@ -32,8 +32,8 @@ Backend workers register themselves using the `register_model` API, after which ...@@ -32,8 +32,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing | | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) | | `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) |
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) | | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--kv-events` / `--no-kv-events` | `--kv-events` | Enable/disable real-time KV event tracking | | `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) | | `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT). |
For all available options: `python -m dynamo.frontend --help` For all available options: `python -m dynamo.frontend --help`
...@@ -71,8 +71,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p ...@@ -71,8 +71,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p
| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` | | `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
| `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` | | `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
| `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific | | `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
| `--no-kv-events` | `DYN_KV_EVENTS=false` | `true` | | `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
| `--kv-overlap-score-weight` | `DYN_KV_OVERLAP_SCORE_WEIGHT` | `1.0` | | `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples). For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).
......
...@@ -49,11 +49,11 @@ A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if ...@@ -49,11 +49,11 @@ A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways: Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When output block tracking is enabled (frontend: `--track-output-blocks`; standalone router: `--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests. 1. **Output block tracking**: When output block tracking is enabled (`--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions. 2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
- **Type**: `u32` (optional) - **Type**: `u32` (optional)
- **Requires**: `--track-output-blocks` (frontend) or `--router-track-output-blocks` (standalone router) for output block tracking behavior - **Requires**: `--router-track-output-blocks` (frontend or standalone router) for output block tracking behavior
### Example ### Example
......
...@@ -135,7 +135,7 @@ spec: ...@@ -135,7 +135,7 @@ spec:
value: kv value: kv
- name: DYN_ROUTER_TEMPERATURE - name: DYN_ROUTER_TEMPERATURE
value: "0.5" # Add some randomness to prevent worker saturation value: "0.5" # Add some randomness to prevent worker saturation
- name: DYN_KV_OVERLAP_SCORE_WEIGHT - name: DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT
value: "1.5" # Prioritize TTFT over ITL value: "1.5" # Prioritize TTFT over ITL
- name: DYN_KV_CACHE_BLOCK_SIZE - name: DYN_KV_CACHE_BLOCK_SIZE
value: "16" value: "16"
......
...@@ -27,8 +27,8 @@ python -m dynamo.global_router \ ...@@ -27,8 +27,8 @@ python -m dynamo.global_router \
# ============================================================================ # ============================================================================
DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \ DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \
--endpoint prefill_pool_0.worker.generate \ --endpoint prefill_pool_0.worker.generate \
--block-size 16 \ --router-block-size 16 \
--no-track-active-blocks & # prefill router does not need to track active blocks --no-router-track-active-blocks & # prefill router does not need to track active blocks
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
...@@ -41,8 +41,8 @@ python -m dynamo.mocker \ ...@@ -41,8 +41,8 @@ python -m dynamo.mocker \
# ============================================================================ # ============================================================================
DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \ DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \
--endpoint prefill_pool_1.worker.generate \ --endpoint prefill_pool_1.worker.generate \
--block-size 16 \ --router-block-size 16 \
--no-track-active-blocks & # prefill router does not need to track active blocks --no-router-track-active-blocks & # prefill router does not need to track active blocks
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
...@@ -55,8 +55,8 @@ python -m dynamo.mocker \ ...@@ -55,8 +55,8 @@ python -m dynamo.mocker \
# ============================================================================ # ============================================================================
DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \ DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \
--endpoint decode_pool_0.worker.generate \ --endpoint decode_pool_0.worker.generate \
--block-size 16 \ --router-block-size 16 \
--kv-overlap-score-weight 0 & --router-kv-overlap-score-weight 0 &
python -m dynamo.mocker \ python -m dynamo.mocker \
--model-path Qwen/Qwen3-0.6B \ --model-path Qwen/Qwen3-0.6B \
......
...@@ -141,9 +141,9 @@ spec: ...@@ -141,9 +141,9 @@ spec:
args: args:
- --endpoint - --endpoint
- ${K8S_NAMESPACE}-prefill-pool-0.prefill.generate - ${K8S_NAMESPACE}-prefill-pool-0.prefill.generate
- --block-size - --router-block-size
- "16" - "16"
- --no-track-active-blocks - --no-router-track-active-blocks
command: command:
- python - python
- -m - -m
...@@ -206,9 +206,9 @@ spec: ...@@ -206,9 +206,9 @@ spec:
args: args:
- --endpoint - --endpoint
- ${K8S_NAMESPACE}-prefill-pool-1.prefill.generate - ${K8S_NAMESPACE}-prefill-pool-1.prefill.generate
- --block-size - --router-block-size
- "16" - "16"
- --no-track-active-blocks - --no-router-track-active-blocks
command: command:
- python - python
- -m - -m
...@@ -271,9 +271,9 @@ spec: ...@@ -271,9 +271,9 @@ spec:
args: args:
- --endpoint - --endpoint
- ${K8S_NAMESPACE}-decode-pool-0.backend.generate - ${K8S_NAMESPACE}-decode-pool-0.backend.generate
- --block-size - --router-block-size
- "16" - "16"
- --kv-overlap-score-weight - --router-kv-overlap-score-weight
- "0" - "0"
command: command:
- python - python
......
...@@ -46,7 +46,7 @@ spec: ...@@ -46,7 +46,7 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
args: args:
- "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-kv-events" - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --router-kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-router-kv-events"
Planner: Planner:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: planner componentType: planner
......
...@@ -83,7 +83,7 @@ class KVRouterProcess(ManagedProcess): ...@@ -83,7 +83,7 @@ class KVRouterProcess(ManagedProcess):
) )
if durable_kv_events: if durable_kv_events:
command.append("--durable-kv-events") command.append("--router-durable-kv-events")
env = os.environ.copy() env = os.environ.copy()
env["DYN_REQUEST_PLANE"] = request_plane env["DYN_REQUEST_PLANE"] = request_plane
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment