refactor: update frontend kv-router flags to be consistent with router (#6361)

44a76f96 · jh-nv · GitHub · b2075619 · 44a76f96 · 44a76f96
Commit 44a76f96 (unverified), authored Feb 18, 2026 by jh-nv; committed by GitHub Feb 19, 2026
10 changed files
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -121,6 +121,18 @@ class FrontendArgGroup(ArgGroup):
            help="Interactive text chat.\nenv var: DYN_INTERACTIVE",
        )

+        add_argument(
+            g,
+            flag_name="--namespace",
+            env_var="DYN_NAMESPACE",
+            default=None,
+            help=(
+                "Dynamo namespace for model discovery scoping. If specified, models will "
+                "only be discovered from this namespace. If not specified, discovers models "
+                "from all namespaces (global discovery)."
+            ),
+        )
+
        add_argument(
            g,
            flag_name="--kv-cache-block-size",
@@ -172,14 +184,16 @@ class FrontendArgGroup(ArgGroup):
        )
        add_argument(
            g,
-            flag_name="--kv-overlap-score-weight",
-            env_var="DYN_KV_OVERLAP_SCORE_WEIGHT",
+            flag_name="--router-kv-overlap-score-weight",
+            env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
            default=1.0,
            help=(
                "KV Router: Weight for overlap score in worker selection. "
                "Higher values prioritize KV cache reuse."
            ),
            arg_type=float,
+            dest="kv_overlap_score_weight",
+            obsolete_flag="--kv-overlap-score-weight",
        )
        add_argument(
            g,
@@ -194,26 +208,29 @@ class FrontendArgGroup(ArgGroup):
        )
        add_negatable_bool_argument(
            g,
-            flag_name="--kv-events",
-            env_var="DYN_KV_EVENTS",
+            flag_name="--router-kv-events",
+            env_var="DYN_ROUTER_USE_KV_EVENTS",
            default=True,
            help=(
-                "KV Router: Enable/disable KV events. Use --kv-events to enable "
-                "(default, router receives cache state events from workers) or --no-kv-events "
+                "KV Router: Enable/disable KV events. Use --router-kv-events to enable "
+                "(default, router receives cache state events from workers) or --no-router-kv-events "
                "to disable (router predicts cache state based on routing decisions)."
            ),
            dest="use_kv_events",
+            obsolete_flag="--kv-events",
        )
        add_argument(
            g,
-            flag_name="--router-ttl",
-            env_var="DYN_ROUTER_TTL",
+            flag_name="--router-ttl-secs",
+            env_var="DYN_ROUTER_TTL_SECS",
            default=120.0,
            help=(
                "KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
-                "Only used when --no-kv-events is set."
+                "Only used when --no-router-kv-events is set."
            ),
            arg_type=float,
+            dest="router_ttl",
+            obsolete_flag="--router-ttl",
        )
        add_argument(
            g,
@@ -222,7 +239,7 @@ class FrontendArgGroup(ArgGroup):
            default=2**20,
            help=(
                "KV Router: Maximum tree size before pruning when KV events are disabled. "
-                "Only used when --no-kv-events is set."
+                "Only used when --no-router-kv-events is set."
            ),
            arg_type=int,
        )
@@ -233,23 +250,11 @@ class FrontendArgGroup(ArgGroup):
            default=0.8,
            help=(
                "KV Router: Target size ratio after pruning when KV events are disabled. "
-                "Only used when --no-kv-events is set."
+                "Only used when --no-router-kv-events is set."
            ),
            arg_type=float,
        )

-        add_argument(
-            g,
-            flag_name="--namespace",
-            env_var="DYN_NAMESPACE",
-            default=None,
-            help=(
-                "Dynamo namespace for model discovery scoping. If specified, models will "
-                "only be discovered from this namespace. If not specified, discovers models "
-                "from all namespaces (global discovery)."
-            ),
-        )
-
        add_negatable_bool_argument(
            g,
            flag_name="--router-replica-sync",
@@ -284,8 +289,8 @@ class FrontendArgGroup(ArgGroup):
        )
        add_negatable_bool_argument(
            g,
-            flag_name="--durable-kv-events",
-            env_var="DYN_DURABLE_KV_EVENTS",
+            flag_name="--router-durable-kv-events",
+            env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
            default=False,
            help=(
                "KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. "
@@ -293,32 +298,36 @@ class FrontendArgGroup(ArgGroup):
                "local_indexer mode. Use this flag when you need durability and multi-replica "
                "consistency. Requires NATS with JetStream enabled."
            ),
+            dest="durable_kv_events",
+            obsolete_flag="--durable-kv-events",
        )
        add_negatable_bool_argument(
            g,
-            flag_name="--track-active-blocks",
-            env_var="DYN_TRACK_ACTIVE_BLOCKS",
+            flag_name="--router-track-active-blocks",
+            env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
            default=True,
            dest="router_track_active_blocks",
            help=(
                "KV Router: Track active blocks (blocks being used for ongoing generation). "
                "By default, active blocks are tracked for load balancing. "
            ),
+            obsolete_flag="--track-active-blocks",
        )
        add_negatable_bool_argument(
            g,
-            flag_name="--assume-kv-reuse",
-            env_var="DYN_ASSUME_KV_REUSE",
+            flag_name="--router-assume-kv-reuse",
+            env_var="DYN_ROUTER_ASSUME_KV_REUSE",
            default=True,
            dest="router_assume_kv_reuse",
            help=(
                "KV Router: When tracking active blocks, assume KV cache reuse. "
-                "Use --no-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
+                "Use --no-router-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
            ),
+            obsolete_flag="--assume-kv-reuse",
        )
        add_negatable_bool_argument(
            g,
-            flag_name="--track-output-blocks",
+            flag_name="--router-track-output-blocks",
            env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
            default=False,
            dest="router_track_output_blocks",
@@ -327,6 +336,7 @@ class FrontendArgGroup(ArgGroup):
                "placeholder blocks as tokens are generated and applies fractional decay based on "
                "progress toward expected_output_tokens."
            ),
+            obsolete_flag="--track-output-blocks",
        )
        add_argument(
            g,

--- a/docs/pages/components/frontend/nvext.md
+++ b/docs/pages/components/frontend/nvext.md
@@ -81,7 +81,7 @@ When `--router-queue-threshold` is set and the queue is active, this value shift

 Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:

-1. **Output block tracking**: When `--track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`.
+1. **Output block tracking**: When `--router-track-output-blocks` is enabled, the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`.
 2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.

 ```json

--- a/docs/pages/components/router/README.md
+++ b/docs/pages/components/router/README.md
@@ -32,8 +32,8 @@ Backend workers register themselves using the `register_model` API, after which
 | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
 | `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) |
 | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
-| `--kv-events` / `--no-kv-events` | `--kv-events` | Enable/disable real-time KV event tracking |
-| `--kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
+| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
+| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |

 For all available options: `python -m dynamo.frontend --help`

@@ -71,8 +71,8 @@ All CLI arguments can be configured via environment variables using the `DYN_` p
 | `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
 | `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
 | `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
-| `--no-kv-events` | `DYN_KV_EVENTS=false` | `true` |
-| `--kv-overlap-score-weight` | `DYN_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
+| `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
+| `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |

 For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).


--- a/docs/pages/components/router/agent-hints.md
+++ b/docs/pages/components/router/agent-hints.md
@@ -49,11 +49,11 @@ A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if

 Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:

-1. **Output block tracking**: When output block tracking is enabled (frontend: `--track-output-blocks`; standalone router: `--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
+1. **Output block tracking**: When output block tracking is enabled (`--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
 2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.

 - **Type**: `u32` (optional)
- **Requires**: `--track-output-blocks` (frontend) or `--router-track-output-blocks` (standalone router) for output block tracking behavior
+- **Requires**: `--router-track-output-blocks` (frontend or standalone router) for output block tracking behavior

 ### Example


--- a/docs/pages/components/router/router-examples.md
+++ b/docs/pages/components/router/router-examples.md
@@ -135,7 +135,7 @@ spec:
          value: kv
        - name: DYN_ROUTER_TEMPERATURE
          value: "0.5"  # Add some randomness to prevent worker saturation
-        - name: DYN_KV_OVERLAP_SCORE_WEIGHT
+        - name: DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT
          value: "1.5"  # Prioritize TTFT over ITL
        - name: DYN_KV_CACHE_BLOCK_SIZE
          value: "16"

--- a/docs/pages/components/router/router-guide.md
+++ b/docs/pages/components/router/router-guide.md
--- a/examples/hierarchical_planner/run_example.sh
+++ b/examples/hierarchical_planner/run_example.sh
@@ -27,8 +27,8 @@ python -m dynamo.global_router \
 # ============================================================================
 DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \
  --endpoint prefill_pool_0.worker.generate \
-  --block-size 16 \
-  --no-track-active-blocks &  # prefill router does not need to track active blocks
+  --router-block-size 16 \
+  --no-router-track-active-blocks &  # prefill router does not need to track active blocks

 python -m dynamo.mocker \
  --model-path Qwen/Qwen3-0.6B \
@@ -41,8 +41,8 @@ python -m dynamo.mocker \
 # ============================================================================
 DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \
  --endpoint prefill_pool_1.worker.generate \
-  --block-size 16 \
-  --no-track-active-blocks &  # prefill router does not need to track active blocks
+  --router-block-size 16 \
+  --no-router-track-active-blocks &  # prefill router does not need to track active blocks

 python -m dynamo.mocker \
  --model-path Qwen/Qwen3-0.6B \
@@ -55,8 +55,8 @@ python -m dynamo.mocker \
 # ============================================================================
 DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \
  --endpoint decode_pool_0.worker.generate \
-  --block-size 16 \
-  --kv-overlap-score-weight 0 &
+  --router-block-size 16 \
+  --router-kv-overlap-score-weight 0 &

 python -m dynamo.mocker \
  --model-path Qwen/Qwen3-0.6B \

--- a/examples/hierarchical_planner/vllm-2p1d.yaml
+++ b/examples/hierarchical_planner/vllm-2p1d.yaml
@@ -141,9 +141,9 @@ spec:
          args:
          - --endpoint
          - ${K8S_NAMESPACE}-prefill-pool-0.prefill.generate
-          - --block-size
+          - --router-block-size
          - "16"
-          - --no-track-active-blocks
+          - --no-router-track-active-blocks
          command:
          - python
          - -m
@@ -206,9 +206,9 @@ spec:
          args:
          - --endpoint
          - ${K8S_NAMESPACE}-prefill-pool-1.prefill.generate
-          - --block-size
+          - --router-block-size
          - "16"
-          - --no-track-active-blocks
+          - --no-router-track-active-blocks
          command:
          - python
          - -m
@@ -271,9 +271,9 @@ spec:
          args:
          - --endpoint
          - ${K8S_NAMESPACE}-decode-pool-0.backend.generate
-          - --block-size
+          - --router-block-size
          - "16"
-          - --kv-overlap-score-weight
+          - --router-kv-overlap-score-weight
          - "0"
          command:
          - python

--- a/tests/planner/perf_test_configs/disagg_8b_planner.yaml
+++ b/tests/planner/perf_test_configs/disagg_8b_planner.yaml
@@ -46,7 +46,7 @@ spec:
            - /bin/sh
            - -c
          args:
-            - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-kv-events"
+            - "python3 -m dynamo.frontend --http-port 8000 --kv-cache-block-size 128 --router-mode kv --router-kv-overlap-score-weight 0.0 --router-temperature 0.0 --no-router-kv-events"
    Planner:
      envFromSecret: hf-token-secret
      componentType: planner

--- a/tests/router/common.py
+++ b/tests/router/common.py
@@ -83,7 +83,7 @@ class KVRouterProcess(ManagedProcess):
            )

        if durable_kv_events:
-            command.append("--durable-kv-events")
+            command.append("--router-durable-kv-events")

        env = os.environ.copy()
        env["DYN_REQUEST_PLANE"] = request_plane