chore(kv-router): deduplicate KvRouterConfig args into shared ArgGroup (#6805)

Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

chore(kv-router): deduplicate KvRouterConfig args into shared ArgGroup (#6805)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
9f3b7b33 · Yan Ru Pei · GitHub · 0da960f1 · 9f3b7b33 · 9f3b7b33
Unverified Commit 9f3b7b33 authored Mar 03, 2026 by Yan Ru Pei Committed by GitHub Mar 03, 2026
13 changed files
--- a/components/src/dynamo/common/configuration/groups/__init__.py
+++ b/components/src/dynamo/common/configuration/groups/__init__.py
@@ -3,6 +3,12 @@
 """ArgGroup implementations for different configuration domains."""
+from .kv_router_args import KvRouterArgGroup, KvRouterConfigBase
 from .runtime_args import DynamoRuntimeArgGroup, DynamoRuntimeConfig
-__all__ = ["DynamoRuntimeArgGroup", "DynamoRuntimeConfig"]
+__all__ = [
+    "DynamoRuntimeArgGroup",
+    "DynamoRuntimeConfig",
+    "KvRouterArgGroup",
+    "KvRouterConfigBase",
+]
--- a/components/src/dynamo/router/backend_args.py
+++ b/components/src/dynamo/router/backend_args.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-"""Dynamo standalone router configuration ArgGroup."""
+"""Shared KV router configuration ArgGroup.
+Defines the 16 KvRouterConfig parameters once so that both
+``dynamo.frontend`` and ``dynamo.router`` can reuse them without duplication.
+Field names on ``KvRouterConfigBase`` match the ``KvRouterConfig`` Python
+constructor kwargs 1:1, so ``kv_router_kwargs()`` returns a dict that can be
+unpacked directly into ``KvRouterConfig(**config.kv_router_kwargs())``.
+"""
+from typing import Optional
 from dynamo.common.configuration.arg_group import ArgGroup
 from dynamo.common.configuration.config_base import ConfigBase
 from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
+# Authoritative field list — used by kv_router_kwargs() to extract values.
+_KV_ROUTER_FIELDS: tuple[str, ...] = (
+    "overlap_score_weight",
+    "router_temperature",
+    "use_kv_events",
+    "durable_kv_events",
+    "router_replica_sync",
+    "router_track_active_blocks",
+    "router_track_output_blocks",
+    "router_assume_kv_reuse",
+    "router_snapshot_threshold",
+    "router_reset_states",
+    "router_ttl_secs",
+    "router_max_tree_size",
+    "router_prune_target_ratio",
+    "router_queue_threshold",
+    "router_event_threads",
+    "router_enable_cache_control",
+)
-class DynamoRouterConfig(ConfigBase):
+class KvRouterConfigBase(ConfigBase):
-    """Typed configuration for the standalone KV router (router-owned options only)."""
+    """Mixin carrying the 16 KvRouterConfig fields."""
-    namespace: str
+    overlap_score_weight: float
-    endpoint: str
-    router_block_size: int
-    router_kv_overlap_score_weight: float
    router_temperature: float
-    router_use_kv_events: bool
+    use_kv_events: bool
+    durable_kv_events: bool
    router_replica_sync: bool
-    router_snapshot_threshold: int
-    router_reset_states: bool
-    router_durable_kv_events: bool
    router_track_active_blocks: bool
-    router_assume_kv_reuse: bool
    router_track_output_blocks: bool
+    router_assume_kv_reuse: bool
+    router_snapshot_threshold: int
+    router_reset_states: bool
    router_ttl_secs: float
    router_max_tree_size: int
    router_prune_target_ratio: float
+    router_queue_threshold: Optional[float]
    router_event_threads: int
+    router_enable_cache_control: bool
-    def validate(self) -> None:
+    def kv_router_kwargs(self) -> dict:
-        """Validate config invariants (aligned with Rust KvRouterConfig where applicable)."""
+        """Return a dict suitable for ``KvRouterConfig(**kwargs)``."""
-        if not self.endpoint:
+        return {f: getattr(self, f) for f in _KV_ROUTER_FIELDS}
-            raise ValueError(
-                "endpoint is required (set --endpoint or DYN_ROUTER_ENDPOINT)"
-            )
-        parts = self.endpoint.split(".")
-        if len(parts) != 3:
-            raise ValueError(
-                f"Invalid endpoint format: {self.endpoint!r}. "
-                "Expected format: namespace.component.endpoint"
-            )
-        self.namespace = parts[0]
-class DynamoRouterArgGroup(ArgGroup):
-    """CLI argument group for standalone router options."""
-    name = "dynamo-router"
+class KvRouterArgGroup(ArgGroup):
+    """CLI arguments for the 16 KvRouterConfig parameters."""
    def add_arguments(self, parser) -> None:
-        """Add router-owned arguments to parser."""
+        g = parser.add_argument_group("KV Router Options")
-        g = parser.add_argument_group("Dynamo Router Options")
-        add_argument(
-            g,
-            flag_name="--endpoint",
-            env_var="DYN_ROUTER_ENDPOINT",
-            default=None,
-            help="Full endpoint path for workers in the format namespace.component.endpoint (e.g., dynamo.prefill.generate for prefill workers)",
-            arg_type=str,
-        )
-        add_argument(
-            g,
-            flag_name="--router-block-size",
-            env_var="DYN_ROUTER_BLOCK_SIZE",
-            default=128,
-            help="KV cache block size for routing decisions",
-            arg_type=int,
-            obsolete_flag="--block-size",
-        )
        add_argument(
            g,
            flag_name="--router-kv-overlap-score-weight",
            env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
            default=1.0,
-            help="KV Router: Weight for overlap score in worker selection. Higher values prioritize KV cache reuse",
+            help=(
+                "KV Router: Weight for overlap score in worker selection. "
+                "Higher values prioritize KV cache reuse."
+            ),
            arg_type=float,
+            dest="overlap_score_weight",
            obsolete_flag="--kv-overlap-score-weight",
        )
        add_argument(
            g,
            flag_name="--router-temperature",
            env_var="DYN_ROUTER_TEMPERATURE",
            default=0.0,
-            help="KV Router: Temperature for worker sampling via softmax. Higher values promote more randomness, and 0 fallbacks to deterministic.",
+            help=(
+                "KV Router: Temperature for worker sampling via softmax. Higher values "
+                "promote more randomness, and 0 fallbacks to deterministic."
+            ),
            arg_type=float,
        )
        add_negatable_bool_argument(
            g,
            flag_name="--router-kv-events",
            env_var="DYN_ROUTER_USE_KV_EVENTS",
            default=True,
-            help="KV Router: Enable KV events from workers. When disabled (--no-router-kv-events), the router predicts cache state based on routing decisions with TTL-based expiration and pruning, rather than receiving events from workers.",
+            help=(
-            dest="router_use_kv_events",
+                "KV Router: Enable/disable KV events. Use --router-kv-events to enable "
+                "(default, router receives cache state events from workers) or --no-router-kv-events "
+                "to disable (router predicts cache state based on routing decisions)."
+            ),
+            dest="use_kv_events",
            obsolete_flag="--kv-events",
        )
        add_negatable_bool_argument(
            g,
-            flag_name="--router-replica-sync",
+            flag_name="--router-durable-kv-events",
-            env_var="DYN_ROUTER_REPLICA_SYNC",
+            env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
-            default=False,
-            help="KV Router: Enable replica synchronization across multiple router instances. When true, routers will publish and subscribe to events to maintain consistent state.",
-        )
-        add_argument(
-            g,
-            flag_name="--router-snapshot-threshold",
-            env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
-            default=1000000,
-            help="KV Router: Number of messages in stream before triggering a snapshot",
-            arg_type=int,
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-reset-states",
-            env_var="DYN_ROUTER_RESET_STATES",
            default=False,
-            help="KV Router: Reset router state on startup, purging stream and object store. WARNING: Can affect existing router replicas.",
+            help=(
+                "[Deprecated] KV Router: Enable durable KV events using NATS JetStream. "
+                "This option will be removed in a future release. The event-plane subscriber "
+                "(local_indexer mode) is now the recommended path."
+            ),
+            dest="durable_kv_events",
+            obsolete_flag="--durable-kv-events",
        )
        add_negatable_bool_argument(
            g,
-            flag_name="--router-durable-kv-events",
+            flag_name="--router-replica-sync",
-            env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
+            env_var="DYN_ROUTER_REPLICA_SYNC",
            default=False,
-            help="[Deprecated] KV Router: Enable durable KV events using NATS JetStream. This option will be removed in a future release. The event-plane subscriber (local_indexer mode) is now the recommended path.",
+            help=(
-            obsolete_flag="--durable-kv-events",
+                "KV Router: Enable replica synchronization across multiple router instances. "
+                "When true, routers will publish and subscribe to events to maintain "
+                "consistent state."
+            ),
        )
        add_negatable_bool_argument(
            g,
            flag_name="--router-track-active-blocks",
            env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
            default=True,
-            help="KV Router: Track active blocks for load balancing. Use --no-router-track-active-blocks to disable",
+            dest="router_track_active_blocks",
+            help=(
+                "KV Router: Track active blocks (blocks being used for ongoing generation). "
+                "By default, active blocks are tracked for load balancing."
+            ),
            obsolete_flag="--track-active-blocks",
        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--router-track-output-blocks",
+            env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
+            default=False,
+            dest="router_track_output_blocks",
+            help=(
+                "KV Router: Track output blocks during generation. When enabled, the router adds "
+                "placeholder blocks as tokens are generated and applies fractional decay based on "
+                "progress toward expected output sequence length."
+            ),
+            obsolete_flag="--track-output-blocks",
+        )
        add_negatable_bool_argument(
            g,
            flag_name="--router-assume-kv-reuse",
            env_var="DYN_ROUTER_ASSUME_KV_REUSE",
            default=True,
-            help="KV Router: When tracking active blocks, assume KV cache reuse. Use --no-router-assume-kv-reuse to use random hashes, useful when KV cache reuse is not expected.",
+            dest="router_assume_kv_reuse",
+            help=(
+                "KV Router: When tracking active blocks, assume KV cache reuse. "
+                "Use --no-router-assume-kv-reuse to generate random hashes instead "
+                "(when KV cache reuse is not expected)."
+            ),
            obsolete_flag="--assume-kv-reuse",
        )
+        add_argument(
+            g,
+            flag_name="--router-snapshot-threshold",
+            env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
+            default=1000000,
+            help="KV Router: Number of messages in stream before triggering a snapshot.",
+            arg_type=int,
+        )
        add_negatable_bool_argument(
            g,
-            flag_name="--router-track-output-blocks",
+            flag_name="--router-reset-states",
-            env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
+            env_var="DYN_ROUTER_RESET_STATES",
            default=False,
-            help="KV Router: Track output blocks during generation. When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward expected output sequence length (agent_hints.osl in nvext).",
+            help=(
-            obsolete_flag="--track-output-blocks",
+                "KV Router: Reset router state on startup, purging stream and object store. "
+                "WARNING: This can affect existing router replicas."
+            ),
        )
        add_argument(
            g,
            flag_name="--router-ttl-secs",
            env_var="DYN_ROUTER_TTL_SECS",
            default=120.0,
-            help="KV Router: TTL for blocks in seconds. Only used when --no-router-kv-events is set.  Controls how long cached blocks are considered valid without explicit events.",
+            help=(
+                "KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
+                "Only used when --no-router-kv-events is set."
+            ),
            arg_type=float,
        )
        add_argument(
            g,
            flag_name="--router-max-tree-size",
            env_var="DYN_ROUTER_MAX_TREE_SIZE",
            default=2**20,
-            help="KV Router: Maximum tree size before pruning. Only used when --no-router-kv-events is set.  When the indexer tree exceeds this size, pruning is triggered.",
+            help=(
+                "KV Router: Maximum tree size before pruning when KV events are disabled. "
+                "Only used when --no-router-kv-events is set."
+            ),
            arg_type=int,
        )
        add_argument(
            g,
            flag_name="--router-prune-target-ratio",
            env_var="DYN_ROUTER_PRUNE_TARGET_RATIO",
            default=0.8,
-            help="KV Router: Target size ratio after pruning (0.0-1.0). Only used when --no-router-kv-events is set. Determines how aggressively to prune the tree.",
+            help=(
+                "KV Router: Target size ratio after pruning when KV events are disabled. "
+                "Only used when --no-router-kv-events is set."
+            ),
+            arg_type=float,
+        )
+        add_argument(
+            g,
+            flag_name="--router-queue-threshold",
+            env_var="DYN_ROUTER_QUEUE_THRESHOLD",
+            default=None,
+            help=(
+                "KV Router: Queue threshold fraction for prefill token capacity. "
+                "When set, requests are queued if all workers exceed this fraction of "
+                "max_num_batched_tokens. Must be > 0. If not set, queueing is disabled."
+            ),
            arg_type=float,
        )
        add_argument(
            g,
            flag_name="--router-event-threads",
            env_var="DYN_ROUTER_EVENT_THREADS",
            default=4,
-            help="KV Router: Number of event processing threads. >1 uses concurrent radix tree and thread pool for higher throughput. Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning).",
+            help=(
+                "KV Router: Number of event processing threads. When > 1, uses a concurrent "
+                "radix tree with a thread pool for higher throughput. Ignored when "
+                "--no-router-kv-events is set."
+            ),
            arg_type=int,
        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--enable-cache-control",
+            env_var="DYN_ENABLE_CACHE_CONTROL",
+            default=False,
+            dest="router_enable_cache_control",
+            help=(
+                "KV Router: Enable cache control (PIN with TTL). When set, the router creates "
+                "a cache_control service mesh client and fires pin_prefix after generation for "
+                "requests with nvext.cache_control."
+            ),
+        )
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -8,7 +8,10 @@ from typing import Any, Dict, Optional
 from dynamo.common.config_dump import register_encoder
 from dynamo.common.configuration.arg_group import ArgGroup
-from dynamo.common.configuration.config_base import ConfigBase
+from dynamo.common.configuration.groups.kv_router_args import (
+    KvRouterArgGroup,
+    KvRouterConfigBase,
+)
 from dynamo.common.configuration.utils import (
    add_argument,
    add_negatable_bool_argument,
@@ -36,7 +39,7 @@ def validate_model_path(value: str) -> str:
    return value
-class FrontendConfig(ConfigBase):
+class FrontendConfig(KvRouterConfigBase):
    """Configuration for the Dynamo frontend."""
    interactive: bool
@@ -47,24 +50,8 @@ class FrontendConfig(ConfigBase):
    tls_key_path: Optional[pathlib.Path]
    router_mode: str
-    kv_overlap_score_weight: float
-    router_temperature: float
-    use_kv_events: bool
-    router_ttl: float
-    router_max_tree_size: int
-    router_prune_target_ratio: float
    namespace: Optional[str] = None
    namespace_prefix: Optional[str] = None
-    router_replica_sync: bool
-    router_snapshot_threshold: int
-    router_reset_states: bool
-    durable_kv_events: bool
-    router_track_active_blocks: bool
-    router_assume_kv_reuse: bool
-    router_track_output_blocks: bool
-    router_event_threads: int
-    router_queue_threshold: Optional[float]
-    router_enable_cache_control: bool
    decode_fallback: bool
    migration_limit: int
@@ -186,78 +173,9 @@ class FrontendArgGroup(ArgGroup):
            help="How to route the request.",
            choices=["round-robin", "random", "kv", "direct"],
        )
-        add_argument(
-            g,
+        # KV router options (shared with dynamo.router)
-            flag_name="--router-kv-overlap-score-weight",
+        KvRouterArgGroup().add_arguments(parser)
-            env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
-            default=1.0,
-            help=(
-                "KV Router: Weight for overlap score in worker selection. "
-                "Higher values prioritize KV cache reuse."
-            ),
-            arg_type=float,
-            dest="kv_overlap_score_weight",
-            obsolete_flag="--kv-overlap-score-weight",
-        )
-        add_argument(
-            g,
-            flag_name="--router-temperature",
-            env_var="DYN_ROUTER_TEMPERATURE",
-            default=0.0,
-            help=(
-                "KV Router: Temperature for worker sampling via softmax. Higher values "
-                "promote more randomness, and 0 fallbacks to deterministic."
-            ),
-            arg_type=float,
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-kv-events",
-            env_var="DYN_ROUTER_USE_KV_EVENTS",
-            default=True,
-            help=(
-                "KV Router: Enable/disable KV events. Use --router-kv-events to enable "
-                "(default, router receives cache state events from workers) or --no-router-kv-events "
-                "to disable (router predicts cache state based on routing decisions)."
-            ),
-            dest="use_kv_events",
-            obsolete_flag="--kv-events",
-        )
-        add_argument(
-            g,
-            flag_name="--router-ttl-secs",
-            env_var="DYN_ROUTER_TTL_SECS",
-            default=120.0,
-            help=(
-                "KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
-                "Only used when --no-router-kv-events is set."
-            ),
-            arg_type=float,
-            dest="router_ttl",
-            obsolete_flag="--router-ttl",
-        )
-        add_argument(
-            g,
-            flag_name="--router-max-tree-size",
-            env_var="DYN_ROUTER_MAX_TREE_SIZE",
-            default=2**20,
-            help=(
-                "KV Router: Maximum tree size before pruning when KV events are disabled. "
-                "Only used when --no-router-kv-events is set."
-            ),
-            arg_type=int,
-        )
-        add_argument(
-            g,
-            flag_name="--router-prune-target-ratio",
-            env_var="DYN_ROUTER_PRUNE_TARGET_RATIO",
-            default=0.8,
-            help=(
-                "KV Router: Target size ratio after pruning when KV events are disabled. "
-                "Only used when --no-router-kv-events is set."
-            ),
-            arg_type=float,
-        )
        add_argument(
            g,
@@ -271,124 +189,6 @@ class FrontendArgGroup(ArgGroup):
            ),
        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-replica-sync",
-            env_var="DYN_ROUTER_REPLICA_SYNC",
-            default=False,
-            help=(
-                "KV Router: Enable replica synchronization across multiple router instances. "
-                "When true, routers will publish and subscribe to events to maintain "
-                "consistent state."
-            ),
-        )
-        add_argument(
-            g,
-            flag_name="--router-snapshot-threshold",
-            env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
-            default=1000000,
-            help=(
-                "KV Router: Number of messages in stream before triggering a snapshot. "
-            ),
-            arg_type=int,
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-reset-states",
-            env_var="DYN_ROUTER_RESET_STATES",
-            default=False,
-            help=(
-                "KV Router: Reset router state on startup, purging stream and object store. "
-                "By default, states are persisted. WARNING: This can affect existing router "
-                "replicas."
-            ),
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-durable-kv-events",
-            env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
-            default=False,
-            help=(
-                "[Deprecated] KV Router: Enable durable KV events using NATS JetStream. "
-                "This option will be removed in a future release. The event-plane subscriber "
-                "(local_indexer mode) is now the recommended path."
-            ),
-            dest="durable_kv_events",
-            obsolete_flag="--durable-kv-events",
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-track-active-blocks",
-            env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
-            default=True,
-            dest="router_track_active_blocks",
-            help=(
-                "KV Router: Track active blocks (blocks being used for ongoing generation). "
-                "By default, active blocks are tracked for load balancing. "
-            ),
-            obsolete_flag="--track-active-blocks",
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-assume-kv-reuse",
-            env_var="DYN_ROUTER_ASSUME_KV_REUSE",
-            default=True,
-            dest="router_assume_kv_reuse",
-            help=(
-                "KV Router: When tracking active blocks, assume KV cache reuse. "
-                "Use --no-router-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
-            ),
-            obsolete_flag="--assume-kv-reuse",
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--router-track-output-blocks",
-            env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
-            default=False,
-            dest="router_track_output_blocks",
-            help=(
-                "KV Router: Track output blocks during generation. When enabled, the router adds "
-                "placeholder blocks as tokens are generated and applies fractional decay based on "
-                "progress toward expected_output_tokens."
-            ),
-            obsolete_flag="--track-output-blocks",
-        )
-        add_argument(
-            g,
-            flag_name="--router-event-threads",
-            env_var="DYN_ROUTER_EVENT_THREADS",
-            default=4,
-            help=(
-                "KV Router: Number of event processing threads. When > 1, uses a concurrent radix tree with a thread pool for higher throughput. "
-                "Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning)."
-            ),
-            arg_type=int,
-        )
-        add_argument(
-            g,
-            flag_name="--router-queue-threshold",
-            env_var="DYN_ROUTER_QUEUE_THRESHOLD",
-            default=None,
-            help=(
-                "KV Router: Queue threshold fraction for prefill token capacity. "
-                "When set, requests are queued if all workers exceed this fraction of "
-                "max_num_batched_tokens. Enables priority scheduling via latency_sensitivity "
-                "hints. Must be > 0. If not set, queueing is disabled."
-            ),
-            arg_type=float,
-        )
-        add_negatable_bool_argument(
-            g,
-            flag_name="--enable-cache-control",
-            env_var="DYN_ENABLE_CACHE_CONTROL",
-            default=False,
-            dest="router_enable_cache_control",
-            help=(
-                "KV Router: Enable cache control (PIN with TTL). When set, the router creates "
-                "a cache_control service mesh client and fires pin_prefix after generation for "
-                "requests with nvext.cache_control. Requires --router-mode=kv."
-            ),
-        )
        add_negatable_bool_argument(
            g,
            flag_name="--decode-fallback",

--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -177,24 +177,7 @@ async def async_main():
    if config.router_mode == "kv":
        router_mode = RouterMode.KV
-        kv_router_config = KvRouterConfig(
+        kv_router_config = KvRouterConfig(**config.kv_router_kwargs())
-            overlap_score_weight=config.kv_overlap_score_weight,
-            router_temperature=config.router_temperature,
-            use_kv_events=config.use_kv_events,
-            durable_kv_events=config.durable_kv_events,
-            router_replica_sync=config.router_replica_sync,
-            router_track_active_blocks=config.router_track_active_blocks,
-            router_track_output_blocks=config.router_track_output_blocks,
-            router_assume_kv_reuse=config.router_assume_kv_reuse,
-            router_snapshot_threshold=config.router_snapshot_threshold,
-            router_reset_states=config.router_reset_states,
-            router_ttl_secs=config.router_ttl,
-            router_max_tree_size=config.router_max_tree_size,
-            router_prune_target_ratio=config.router_prune_target_ratio,
-            router_queue_threshold=config.router_queue_threshold,
-            router_event_threads=config.router_event_threads,
-            router_enable_cache_control=config.router_enable_cache_control,
-        )
    elif config.router_mode == "random":
        router_mode = RouterMode.Random
        kv_router_config = None

--- a/components/src/dynamo/router/__main__.py
+++ b/components/src/dynamo/router/__main__.py
@@ -19,9 +19,8 @@ from typing import Optional
 import uvloop
 from dynamo.llm import KvRouter, KvRouterConfig
-from dynamo.router.args import build_kv_router_config
+from dynamo.router.args import DynamoRouterConfig, build_kv_router_config
 from dynamo.router.args import parse_args as parse_router_args
-from dynamo.router.backend_args import DynamoRouterConfig
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -163,10 +162,10 @@ async def worker(runtime: DistributedRuntime):
    logger.info("Starting Standalone Router Service")
    logger.debug(
        f"Configuration: endpoint={config.endpoint}, router_block_size={config.router_block_size}, "
-        f"overlap_score_weight={config.router_kv_overlap_score_weight}, "
+        f"overlap_score_weight={config.overlap_score_weight}, "
        f"router_temperature={config.router_temperature}, "
-        f"router_use_kv_events={config.router_use_kv_events}, "
+        f"use_kv_events={config.use_kv_events}, "
-        f"router_durable_kv_events={config.router_durable_kv_events}, "
+        f"durable_kv_events={config.durable_kv_events}, "
        f"router_replica_sync={config.router_replica_sync}, "
        f"router_reset_states={config.router_reset_states}, "
        f"router_track_active_blocks={config.router_track_active_blocks}, "

--- a/components/src/dynamo/router/args.py
+++ b/components/src/dynamo/router/args.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-"""Router CLI parsing and config assembly."""
+"""Router CLI parsing, config, and assembly for the standalone router."""
 import argparse
+from dynamo.common.configuration.arg_group import ArgGroup
+from dynamo.common.configuration.groups.kv_router_args import (
+    KvRouterArgGroup,
+    KvRouterConfigBase,
+)
+from dynamo.common.configuration.utils import add_argument
 from dynamo.llm import KvRouterConfig
-from .backend_args import DynamoRouterArgGroup, DynamoRouterConfig
+class DynamoRouterConfig(KvRouterConfigBase):
+    """Typed configuration for the standalone KV router (router-owned options only)."""
-def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
+    namespace: str
-    """Build KvRouterConfig from DynamoRouterConfig.
+    endpoint: str
+    router_block_size: int
-    Maps CLI/config attribute names to KvRouterConfig constructor kwargs.
+    def validate(self) -> None:
-    The only name difference is router_kv_overlap_score_weight -> overlap_score_weight.
+        """Validate config invariants (aligned with Rust KvRouterConfig where applicable)."""
-    """
+        if not self.endpoint:
-    return KvRouterConfig(
+            raise ValueError(
-        overlap_score_weight=router_config.router_kv_overlap_score_weight,
+                "endpoint is required (set --endpoint or DYN_ROUTER_ENDPOINT)"
-        router_temperature=router_config.router_temperature,
+            )
-        use_kv_events=router_config.router_use_kv_events,
-        durable_kv_events=router_config.router_durable_kv_events,
+        parts = self.endpoint.split(".")
-        router_replica_sync=router_config.router_replica_sync,
+        if len(parts) != 3:
-        router_track_active_blocks=router_config.router_track_active_blocks,
+            raise ValueError(
-        router_track_output_blocks=router_config.router_track_output_blocks,
+                f"Invalid endpoint format: {self.endpoint!r}. "
-        router_assume_kv_reuse=router_config.router_assume_kv_reuse,
+                "Expected format: namespace.component.endpoint"
-        router_snapshot_threshold=router_config.router_snapshot_threshold,
+            )
-        router_reset_states=router_config.router_reset_states,
+        self.namespace = parts[0]
-        router_ttl_secs=router_config.router_ttl_secs,
-        router_max_tree_size=router_config.router_max_tree_size,
-        router_prune_target_ratio=router_config.router_prune_target_ratio,
+class DynamoRouterArgGroup(ArgGroup):
-        router_event_threads=router_config.router_event_threads,
+    """CLI argument group for standalone router options."""
+    name = "dynamo-router"
+    def add_arguments(self, parser) -> None:
+        """Add router-owned arguments to parser."""
+        g = parser.add_argument_group("Dynamo Router Options")
+        add_argument(
+            g,
+            flag_name="--endpoint",
+            env_var="DYN_ROUTER_ENDPOINT",
+            default=None,
+            help="Full endpoint path for workers in the format namespace.component.endpoint (e.g., dynamo.prefill.generate for prefill workers)",
+            arg_type=str,
+        )
+        add_argument(
+            g,
+            flag_name="--router-block-size",
+            env_var="DYN_ROUTER_BLOCK_SIZE",
+            default=128,
+            help="KV cache block size for routing decisions",
+            arg_type=int,
+            obsolete_flag="--block-size",
        )
+        # KV router options (shared with dynamo.frontend)
+        KvRouterArgGroup().add_arguments(parser)
+def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
+    """Build KvRouterConfig from DynamoRouterConfig."""
+    return KvRouterConfig(**router_config.kv_router_kwargs())
 def parse_args(argv=None) -> DynamoRouterConfig:
    """Parse command-line arguments for the standalone router.

--- a/docs/backends/sglang/agents.md
+++ b/docs/backends/sglang/agents.md
@@ -7,7 +7,7 @@ subtitle: Priority scheduling, KV cache eviction policies, and cache pinning for
 # SGLang for Agentic Workloads
-This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/router/agent-hints.md) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations.
+This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/frontend/nvext.md#agent-hints) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations.
 ## Overview
@@ -301,7 +301,6 @@ A high `cached_tokens / prompt_tokens` ratio on subsequent turns confirms that t
 ## See Also
- **[Agent Hints](../../components/router/agent-hints.md)**: Per-request hint reference
+- **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference including agent hints
- **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference
 - **[Router Guide](../../components/router/router-guide.md)**: Router configuration and CLI arguments
 - **[SGLang HiCache](../../integrations/sglang-hicache.md)**: Enabling hierarchical KV cache
--- a/docs/benchmarks/kv-router-ab-testing.md
+++ b/docs/benchmarks/kv-router-ab-testing.md
@@ -783,7 +783,7 @@ VllmPrefillWorker:
 ## Conclusion
-This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit.
+This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit. For further details on tuning the KV router, see the [Tuning Guidelines](../components/router/router-guide.md#tuning-guidelines).
 For questions or issues, consult the [Dynamo documentation](https://github.com/ai-dynamo/dynamo) or open an issue on GitHub.

--- a/docs/components/frontend/nvext.md
+++ b/docs/components/frontend/nvext.md
@@ -64,7 +64,7 @@ The `agent_hints` sub-object carries per-request hints that the router uses for
 ### `latency_sensitivity`
-When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. Has no effect when queueing is disabled.
+When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. A recommended default is `1.2` for latency-sensitive agentic requests. Has no effect when queueing is disabled.
 ```json
 {
@@ -195,3 +195,4 @@ When the client requests response metadata via `extra_fields`, the response incl
 |----------|-------------|
 | [Frontend Guide](frontend-guide.md) | KServe gRPC configuration and integration |
 | [Router Guide](../router/router-guide.md) | Full router configuration and CLI arguments |
+| [SGLang for Agentic Workloads](../../backends/sglang/agents.md) | SGLang engine flags for priority scheduling, eviction policies, and cache pinning |
--- a/docs/components/router/README.md
+++ b/docs/components/router/README.md
@@ -8,75 +8,22 @@ The Dynamo KV Router intelligently routes requests by evaluating their computati
 ## Quick Start
-### Python / CLI Deployment
 To launch the Dynamo frontend with the KV Router:
 ```bash
 python -m dynamo.frontend --router-mode kv --http-port 8000
 ```
-This command:
+For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automatically report KV cache events — no worker-side configuration changes needed.
- Launches the Dynamo frontend service with KV routing enabled
- Exposes the service on port 8000 (configurable)
- Automatically handles all backend workers registered to the Dynamo endpoint
-Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
-#### CLI Arguments
 | Argument | Default | Description |
 |----------|---------|-------------|
 | `--router-mode kv` | `round-robin` | Enable KV cache-aware routing |
-| `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) |
+| `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
-| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
+| `--no-router-kv-events` | KV events enabled | Disable KV event consumption from workers and fall back to approximate routing |
-| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
+| `--router-queue-threshold` | disabled | Enable backpressure queue under high concurrency; also enables priority scheduling via `nvext.agent_hints.latency_sensitivity` |
-| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
-For all available options: `python -m dynamo.frontend --help`
-### Kubernetes Deployment
-To enable the KV Router in Kubernetes, add the `DYN_ROUTER_MODE` environment variable to your frontend service:
-```yaml
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: my-deployment
-spec:
-  services:
-    Frontend:
-      dynamoNamespace: my-namespace
-      componentType: frontend
-      replicas: 1
-      envs:
-        - name: DYN_ROUTER_MODE
-          value: kv  # Enable KV Smart Router
-```
-**Key Points:**
- Set `DYN_ROUTER_MODE=kv` on the **Frontend** service only
- Workers automatically report KV cache events to the router
- No worker-side configuration changes needed
-#### Environment Variables
-All CLI arguments can be configured via environment variables using the `DYN_` prefix:
-| CLI Argument | Environment Variable | Default |
-|--------------|---------------------|---------|
-| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
-| `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
-| `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
-| `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
-| `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
-For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).
-For A/B testing and advanced K8s setup, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
-For more configuration options and tuning guidelines, see the [Router Guide](router-guide.md).
+For all CLI arguments, environment variables, K8s deployment examples, and tuning guidelines, see the [Router Guide](router-guide.md). For A/B benchmarking, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
 ## Prerequisites and Limitations
@@ -99,4 +46,5 @@ For basic model registration without KV routing, use `--router-mode round-robin`
 - **[Router Guide](router-guide.md)**: Deep dive into KV cache routing, configuration, disaggregated serving, and tuning
 - **[Router Examples](router-examples.md)**: Python API usage, K8s examples, and custom routing patterns
+- **[Standalone Indexer](standalone-indexer.md)**: Run the KV indexer as a separate service for independent scaling
 - **[Router Design](../../design-docs/router-design.md)**: Architecture details, algorithms, and event transport modes
--- a/docs/components/router/agent-hints.md
+++ b/docs/components/router/agent-hints.md
---
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-title: Agent Hints
-subtitle: Per-request hints for scheduling, load balancing, and KV cache optimization
---
-Agent hints are optional per-request hints passed via the `nvext.agent_hints` field in the request body. They allow the calling agent or application to communicate request-level metadata that the router uses to improve scheduling, load balancing, and KV cache utilization.
-```json
-{
-  "nvext": {
-    "agent_hints": {
-      "latency_sensitivity": 5.0,
-      "osl": 512,
-      "speculative_prefill": true
-    }
-  }
-}
-```
-All three fields are optional and independent — you can use any combination.
-## `latency_sensitivity`
-Priority scheduling hint, specified in seconds. When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. Has no effect when queueing is disabled.
- **Type**: `f64` (optional)
- **Recommended default**: `1.2` for latency-sensitive agentic requests
- **Requires**: `--router-queue-threshold` to be set
-### Example
-```json
-{
-  "nvext": {
-    "agent_hints": {
-      "latency_sensitivity": 5.0
-    }
-  }
-}
-```
-A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if it arrived at `T - 5s`, so it will be scheduled ahead of requests that arrived within the last 5 seconds (unless they have even higher sensitivity).
-## `osl`
-Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
-1. **Output block tracking**: When output block tracking is enabled (`--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
-2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
- **Type**: `u32` (optional)
- **Requires**: `--router-track-output-blocks` (frontend or standalone router) for output block tracking behavior
-### Example
-```json
-{
-  "nvext": {
-    "agent_hints": {
-      "osl": 1024
-    }
-  }
-}
-```
-If the request is expected to generate ~1024 tokens, providing `osl: 1024` lets the router account for the output-side KV cache growth when balancing load across workers.
-## `speculative_prefill`
-When set to `true`, the system speculatively prefills the predicted next-turn prompt after the current assistant turn completes. This is designed for multi-turn agentic workloads where the next request's prefix is predictable.
- **Type**: `bool` (optional, defaults to `false`)
- **No additional CLI flags required**; works automatically when the hint is set in the request
-### How it works
-1. As the assistant response streams, the system accumulates the full response text.
-2. Once the response finishes (indicated by `finish_reason`), a background task constructs the next-turn prompt by appending the assistant response to the conversation history (with thinking content stripped by the chat template for non-last assistant turns).
-3. The constructed prompt is tokenized and sent through the pipeline as a `max_tokens=1` request to warm the KV cache on a worker.
-4. When the actual next request arrives, it benefits from the already-warm KV cache, reducing TTFT.
-### Example
-```json
-{
-  "nvext": {
-    "agent_hints": {
-      "speculative_prefill": true
-    }
-  }
-}
-```
-This is most effective for reasoning models in agentic loops, where the conversation grows incrementally and the next turn's prefix (everything up to the new user message) is the same as the current conversation.
-## See Also
- **[SGLang for Agentic Workloads](../../backends/sglang/agents.md)**: SGLang engine flags for priority scheduling, eviction policies, and cache pinning
- **[NVIDIA Request Extensions (nvext)](../frontend/nvext.md)**: Full `nvext` field reference including `cache_control`
- **[Router Guide](router-guide.md)**: Full router configuration and CLI arguments
- **[Router Examples](router-examples.md)**: Usage patterns and benchmarking
--- a/docs/components/router/router-guide.md
+++ b/docs/components/router/router-guide.md
--- a/lib/bench/src/bin/README.md
+++ b/lib/bench/src/bin/README.md
@@ -101,4 +101,4 @@ request arrives.
 4. The KV router routes the speculative request to the same worker, warming its cache.
 5. When the real next-turn request arrives, the KV router sees high cache overlap on that worker and routes there, yielding a much lower TTFT.
-See also: [Agent Hints documentation](../../../../docs/components/router/agent-hints.md)
+See also: [Agent Hints documentation](../../../../docs/components/frontend/nvext.md#agent-hints)