Unverified commit 9f3b7b33, authored by Yan Ru Pei and committed by GitHub
Browse files

chore(kv-router): deduplicate KvRouterConfig args into shared ArgGroup (#6805)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 0da960f1
...@@ -3,6 +3,12 @@ ...@@ -3,6 +3,12 @@
"""ArgGroup implementations for different configuration domains.""" """ArgGroup implementations for different configuration domains."""
from .kv_router_args import KvRouterArgGroup, KvRouterConfigBase
from .runtime_args import DynamoRuntimeArgGroup, DynamoRuntimeConfig from .runtime_args import DynamoRuntimeArgGroup, DynamoRuntimeConfig
__all__ = ["DynamoRuntimeArgGroup", "DynamoRuntimeConfig"] __all__ = [
"DynamoRuntimeArgGroup",
"DynamoRuntimeConfig",
"KvRouterArgGroup",
"KvRouterConfigBase",
]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Dynamo standalone router configuration ArgGroup.""" """Shared KV router configuration ArgGroup.
Defines the 16 KvRouterConfig parameters once so that both
``dynamo.frontend`` and ``dynamo.router`` can reuse them without duplication.
Field names on ``KvRouterConfigBase`` match the ``KvRouterConfig`` Python
constructor kwargs 1:1, so ``kv_router_kwargs()`` returns a dict that can be
unpacked directly into ``KvRouterConfig(**config.kv_router_kwargs())``.
"""
from typing import Optional
from dynamo.common.configuration.arg_group import ArgGroup from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase from dynamo.common.configuration.config_base import ConfigBase
from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
# Authoritative field list — used by kv_router_kwargs() to extract values.
# Each name is read from the config object with getattr(), so every entry must
# also be declared as an attribute on KvRouterConfigBase. The resulting dict is
# unpacked straight into KvRouterConfig(**kwargs), so these names double as that
# constructor's keyword arguments.
_KV_ROUTER_FIELDS: tuple[str, ...] = (
"overlap_score_weight",
"router_temperature",
"use_kv_events",
"durable_kv_events",
"router_replica_sync",
"router_track_active_blocks",
"router_track_output_blocks",
"router_assume_kv_reuse",
"router_snapshot_threshold",
"router_reset_states",
"router_ttl_secs",
"router_max_tree_size",
"router_prune_target_ratio",
"router_queue_threshold",
"router_event_threads",
"router_enable_cache_control",
)
class DynamoRouterConfig(ConfigBase): class KvRouterConfigBase(ConfigBase):
"""Typed configuration for the standalone KV router (router-owned options only).""" """Mixin carrying the 16 KvRouterConfig fields."""
namespace: str overlap_score_weight: float
endpoint: str
router_block_size: int
router_kv_overlap_score_weight: float
router_temperature: float router_temperature: float
router_use_kv_events: bool use_kv_events: bool
durable_kv_events: bool
router_replica_sync: bool router_replica_sync: bool
router_snapshot_threshold: int
router_reset_states: bool
router_durable_kv_events: bool
router_track_active_blocks: bool router_track_active_blocks: bool
router_assume_kv_reuse: bool
router_track_output_blocks: bool router_track_output_blocks: bool
router_assume_kv_reuse: bool
router_snapshot_threshold: int
router_reset_states: bool
router_ttl_secs: float router_ttl_secs: float
router_max_tree_size: int router_max_tree_size: int
router_prune_target_ratio: float router_prune_target_ratio: float
router_queue_threshold: Optional[float]
router_event_threads: int router_event_threads: int
router_enable_cache_control: bool
def validate(self) -> None: def kv_router_kwargs(self) -> dict:
"""Validate config invariants (aligned with Rust KvRouterConfig where applicable).""" """Return a dict suitable for ``KvRouterConfig(**kwargs)``."""
if not self.endpoint: return {f: getattr(self, f) for f in _KV_ROUTER_FIELDS}
raise ValueError(
"endpoint is required (set --endpoint or DYN_ROUTER_ENDPOINT)"
)
parts = self.endpoint.split(".")
if len(parts) != 3:
raise ValueError(
f"Invalid endpoint format: {self.endpoint!r}. "
"Expected format: namespace.component.endpoint"
)
self.namespace = parts[0]
class DynamoRouterArgGroup(ArgGroup):
"""CLI argument group for standalone router options."""
name = "dynamo-router" class KvRouterArgGroup(ArgGroup):
"""CLI arguments for the 16 KvRouterConfig parameters."""
def add_arguments(self, parser) -> None: def add_arguments(self, parser) -> None:
"""Add router-owned arguments to parser.""" g = parser.add_argument_group("KV Router Options")
g = parser.add_argument_group("Dynamo Router Options")
add_argument(
g,
flag_name="--endpoint",
env_var="DYN_ROUTER_ENDPOINT",
default=None,
help="Full endpoint path for workers in the format namespace.component.endpoint (e.g., dynamo.prefill.generate for prefill workers)",
arg_type=str,
)
add_argument(
g,
flag_name="--router-block-size",
env_var="DYN_ROUTER_BLOCK_SIZE",
default=128,
help="KV cache block size for routing decisions",
arg_type=int,
obsolete_flag="--block-size",
)
add_argument( add_argument(
g, g,
flag_name="--router-kv-overlap-score-weight", flag_name="--router-kv-overlap-score-weight",
env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT", env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
default=1.0, default=1.0,
help="KV Router: Weight for overlap score in worker selection. Higher values prioritize KV cache reuse", help=(
"KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse."
),
arg_type=float, arg_type=float,
dest="overlap_score_weight",
obsolete_flag="--kv-overlap-score-weight", obsolete_flag="--kv-overlap-score-weight",
) )
add_argument( add_argument(
g, g,
flag_name="--router-temperature", flag_name="--router-temperature",
env_var="DYN_ROUTER_TEMPERATURE", env_var="DYN_ROUTER_TEMPERATURE",
default=0.0, default=0.0,
help="KV Router: Temperature for worker sampling via softmax. Higher values promote more randomness, and 0 fallbacks to deterministic.", help=(
"KV Router: Temperature for worker sampling via softmax. Higher values "
"promote more randomness, and 0 fallbacks to deterministic."
),
arg_type=float, arg_type=float,
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-kv-events", flag_name="--router-kv-events",
env_var="DYN_ROUTER_USE_KV_EVENTS", env_var="DYN_ROUTER_USE_KV_EVENTS",
default=True, default=True,
help="KV Router: Enable KV events from workers. When disabled (--no-router-kv-events), the router predicts cache state based on routing decisions with TTL-based expiration and pruning, rather than receiving events from workers.", help=(
dest="router_use_kv_events", "KV Router: Enable/disable KV events. Use --router-kv-events to enable "
"(default, router receives cache state events from workers) or --no-router-kv-events "
"to disable (router predicts cache state based on routing decisions)."
),
dest="use_kv_events",
obsolete_flag="--kv-events", obsolete_flag="--kv-events",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-replica-sync", flag_name="--router-durable-kv-events",
env_var="DYN_ROUTER_REPLICA_SYNC", env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
default=False,
help="KV Router: Enable replica synchronization across multiple router instances. When true, routers will publish and subscribe to events to maintain consistent state.",
)
add_argument(
g,
flag_name="--router-snapshot-threshold",
env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
default=1000000,
help="KV Router: Number of messages in stream before triggering a snapshot",
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--router-reset-states",
env_var="DYN_ROUTER_RESET_STATES",
default=False, default=False,
help="KV Router: Reset router state on startup, purging stream and object store. WARNING: Can affect existing router replicas.", help=(
"[Deprecated] KV Router: Enable durable KV events using NATS JetStream. "
"This option will be removed in a future release. The event-plane subscriber "
"(local_indexer mode) is now the recommended path."
),
dest="durable_kv_events",
obsolete_flag="--durable-kv-events",
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-durable-kv-events", flag_name="--router-replica-sync",
env_var="DYN_ROUTER_DURABLE_KV_EVENTS", env_var="DYN_ROUTER_REPLICA_SYNC",
default=False, default=False,
help="[Deprecated] KV Router: Enable durable KV events using NATS JetStream. This option will be removed in a future release. The event-plane subscriber (local_indexer mode) is now the recommended path.", help=(
obsolete_flag="--durable-kv-events", "KV Router: Enable replica synchronization across multiple router instances. "
"When true, routers will publish and subscribe to events to maintain "
"consistent state."
),
) )
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-track-active-blocks", flag_name="--router-track-active-blocks",
env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS", env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
default=True, default=True,
help="KV Router: Track active blocks for load balancing. Use --no-router-track-active-blocks to disable", dest="router_track_active_blocks",
help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing."
),
obsolete_flag="--track-active-blocks", obsolete_flag="--track-active-blocks",
) )
add_negatable_bool_argument(
g,
flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False,
dest="router_track_output_blocks",
help=(
"KV Router: Track output blocks during generation. When enabled, the router adds "
"placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected output sequence length."
),
obsolete_flag="--track-output-blocks",
)
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-assume-kv-reuse", flag_name="--router-assume-kv-reuse",
env_var="DYN_ROUTER_ASSUME_KV_REUSE", env_var="DYN_ROUTER_ASSUME_KV_REUSE",
default=True, default=True,
help="KV Router: When tracking active blocks, assume KV cache reuse. Use --no-router-assume-kv-reuse to use random hashes, useful when KV cache reuse is not expected.", dest="router_assume_kv_reuse",
help=(
"KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-router-assume-kv-reuse to generate random hashes instead "
"(when KV cache reuse is not expected)."
),
obsolete_flag="--assume-kv-reuse", obsolete_flag="--assume-kv-reuse",
) )
add_argument(
g,
flag_name="--router-snapshot-threshold",
env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
default=1000000,
help="KV Router: Number of messages in stream before triggering a snapshot.",
arg_type=int,
)
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--router-track-output-blocks", flag_name="--router-reset-states",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS", env_var="DYN_ROUTER_RESET_STATES",
default=False, default=False,
help="KV Router: Track output blocks during generation. When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward expected output sequence length (agent_hints.osl in nvext).", help=(
obsolete_flag="--track-output-blocks", "KV Router: Reset router state on startup, purging stream and object store. "
"WARNING: This can affect existing router replicas."
),
) )
add_argument( add_argument(
g, g,
flag_name="--router-ttl-secs", flag_name="--router-ttl-secs",
env_var="DYN_ROUTER_TTL_SECS", env_var="DYN_ROUTER_TTL_SECS",
default=120.0, default=120.0,
help="KV Router: TTL for blocks in seconds. Only used when --no-router-kv-events is set. Controls how long cached blocks are considered valid without explicit events.", help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float, arg_type=float,
) )
add_argument( add_argument(
g, g,
flag_name="--router-max-tree-size", flag_name="--router-max-tree-size",
env_var="DYN_ROUTER_MAX_TREE_SIZE", env_var="DYN_ROUTER_MAX_TREE_SIZE",
default=2**20, default=2**20,
help="KV Router: Maximum tree size before pruning. Only used when --no-router-kv-events is set. When the indexer tree exceeds this size, pruning is triggered.", help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=int, arg_type=int,
) )
add_argument( add_argument(
g, g,
flag_name="--router-prune-target-ratio", flag_name="--router-prune-target-ratio",
env_var="DYN_ROUTER_PRUNE_TARGET_RATIO", env_var="DYN_ROUTER_PRUNE_TARGET_RATIO",
default=0.8, default=0.8,
help="KV Router: Target size ratio after pruning (0.0-1.0). Only used when --no-router-kv-events is set. Determines how aggressively to prune the tree.", help=(
"KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float,
)
add_argument(
g,
flag_name="--router-queue-threshold",
env_var="DYN_ROUTER_QUEUE_THRESHOLD",
default=None,
help=(
"KV Router: Queue threshold fraction for prefill token capacity. "
"When set, requests are queued if all workers exceed this fraction of "
"max_num_batched_tokens. Must be > 0. If not set, queueing is disabled."
),
arg_type=float, arg_type=float,
) )
add_argument( add_argument(
g, g,
flag_name="--router-event-threads", flag_name="--router-event-threads",
env_var="DYN_ROUTER_EVENT_THREADS", env_var="DYN_ROUTER_EVENT_THREADS",
default=4, default=4,
help="KV Router: Number of event processing threads. >1 uses concurrent radix tree and thread pool for higher throughput. Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning).", help=(
"KV Router: Number of event processing threads. When > 1, uses a concurrent "
"radix tree with a thread pool for higher throughput. Ignored when "
"--no-router-kv-events is set."
),
arg_type=int, arg_type=int,
) )
add_negatable_bool_argument(
g,
flag_name="--enable-cache-control",
env_var="DYN_ENABLE_CACHE_CONTROL",
default=False,
dest="router_enable_cache_control",
help=(
"KV Router: Enable cache control (PIN with TTL). When set, the router creates "
"a cache_control service mesh client and fires pin_prefix after generation for "
"requests with nvext.cache_control."
),
)
...@@ -8,7 +8,10 @@ from typing import Any, Dict, Optional ...@@ -8,7 +8,10 @@ from typing import Any, Dict, Optional
from dynamo.common.config_dump import register_encoder from dynamo.common.config_dump import register_encoder
from dynamo.common.configuration.arg_group import ArgGroup from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase from dynamo.common.configuration.groups.kv_router_args import (
KvRouterArgGroup,
KvRouterConfigBase,
)
from dynamo.common.configuration.utils import ( from dynamo.common.configuration.utils import (
add_argument, add_argument,
add_negatable_bool_argument, add_negatable_bool_argument,
...@@ -36,7 +39,7 @@ def validate_model_path(value: str) -> str: ...@@ -36,7 +39,7 @@ def validate_model_path(value: str) -> str:
return value return value
class FrontendConfig(ConfigBase): class FrontendConfig(KvRouterConfigBase):
"""Configuration for the Dynamo frontend.""" """Configuration for the Dynamo frontend."""
interactive: bool interactive: bool
...@@ -47,24 +50,8 @@ class FrontendConfig(ConfigBase): ...@@ -47,24 +50,8 @@ class FrontendConfig(ConfigBase):
tls_key_path: Optional[pathlib.Path] tls_key_path: Optional[pathlib.Path]
router_mode: str router_mode: str
kv_overlap_score_weight: float
router_temperature: float
use_kv_events: bool
router_ttl: float
router_max_tree_size: int
router_prune_target_ratio: float
namespace: Optional[str] = None namespace: Optional[str] = None
namespace_prefix: Optional[str] = None namespace_prefix: Optional[str] = None
router_replica_sync: bool
router_snapshot_threshold: int
router_reset_states: bool
durable_kv_events: bool
router_track_active_blocks: bool
router_assume_kv_reuse: bool
router_track_output_blocks: bool
router_event_threads: int
router_queue_threshold: Optional[float]
router_enable_cache_control: bool
decode_fallback: bool decode_fallback: bool
migration_limit: int migration_limit: int
...@@ -186,78 +173,9 @@ class FrontendArgGroup(ArgGroup): ...@@ -186,78 +173,9 @@ class FrontendArgGroup(ArgGroup):
help="How to route the request.", help="How to route the request.",
choices=["round-robin", "random", "kv", "direct"], choices=["round-robin", "random", "kv", "direct"],
) )
add_argument(
g, # KV router options (shared with dynamo.router)
flag_name="--router-kv-overlap-score-weight", KvRouterArgGroup().add_arguments(parser)
env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
default=1.0,
help=(
"KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse."
),
arg_type=float,
dest="kv_overlap_score_weight",
obsolete_flag="--kv-overlap-score-weight",
)
add_argument(
g,
flag_name="--router-temperature",
env_var="DYN_ROUTER_TEMPERATURE",
default=0.0,
help=(
"KV Router: Temperature for worker sampling via softmax. Higher values "
"promote more randomness, and 0 fallbacks to deterministic."
),
arg_type=float,
)
add_negatable_bool_argument(
g,
flag_name="--router-kv-events",
env_var="DYN_ROUTER_USE_KV_EVENTS",
default=True,
help=(
"KV Router: Enable/disable KV events. Use --router-kv-events to enable "
"(default, router receives cache state events from workers) or --no-router-kv-events "
"to disable (router predicts cache state based on routing decisions)."
),
dest="use_kv_events",
obsolete_flag="--kv-events",
)
add_argument(
g,
flag_name="--router-ttl-secs",
env_var="DYN_ROUTER_TTL_SECS",
default=120.0,
help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float,
dest="router_ttl",
obsolete_flag="--router-ttl",
)
add_argument(
g,
flag_name="--router-max-tree-size",
env_var="DYN_ROUTER_MAX_TREE_SIZE",
default=2**20,
help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=int,
)
add_argument(
g,
flag_name="--router-prune-target-ratio",
env_var="DYN_ROUTER_PRUNE_TARGET_RATIO",
default=0.8,
help=(
"KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float,
)
add_argument( add_argument(
g, g,
...@@ -271,124 +189,6 @@ class FrontendArgGroup(ArgGroup): ...@@ -271,124 +189,6 @@ class FrontendArgGroup(ArgGroup):
), ),
) )
add_negatable_bool_argument(
g,
flag_name="--router-replica-sync",
env_var="DYN_ROUTER_REPLICA_SYNC",
default=False,
help=(
"KV Router: Enable replica synchronization across multiple router instances. "
"When true, routers will publish and subscribe to events to maintain "
"consistent state."
),
)
add_argument(
g,
flag_name="--router-snapshot-threshold",
env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
default=1000000,
help=(
"KV Router: Number of messages in stream before triggering a snapshot. "
),
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--router-reset-states",
env_var="DYN_ROUTER_RESET_STATES",
default=False,
help=(
"KV Router: Reset router state on startup, purging stream and object store. "
"By default, states are persisted. WARNING: This can affect existing router "
"replicas."
),
)
add_negatable_bool_argument(
g,
flag_name="--router-durable-kv-events",
env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
default=False,
help=(
"[Deprecated] KV Router: Enable durable KV events using NATS JetStream. "
"This option will be removed in a future release. The event-plane subscriber "
"(local_indexer mode) is now the recommended path."
),
dest="durable_kv_events",
obsolete_flag="--durable-kv-events",
)
add_negatable_bool_argument(
g,
flag_name="--router-track-active-blocks",
env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
default=True,
dest="router_track_active_blocks",
help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing. "
),
obsolete_flag="--track-active-blocks",
)
add_negatable_bool_argument(
g,
flag_name="--router-assume-kv-reuse",
env_var="DYN_ROUTER_ASSUME_KV_REUSE",
default=True,
dest="router_assume_kv_reuse",
help=(
"KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-router-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
),
obsolete_flag="--assume-kv-reuse",
)
add_negatable_bool_argument(
g,
flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False,
dest="router_track_output_blocks",
help=(
"KV Router: Track output blocks during generation. When enabled, the router adds "
"placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected_output_tokens."
),
obsolete_flag="--track-output-blocks",
)
add_argument(
g,
flag_name="--router-event-threads",
env_var="DYN_ROUTER_EVENT_THREADS",
default=4,
help=(
"KV Router: Number of event processing threads. When > 1, uses a concurrent radix tree with a thread pool for higher throughput. "
"Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning)."
),
arg_type=int,
)
add_argument(
g,
flag_name="--router-queue-threshold",
env_var="DYN_ROUTER_QUEUE_THRESHOLD",
default=None,
help=(
"KV Router: Queue threshold fraction for prefill token capacity. "
"When set, requests are queued if all workers exceed this fraction of "
"max_num_batched_tokens. Enables priority scheduling via latency_sensitivity "
"hints. Must be > 0. If not set, queueing is disabled."
),
arg_type=float,
)
add_negatable_bool_argument(
g,
flag_name="--enable-cache-control",
env_var="DYN_ENABLE_CACHE_CONTROL",
default=False,
dest="router_enable_cache_control",
help=(
"KV Router: Enable cache control (PIN with TTL). When set, the router creates "
"a cache_control service mesh client and fires pin_prefix after generation for "
"requests with nvext.cache_control. Requires --router-mode=kv."
),
)
add_negatable_bool_argument( add_negatable_bool_argument(
g, g,
flag_name="--decode-fallback", flag_name="--decode-fallback",
......
...@@ -177,24 +177,7 @@ async def async_main(): ...@@ -177,24 +177,7 @@ async def async_main():
if config.router_mode == "kv": if config.router_mode == "kv":
router_mode = RouterMode.KV router_mode = RouterMode.KV
kv_router_config = KvRouterConfig( kv_router_config = KvRouterConfig(**config.kv_router_kwargs())
overlap_score_weight=config.kv_overlap_score_weight,
router_temperature=config.router_temperature,
use_kv_events=config.use_kv_events,
durable_kv_events=config.durable_kv_events,
router_replica_sync=config.router_replica_sync,
router_track_active_blocks=config.router_track_active_blocks,
router_track_output_blocks=config.router_track_output_blocks,
router_assume_kv_reuse=config.router_assume_kv_reuse,
router_snapshot_threshold=config.router_snapshot_threshold,
router_reset_states=config.router_reset_states,
router_ttl_secs=config.router_ttl,
router_max_tree_size=config.router_max_tree_size,
router_prune_target_ratio=config.router_prune_target_ratio,
router_queue_threshold=config.router_queue_threshold,
router_event_threads=config.router_event_threads,
router_enable_cache_control=config.router_enable_cache_control,
)
elif config.router_mode == "random": elif config.router_mode == "random":
router_mode = RouterMode.Random router_mode = RouterMode.Random
kv_router_config = None kv_router_config = None
......
...@@ -19,9 +19,8 @@ from typing import Optional ...@@ -19,9 +19,8 @@ from typing import Optional
import uvloop import uvloop
from dynamo.llm import KvRouter, KvRouterConfig from dynamo.llm import KvRouter, KvRouterConfig
from dynamo.router.args import build_kv_router_config from dynamo.router.args import DynamoRouterConfig, build_kv_router_config
from dynamo.router.args import parse_args as parse_router_args from dynamo.router.args import parse_args as parse_router_args
from dynamo.router.backend_args import DynamoRouterConfig
from dynamo.runtime import Client, DistributedRuntime, dynamo_worker from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
...@@ -163,10 +162,10 @@ async def worker(runtime: DistributedRuntime): ...@@ -163,10 +162,10 @@ async def worker(runtime: DistributedRuntime):
logger.info("Starting Standalone Router Service") logger.info("Starting Standalone Router Service")
logger.debug( logger.debug(
f"Configuration: endpoint={config.endpoint}, router_block_size={config.router_block_size}, " f"Configuration: endpoint={config.endpoint}, router_block_size={config.router_block_size}, "
f"overlap_score_weight={config.router_kv_overlap_score_weight}, " f"overlap_score_weight={config.overlap_score_weight}, "
f"router_temperature={config.router_temperature}, " f"router_temperature={config.router_temperature}, "
f"router_use_kv_events={config.router_use_kv_events}, " f"use_kv_events={config.use_kv_events}, "
f"router_durable_kv_events={config.router_durable_kv_events}, " f"durable_kv_events={config.durable_kv_events}, "
f"router_replica_sync={config.router_replica_sync}, " f"router_replica_sync={config.router_replica_sync}, "
f"router_reset_states={config.router_reset_states}, " f"router_reset_states={config.router_reset_states}, "
f"router_track_active_blocks={config.router_track_active_blocks}, " f"router_track_active_blocks={config.router_track_active_blocks}, "
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Router CLI parsing and config assembly.""" """Router CLI parsing, config, and assembly for the standalone router."""
import argparse import argparse
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.groups.kv_router_args import (
KvRouterArgGroup,
KvRouterConfigBase,
)
from dynamo.common.configuration.utils import add_argument
from dynamo.llm import KvRouterConfig from dynamo.llm import KvRouterConfig
from .backend_args import DynamoRouterArgGroup, DynamoRouterConfig
class DynamoRouterConfig(KvRouterConfigBase):
"""Typed configuration for the standalone KV router (router-owned options only)."""
def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig: namespace: str
"""Build KvRouterConfig from DynamoRouterConfig. endpoint: str
router_block_size: int
Maps CLI/config attribute names to KvRouterConfig constructor kwargs. def validate(self) -> None:
The only name difference is router_kv_overlap_score_weight -> overlap_score_weight. """Validate config invariants (aligned with Rust KvRouterConfig where applicable)."""
""" if not self.endpoint:
return KvRouterConfig( raise ValueError(
overlap_score_weight=router_config.router_kv_overlap_score_weight, "endpoint is required (set --endpoint or DYN_ROUTER_ENDPOINT)"
router_temperature=router_config.router_temperature, )
use_kv_events=router_config.router_use_kv_events,
durable_kv_events=router_config.router_durable_kv_events, parts = self.endpoint.split(".")
router_replica_sync=router_config.router_replica_sync, if len(parts) != 3:
router_track_active_blocks=router_config.router_track_active_blocks, raise ValueError(
router_track_output_blocks=router_config.router_track_output_blocks, f"Invalid endpoint format: {self.endpoint!r}. "
router_assume_kv_reuse=router_config.router_assume_kv_reuse, "Expected format: namespace.component.endpoint"
router_snapshot_threshold=router_config.router_snapshot_threshold, )
router_reset_states=router_config.router_reset_states, self.namespace = parts[0]
router_ttl_secs=router_config.router_ttl_secs,
router_max_tree_size=router_config.router_max_tree_size,
router_prune_target_ratio=router_config.router_prune_target_ratio, class DynamoRouterArgGroup(ArgGroup):
router_event_threads=router_config.router_event_threads, """CLI argument group for standalone router options."""
name = "dynamo-router"
def add_arguments(self, parser) -> None:
"""Add router-owned arguments to parser."""
g = parser.add_argument_group("Dynamo Router Options")
add_argument(
g,
flag_name="--endpoint",
env_var="DYN_ROUTER_ENDPOINT",
default=None,
help="Full endpoint path for workers in the format namespace.component.endpoint (e.g., dynamo.prefill.generate for prefill workers)",
arg_type=str,
)
add_argument(
g,
flag_name="--router-block-size",
env_var="DYN_ROUTER_BLOCK_SIZE",
default=128,
help="KV cache block size for routing decisions",
arg_type=int,
obsolete_flag="--block-size",
) )
# KV router options (shared with dynamo.frontend)
KvRouterArgGroup().add_arguments(parser)
def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
    """Create the ``KvRouterConfig`` for the standalone router.

    The keyword arguments come from ``router_config.kv_router_kwargs()`` and
    are forwarded to the ``KvRouterConfig`` constructor unchanged.
    """
    kv_kwargs = router_config.kv_router_kwargs()
    return KvRouterConfig(**kv_kwargs)
def parse_args(argv=None) -> DynamoRouterConfig: def parse_args(argv=None) -> DynamoRouterConfig:
"""Parse command-line arguments for the standalone router. """Parse command-line arguments for the standalone router.
......
...@@ -7,7 +7,7 @@ subtitle: Priority scheduling, KV cache eviction policies, and cache pinning for ...@@ -7,7 +7,7 @@ subtitle: Priority scheduling, KV cache eviction policies, and cache pinning for
# SGLang for Agentic Workloads # SGLang for Agentic Workloads
This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/router/agent-hints.md) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations. This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/frontend/nvext.md#agent-hints) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations.
## Overview ## Overview
...@@ -301,7 +301,6 @@ A high `cached_tokens / prompt_tokens` ratio on subsequent turns confirms that t ...@@ -301,7 +301,6 @@ A high `cached_tokens / prompt_tokens` ratio on subsequent turns confirms that t
## See Also ## See Also
- **[Agent Hints](../../components/router/agent-hints.md)**: Per-request hint reference - **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference including agent hints
- **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference
- **[Router Guide](../../components/router/router-guide.md)**: Router configuration and CLI arguments - **[Router Guide](../../components/router/router-guide.md)**: Router configuration and CLI arguments
- **[SGLang HiCache](../../integrations/sglang-hicache.md)**: Enabling hierarchical KV cache - **[SGLang HiCache](../../integrations/sglang-hicache.md)**: Enabling hierarchical KV cache
...@@ -783,7 +783,7 @@ VllmPrefillWorker: ...@@ -783,7 +783,7 @@ VllmPrefillWorker:
## Conclusion ## Conclusion
This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit. This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit. For further details on tuning the KV router, see the [Tuning Guidelines](../components/router/router-guide.md#tuning-guidelines).
For questions or issues, consult the [Dynamo documentation](https://github.com/ai-dynamo/dynamo) or open an issue on GitHub. For questions or issues, consult the [Dynamo documentation](https://github.com/ai-dynamo/dynamo) or open an issue on GitHub.
......
...@@ -64,7 +64,7 @@ The `agent_hints` sub-object carries per-request hints that the router uses for ...@@ -64,7 +64,7 @@ The `agent_hints` sub-object carries per-request hints that the router uses for
### `latency_sensitivity` ### `latency_sensitivity`
When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. Has no effect when queueing is disabled. When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. A recommended default is `1.2` for latency-sensitive agentic requests. Has no effect when queueing is disabled.
```json ```json
{ {
...@@ -195,3 +195,4 @@ When the client requests response metadata via `extra_fields`, the response incl ...@@ -195,3 +195,4 @@ When the client requests response metadata via `extra_fields`, the response incl
|----------|-------------| |----------|-------------|
| [Frontend Guide](frontend-guide.md) | KServe gRPC configuration and integration | | [Frontend Guide](frontend-guide.md) | KServe gRPC configuration and integration |
| [Router Guide](../router/router-guide.md) | Full router configuration and CLI arguments | | [Router Guide](../router/router-guide.md) | Full router configuration and CLI arguments |
| [SGLang for Agentic Workloads](../../backends/sglang/agents.md) | SGLang engine flags for priority scheduling, eviction policies, and cache pinning |
...@@ -8,75 +8,22 @@ The Dynamo KV Router intelligently routes requests by evaluating their computati ...@@ -8,75 +8,22 @@ The Dynamo KV Router intelligently routes requests by evaluating their computati
## Quick Start ## Quick Start
### Python / CLI Deployment
To launch the Dynamo frontend with the KV Router: To launch the Dynamo frontend with the KV Router:
```bash ```bash
python -m dynamo.frontend --router-mode kv --http-port 8000 python -m dynamo.frontend --router-mode kv --http-port 8000
``` ```
This command: For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automatically report KV cache events — no worker-side configuration changes needed.
- Launches the Dynamo frontend service with KV routing enabled
- Exposes the service on port 8000 (configurable)
- Automatically handles all backend workers registered to the Dynamo endpoint
Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
#### CLI Arguments
| Argument | Default | Description | | Argument | Default | Description |
|----------|---------|-------------| |----------|---------|-------------|
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing | | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) | | `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) | | `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) |
| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking | | `--router-queue-threshold` | disabled | Enable backpressure queue under high concurrency; also enables priority scheduling via `nvext.agent_hints.latency_sensitivity` |
| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
For all available options: `python -m dynamo.frontend --help`
### Kubernetes Deployment
To enable the KV Router in Kubernetes, add the `DYN_ROUTER_MODE` environment variable to your frontend service:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: my-deployment
spec:
services:
Frontend:
dynamoNamespace: my-namespace
componentType: frontend
replicas: 1
envs:
- name: DYN_ROUTER_MODE
value: kv # Enable KV Smart Router
```
**Key Points:**
- Set `DYN_ROUTER_MODE=kv` on the **Frontend** service only
- Workers automatically report KV cache events to the router
- No worker-side configuration changes needed
#### Environment Variables
All CLI arguments can be configured via environment variables using the `DYN_` prefix:
| CLI Argument | Environment Variable | Default |
|--------------|---------------------|---------|
| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
| `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
| `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
| `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
| `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).
For A/B testing and advanced K8s setup, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
For more configuration options and tuning guidelines, see the [Router Guide](router-guide.md). For all CLI arguments, environment variables, K8s deployment examples, and tuning guidelines, see the [Router Guide](router-guide.md). For A/B benchmarking, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
## Prerequisites and Limitations ## Prerequisites and Limitations
...@@ -99,4 +46,5 @@ For basic model registration without KV routing, use `--router-mode round-robin` ...@@ -99,4 +46,5 @@ For basic model registration without KV routing, use `--router-mode round-robin`
- **[Router Guide](router-guide.md)**: Deep dive into KV cache routing, configuration, disaggregated serving, and tuning - **[Router Guide](router-guide.md)**: Deep dive into KV cache routing, configuration, disaggregated serving, and tuning
- **[Router Examples](router-examples.md)**: Python API usage, K8s examples, and custom routing patterns - **[Router Examples](router-examples.md)**: Python API usage, K8s examples, and custom routing patterns
- **[Standalone Indexer](standalone-indexer.md)**: Run the KV indexer as a separate service for independent scaling
- **[Router Design](../../design-docs/router-design.md)**: Architecture details, algorithms, and event transport modes - **[Router Design](../../design-docs/router-design.md)**: Architecture details, algorithms, and event transport modes
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: Agent Hints
subtitle: Per-request hints for scheduling, load balancing, and KV cache optimization
---
Agent hints are optional per-request hints passed via the `nvext.agent_hints` field in the request body. They allow the calling agent or application to communicate request-level metadata that the router uses to improve scheduling, load balancing, and KV cache utilization.
```json
{
"nvext": {
"agent_hints": {
"latency_sensitivity": 5.0,
"osl": 512,
"speculative_prefill": true
}
}
}
```
All three fields are optional and independent — you can use any combination.
## `latency_sensitivity`
Priority scheduling hint, specified in seconds. When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. Has no effect when queueing is disabled.
- **Type**: `f64` (optional)
- **Recommended default**: `1.2` for latency-sensitive agentic requests
- **Requires**: `--router-queue-threshold` to be set
### Example
```json
{
"nvext": {
"agent_hints": {
"latency_sensitivity": 5.0
}
}
}
```
A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if it arrived at `T - 5s`, so it is scheduled ahead of requests without a hint that arrived within the last 5 seconds. Another request still goes first if its own arrival time minus its `latency_sensitivity` is earlier than `T - 5s`.
## `osl`
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When output block tracking is enabled (`--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
- **Type**: `u32` (optional)
- **Requires**: `--router-track-output-blocks` (frontend or standalone router) for output block tracking behavior
### Example
```json
{
"nvext": {
"agent_hints": {
"osl": 1024
}
}
}
```
If the request is expected to generate ~1024 tokens, providing `osl: 1024` lets the router account for the output-side KV cache growth when balancing load across workers.
## `speculative_prefill`
When set to `true`, the system speculatively prefills the predicted next-turn prompt after the current assistant turn completes. This is designed for multi-turn agentic workloads where the next request's prefix is predictable.
- **Type**: `bool` (optional, defaults to `false`)
- **No additional CLI flags required**; works automatically when the hint is set in the request
### How it works
1. As the assistant response streams, the system accumulates the full response text.
2. Once the response finishes (indicated by `finish_reason`), a background task constructs the next-turn prompt by appending the assistant response to the conversation history. The chat template strips thinking content from every assistant turn except the last one.
3. The constructed prompt is tokenized and sent through the pipeline as a `max_tokens=1` request to warm the KV cache on a worker.
4. When the actual next request arrives, it benefits from the already-warm KV cache, reducing TTFT.
### Example
```json
{
"nvext": {
"agent_hints": {
"speculative_prefill": true
}
}
}
```
This is most effective for reasoning models in agentic loops, where the conversation grows incrementally and the next turn's prefix (everything up to the new user message) is the same as the current conversation.
## See Also
- **[SGLang for Agentic Workloads](../../backends/sglang/agents.md)**: SGLang engine flags for priority scheduling, eviction policies, and cache pinning
- **[NVIDIA Request Extensions (nvext)](../frontend/nvext.md)**: Full `nvext` field reference including `cache_control`
- **[Router Guide](router-guide.md)**: Full router configuration and CLI arguments
- **[Router Examples](router-examples.md)**: Usage patterns and benchmarking
This diff is collapsed.
...@@ -101,4 +101,4 @@ request arrives. ...@@ -101,4 +101,4 @@ request arrives.
4. The KV router routes the speculative request to the same worker, warming its cache. 4. The KV router routes the speculative request to the same worker, warming its cache.
5. When the real next-turn request arrives, the KV router sees high cache overlap on that worker and routes there, yielding a much lower TTFT. 5. When the real next-turn request arrives, the KV router sees high cache overlap on that worker and routes there, yielding a much lower TTFT.
See also: [Agent Hints documentation](../../../../docs/components/router/agent-hints.md) See also: [Agent Hints documentation](../../../../docs/components/frontend/nvext.md#agent-hints)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment