Commit 9f3b7b33 (unverified), authored by Yan Ru Pei, committed by GitHub
Browse files

chore(kv-router): deduplicate KvRouterConfig args into shared ArgGroup (#6805)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 0da960f1
......@@ -3,6 +3,12 @@
"""ArgGroup implementations for different configuration domains."""
from .kv_router_args import KvRouterArgGroup, KvRouterConfigBase
from .runtime_args import DynamoRuntimeArgGroup, DynamoRuntimeConfig
__all__ = ["DynamoRuntimeArgGroup", "DynamoRuntimeConfig"]
__all__ = [
"DynamoRuntimeArgGroup",
"DynamoRuntimeConfig",
"KvRouterArgGroup",
"KvRouterConfigBase",
]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Dynamo standalone router configuration ArgGroup."""
"""Shared KV router configuration ArgGroup.
Defines the 16 KvRouterConfig parameters once so that both
``dynamo.frontend`` and ``dynamo.router`` can reuse them without duplication.
Field names on ``KvRouterConfigBase`` match the ``KvRouterConfig`` Python
constructor kwargs 1:1, so ``kv_router_kwargs()`` returns a dict that can be
unpacked directly into ``KvRouterConfig(**config.kv_router_kwargs())``.
"""
from typing import Optional
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase
from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
# Single source of truth for the KV router field names. kv_router_kwargs()
# walks this tuple to pull the matching attributes off a config object, so
# every entry must be an attribute name on the config.
_KV_ROUTER_FIELDS: tuple[str, ...] = (
    # Worker scoring / sampling
    "overlap_score_weight",
    "router_temperature",
    # KV events and replica sync
    "use_kv_events",
    "durable_kv_events",
    "router_replica_sync",
    # Block tracking
    "router_track_active_blocks",
    "router_track_output_blocks",
    "router_assume_kv_reuse",
    # Snapshot / state reset
    "router_snapshot_threshold",
    "router_reset_states",
    # TTL and pruning
    "router_ttl_secs",
    "router_max_tree_size",
    "router_prune_target_ratio",
    # Queueing, event threads, cache control
    "router_queue_threshold",
    "router_event_threads",
    "router_enable_cache_control",
)
class DynamoRouterConfig(ConfigBase):
"""Typed configuration for the standalone KV router (router-owned options only)."""
class KvRouterConfigBase(ConfigBase):
"""Mixin carrying the 16 KvRouterConfig fields."""
namespace: str
endpoint: str
router_block_size: int
router_kv_overlap_score_weight: float
overlap_score_weight: float
router_temperature: float
router_use_kv_events: bool
use_kv_events: bool
durable_kv_events: bool
router_replica_sync: bool
router_snapshot_threshold: int
router_reset_states: bool
router_durable_kv_events: bool
router_track_active_blocks: bool
router_assume_kv_reuse: bool
router_track_output_blocks: bool
router_assume_kv_reuse: bool
router_snapshot_threshold: int
router_reset_states: bool
router_ttl_secs: float
router_max_tree_size: int
router_prune_target_ratio: float
router_queue_threshold: Optional[float]
router_event_threads: int
router_enable_cache_control: bool
def validate(self) -> None:
"""Validate config invariants (aligned with Rust KvRouterConfig where applicable)."""
if not self.endpoint:
raise ValueError(
"endpoint is required (set --endpoint or DYN_ROUTER_ENDPOINT)"
)
parts = self.endpoint.split(".")
if len(parts) != 3:
raise ValueError(
f"Invalid endpoint format: {self.endpoint!r}. "
"Expected format: namespace.component.endpoint"
)
self.namespace = parts[0]
def kv_router_kwargs(self) -> dict:
    """Collect the KV router fields of this config into a kwargs dict.

    Returns:
        A dict keyed by each name in ``_KV_ROUTER_FIELDS`` whose value is the
        attribute of the same name on ``self``; intended to be unpacked as
        ``KvRouterConfig(**kwargs)``.
    """
    kwargs = {}
    for field_name in _KV_ROUTER_FIELDS:
        kwargs[field_name] = getattr(self, field_name)
    return kwargs
class DynamoRouterArgGroup(ArgGroup):
"""CLI argument group for standalone router options."""
name = "dynamo-router"
class KvRouterArgGroup(ArgGroup):
"""CLI arguments for the 16 KvRouterConfig parameters."""
def add_arguments(self, parser) -> None:
"""Add router-owned arguments to parser."""
g = parser.add_argument_group("Dynamo Router Options")
add_argument(
g,
flag_name="--endpoint",
env_var="DYN_ROUTER_ENDPOINT",
default=None,
help="Full endpoint path for workers in the format namespace.component.endpoint (e.g., dynamo.prefill.generate for prefill workers)",
arg_type=str,
)
add_argument(
g,
flag_name="--router-block-size",
env_var="DYN_ROUTER_BLOCK_SIZE",
default=128,
help="KV cache block size for routing decisions",
arg_type=int,
obsolete_flag="--block-size",
)
g = parser.add_argument_group("KV Router Options")
add_argument(
g,
flag_name="--router-kv-overlap-score-weight",
env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
default=1.0,
help="KV Router: Weight for overlap score in worker selection. Higher values prioritize KV cache reuse",
help=(
"KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse."
),
arg_type=float,
dest="overlap_score_weight",
obsolete_flag="--kv-overlap-score-weight",
)
add_argument(
g,
flag_name="--router-temperature",
env_var="DYN_ROUTER_TEMPERATURE",
default=0.0,
help="KV Router: Temperature for worker sampling via softmax. Higher values promote more randomness, and 0 fallbacks to deterministic.",
help=(
"KV Router: Temperature for worker sampling via softmax. Higher values "
"promote more randomness, and 0 fallbacks to deterministic."
),
arg_type=float,
)
add_negatable_bool_argument(
g,
flag_name="--router-kv-events",
env_var="DYN_ROUTER_USE_KV_EVENTS",
default=True,
help="KV Router: Enable KV events from workers. When disabled (--no-router-kv-events), the router predicts cache state based on routing decisions with TTL-based expiration and pruning, rather than receiving events from workers.",
dest="router_use_kv_events",
help=(
"KV Router: Enable/disable KV events. Use --router-kv-events to enable "
"(default, router receives cache state events from workers) or --no-router-kv-events "
"to disable (router predicts cache state based on routing decisions)."
),
dest="use_kv_events",
obsolete_flag="--kv-events",
)
add_negatable_bool_argument(
g,
flag_name="--router-replica-sync",
env_var="DYN_ROUTER_REPLICA_SYNC",
default=False,
help="KV Router: Enable replica synchronization across multiple router instances. When true, routers will publish and subscribe to events to maintain consistent state.",
)
add_argument(
g,
flag_name="--router-snapshot-threshold",
env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
default=1000000,
help="KV Router: Number of messages in stream before triggering a snapshot",
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--router-reset-states",
env_var="DYN_ROUTER_RESET_STATES",
flag_name="--router-durable-kv-events",
env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
default=False,
help="KV Router: Reset router state on startup, purging stream and object store. WARNING: Can affect existing router replicas.",
help=(
"[Deprecated] KV Router: Enable durable KV events using NATS JetStream. "
"This option will be removed in a future release. The event-plane subscriber "
"(local_indexer mode) is now the recommended path."
),
dest="durable_kv_events",
obsolete_flag="--durable-kv-events",
)
add_negatable_bool_argument(
g,
flag_name="--router-durable-kv-events",
env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
flag_name="--router-replica-sync",
env_var="DYN_ROUTER_REPLICA_SYNC",
default=False,
help="[Deprecated] KV Router: Enable durable KV events using NATS JetStream. This option will be removed in a future release. The event-plane subscriber (local_indexer mode) is now the recommended path.",
obsolete_flag="--durable-kv-events",
help=(
"KV Router: Enable replica synchronization across multiple router instances. "
"When true, routers will publish and subscribe to events to maintain "
"consistent state."
),
)
add_negatable_bool_argument(
g,
flag_name="--router-track-active-blocks",
env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
default=True,
help="KV Router: Track active blocks for load balancing. Use --no-router-track-active-blocks to disable",
dest="router_track_active_blocks",
help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing."
),
obsolete_flag="--track-active-blocks",
)
add_negatable_bool_argument(
g,
flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False,
dest="router_track_output_blocks",
help=(
"KV Router: Track output blocks during generation. When enabled, the router adds "
"placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected output sequence length."
),
obsolete_flag="--track-output-blocks",
)
add_negatable_bool_argument(
g,
flag_name="--router-assume-kv-reuse",
env_var="DYN_ROUTER_ASSUME_KV_REUSE",
default=True,
help="KV Router: When tracking active blocks, assume KV cache reuse. Use --no-router-assume-kv-reuse to use random hashes, useful when KV cache reuse is not expected.",
dest="router_assume_kv_reuse",
help=(
"KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-router-assume-kv-reuse to generate random hashes instead "
"(when KV cache reuse is not expected)."
),
obsolete_flag="--assume-kv-reuse",
)
add_argument(
g,
flag_name="--router-snapshot-threshold",
env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
default=1000000,
help="KV Router: Number of messages in stream before triggering a snapshot.",
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
flag_name="--router-reset-states",
env_var="DYN_ROUTER_RESET_STATES",
default=False,
help="KV Router: Track output blocks during generation. When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward expected output sequence length (agent_hints.osl in nvext).",
obsolete_flag="--track-output-blocks",
help=(
"KV Router: Reset router state on startup, purging stream and object store. "
"WARNING: This can affect existing router replicas."
),
)
add_argument(
g,
flag_name="--router-ttl-secs",
env_var="DYN_ROUTER_TTL_SECS",
default=120.0,
help="KV Router: TTL for blocks in seconds. Only used when --no-router-kv-events is set. Controls how long cached blocks are considered valid without explicit events.",
help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float,
)
add_argument(
g,
flag_name="--router-max-tree-size",
env_var="DYN_ROUTER_MAX_TREE_SIZE",
default=2**20,
help="KV Router: Maximum tree size before pruning. Only used when --no-router-kv-events is set. When the indexer tree exceeds this size, pruning is triggered.",
help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=int,
)
add_argument(
g,
flag_name="--router-prune-target-ratio",
env_var="DYN_ROUTER_PRUNE_TARGET_RATIO",
default=0.8,
help="KV Router: Target size ratio after pruning (0.0-1.0). Only used when --no-router-kv-events is set. Determines how aggressively to prune the tree.",
help=(
"KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float,
)
add_argument(
g,
flag_name="--router-queue-threshold",
env_var="DYN_ROUTER_QUEUE_THRESHOLD",
default=None,
help=(
"KV Router: Queue threshold fraction for prefill token capacity. "
"When set, requests are queued if all workers exceed this fraction of "
"max_num_batched_tokens. Must be > 0. If not set, queueing is disabled."
),
arg_type=float,
)
add_argument(
g,
flag_name="--router-event-threads",
env_var="DYN_ROUTER_EVENT_THREADS",
default=4,
help="KV Router: Number of event processing threads. >1 uses concurrent radix tree and thread pool for higher throughput. Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning).",
help=(
"KV Router: Number of event processing threads. When > 1, uses a concurrent "
"radix tree with a thread pool for higher throughput. Ignored when "
"--no-router-kv-events is set."
),
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--enable-cache-control",
env_var="DYN_ENABLE_CACHE_CONTROL",
default=False,
dest="router_enable_cache_control",
help=(
"KV Router: Enable cache control (PIN with TTL). When set, the router creates "
"a cache_control service mesh client and fires pin_prefix after generation for "
"requests with nvext.cache_control."
),
)
......@@ -8,7 +8,10 @@ from typing import Any, Dict, Optional
from dynamo.common.config_dump import register_encoder
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.config_base import ConfigBase
from dynamo.common.configuration.groups.kv_router_args import (
KvRouterArgGroup,
KvRouterConfigBase,
)
from dynamo.common.configuration.utils import (
add_argument,
add_negatable_bool_argument,
......@@ -36,7 +39,7 @@ def validate_model_path(value: str) -> str:
return value
class FrontendConfig(ConfigBase):
class FrontendConfig(KvRouterConfigBase):
"""Configuration for the Dynamo frontend."""
interactive: bool
......@@ -47,24 +50,8 @@ class FrontendConfig(ConfigBase):
tls_key_path: Optional[pathlib.Path]
router_mode: str
kv_overlap_score_weight: float
router_temperature: float
use_kv_events: bool
router_ttl: float
router_max_tree_size: int
router_prune_target_ratio: float
namespace: Optional[str] = None
namespace_prefix: Optional[str] = None
router_replica_sync: bool
router_snapshot_threshold: int
router_reset_states: bool
durable_kv_events: bool
router_track_active_blocks: bool
router_assume_kv_reuse: bool
router_track_output_blocks: bool
router_event_threads: int
router_queue_threshold: Optional[float]
router_enable_cache_control: bool
decode_fallback: bool
migration_limit: int
......@@ -186,78 +173,9 @@ class FrontendArgGroup(ArgGroup):
help="How to route the request.",
choices=["round-robin", "random", "kv", "direct"],
)
add_argument(
g,
flag_name="--router-kv-overlap-score-weight",
env_var="DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT",
default=1.0,
help=(
"KV Router: Weight for overlap score in worker selection. "
"Higher values prioritize KV cache reuse."
),
arg_type=float,
dest="kv_overlap_score_weight",
obsolete_flag="--kv-overlap-score-weight",
)
add_argument(
g,
flag_name="--router-temperature",
env_var="DYN_ROUTER_TEMPERATURE",
default=0.0,
help=(
"KV Router: Temperature for worker sampling via softmax. Higher values "
"promote more randomness, and 0 fallbacks to deterministic."
),
arg_type=float,
)
add_negatable_bool_argument(
g,
flag_name="--router-kv-events",
env_var="DYN_ROUTER_USE_KV_EVENTS",
default=True,
help=(
"KV Router: Enable/disable KV events. Use --router-kv-events to enable "
"(default, router receives cache state events from workers) or --no-router-kv-events "
"to disable (router predicts cache state based on routing decisions)."
),
dest="use_kv_events",
obsolete_flag="--kv-events",
)
add_argument(
g,
flag_name="--router-ttl-secs",
env_var="DYN_ROUTER_TTL_SECS",
default=120.0,
help=(
"KV Router: Time-to-live in seconds for blocks when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float,
dest="router_ttl",
obsolete_flag="--router-ttl",
)
add_argument(
g,
flag_name="--router-max-tree-size",
env_var="DYN_ROUTER_MAX_TREE_SIZE",
default=2**20,
help=(
"KV Router: Maximum tree size before pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=int,
)
add_argument(
g,
flag_name="--router-prune-target-ratio",
env_var="DYN_ROUTER_PRUNE_TARGET_RATIO",
default=0.8,
help=(
"KV Router: Target size ratio after pruning when KV events are disabled. "
"Only used when --no-router-kv-events is set."
),
arg_type=float,
)
# KV router options (shared with dynamo.router)
KvRouterArgGroup().add_arguments(parser)
add_argument(
g,
......@@ -271,124 +189,6 @@ class FrontendArgGroup(ArgGroup):
),
)
add_negatable_bool_argument(
g,
flag_name="--router-replica-sync",
env_var="DYN_ROUTER_REPLICA_SYNC",
default=False,
help=(
"KV Router: Enable replica synchronization across multiple router instances. "
"When true, routers will publish and subscribe to events to maintain "
"consistent state."
),
)
add_argument(
g,
flag_name="--router-snapshot-threshold",
env_var="DYN_ROUTER_SNAPSHOT_THRESHOLD",
default=1000000,
help=(
"KV Router: Number of messages in stream before triggering a snapshot. "
),
arg_type=int,
)
add_negatable_bool_argument(
g,
flag_name="--router-reset-states",
env_var="DYN_ROUTER_RESET_STATES",
default=False,
help=(
"KV Router: Reset router state on startup, purging stream and object store. "
"By default, states are persisted. WARNING: This can affect existing router "
"replicas."
),
)
add_negatable_bool_argument(
g,
flag_name="--router-durable-kv-events",
env_var="DYN_ROUTER_DURABLE_KV_EVENTS",
default=False,
help=(
"[Deprecated] KV Router: Enable durable KV events using NATS JetStream. "
"This option will be removed in a future release. The event-plane subscriber "
"(local_indexer mode) is now the recommended path."
),
dest="durable_kv_events",
obsolete_flag="--durable-kv-events",
)
add_negatable_bool_argument(
g,
flag_name="--router-track-active-blocks",
env_var="DYN_ROUTER_TRACK_ACTIVE_BLOCKS",
default=True,
dest="router_track_active_blocks",
help=(
"KV Router: Track active blocks (blocks being used for ongoing generation). "
"By default, active blocks are tracked for load balancing. "
),
obsolete_flag="--track-active-blocks",
)
add_negatable_bool_argument(
g,
flag_name="--router-assume-kv-reuse",
env_var="DYN_ROUTER_ASSUME_KV_REUSE",
default=True,
dest="router_assume_kv_reuse",
help=(
"KV Router: When tracking active blocks, assume KV cache reuse. "
"Use --no-router-assume-kv-reuse to generate random hashes instead (when KV cache reuse is not expected)."
),
obsolete_flag="--assume-kv-reuse",
)
add_negatable_bool_argument(
g,
flag_name="--router-track-output-blocks",
env_var="DYN_ROUTER_TRACK_OUTPUT_BLOCKS",
default=False,
dest="router_track_output_blocks",
help=(
"KV Router: Track output blocks during generation. When enabled, the router adds "
"placeholder blocks as tokens are generated and applies fractional decay based on "
"progress toward expected_output_tokens."
),
obsolete_flag="--track-output-blocks",
)
add_argument(
g,
flag_name="--router-event-threads",
env_var="DYN_ROUTER_EVENT_THREADS",
default=4,
help=(
"KV Router: Number of event processing threads. When > 1, uses a concurrent radix tree with a thread pool for higher throughput. "
"Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning)."
),
arg_type=int,
)
add_argument(
g,
flag_name="--router-queue-threshold",
env_var="DYN_ROUTER_QUEUE_THRESHOLD",
default=None,
help=(
"KV Router: Queue threshold fraction for prefill token capacity. "
"When set, requests are queued if all workers exceed this fraction of "
"max_num_batched_tokens. Enables priority scheduling via latency_sensitivity "
"hints. Must be > 0. If not set, queueing is disabled."
),
arg_type=float,
)
add_negatable_bool_argument(
g,
flag_name="--enable-cache-control",
env_var="DYN_ENABLE_CACHE_CONTROL",
default=False,
dest="router_enable_cache_control",
help=(
"KV Router: Enable cache control (PIN with TTL). When set, the router creates "
"a cache_control service mesh client and fires pin_prefix after generation for "
"requests with nvext.cache_control. Requires --router-mode=kv."
),
)
add_negatable_bool_argument(
g,
flag_name="--decode-fallback",
......
......@@ -177,24 +177,7 @@ async def async_main():
if config.router_mode == "kv":
router_mode = RouterMode.KV
kv_router_config = KvRouterConfig(
overlap_score_weight=config.kv_overlap_score_weight,
router_temperature=config.router_temperature,
use_kv_events=config.use_kv_events,
durable_kv_events=config.durable_kv_events,
router_replica_sync=config.router_replica_sync,
router_track_active_blocks=config.router_track_active_blocks,
router_track_output_blocks=config.router_track_output_blocks,
router_assume_kv_reuse=config.router_assume_kv_reuse,
router_snapshot_threshold=config.router_snapshot_threshold,
router_reset_states=config.router_reset_states,
router_ttl_secs=config.router_ttl,
router_max_tree_size=config.router_max_tree_size,
router_prune_target_ratio=config.router_prune_target_ratio,
router_queue_threshold=config.router_queue_threshold,
router_event_threads=config.router_event_threads,
router_enable_cache_control=config.router_enable_cache_control,
)
kv_router_config = KvRouterConfig(**config.kv_router_kwargs())
elif config.router_mode == "random":
router_mode = RouterMode.Random
kv_router_config = None
......
......@@ -19,9 +19,8 @@ from typing import Optional
import uvloop
from dynamo.llm import KvRouter, KvRouterConfig
from dynamo.router.args import build_kv_router_config
from dynamo.router.args import DynamoRouterConfig, build_kv_router_config
from dynamo.router.args import parse_args as parse_router_args
from dynamo.router.backend_args import DynamoRouterConfig
from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -163,10 +162,10 @@ async def worker(runtime: DistributedRuntime):
logger.info("Starting Standalone Router Service")
logger.debug(
f"Configuration: endpoint={config.endpoint}, router_block_size={config.router_block_size}, "
f"overlap_score_weight={config.router_kv_overlap_score_weight}, "
f"overlap_score_weight={config.overlap_score_weight}, "
f"router_temperature={config.router_temperature}, "
f"router_use_kv_events={config.router_use_kv_events}, "
f"router_durable_kv_events={config.router_durable_kv_events}, "
f"use_kv_events={config.use_kv_events}, "
f"durable_kv_events={config.durable_kv_events}, "
f"router_replica_sync={config.router_replica_sync}, "
f"router_reset_states={config.router_reset_states}, "
f"router_track_active_blocks={config.router_track_active_blocks}, "
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Router CLI parsing and config assembly."""
"""Router CLI parsing, config, and assembly for the standalone router."""
import argparse
from dynamo.common.configuration.arg_group import ArgGroup
from dynamo.common.configuration.groups.kv_router_args import (
KvRouterArgGroup,
KvRouterConfigBase,
)
from dynamo.common.configuration.utils import add_argument
from dynamo.llm import KvRouterConfig
from .backend_args import DynamoRouterArgGroup, DynamoRouterConfig
class DynamoRouterConfig(KvRouterConfigBase):
"""Typed configuration for the standalone KV router (router-owned options only)."""
def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
"""Build KvRouterConfig from DynamoRouterConfig.
namespace: str
endpoint: str
router_block_size: int
Maps CLI/config attribute names to KvRouterConfig constructor kwargs.
The only name difference is router_kv_overlap_score_weight -> overlap_score_weight.
"""
return KvRouterConfig(
overlap_score_weight=router_config.router_kv_overlap_score_weight,
router_temperature=router_config.router_temperature,
use_kv_events=router_config.router_use_kv_events,
durable_kv_events=router_config.router_durable_kv_events,
router_replica_sync=router_config.router_replica_sync,
router_track_active_blocks=router_config.router_track_active_blocks,
router_track_output_blocks=router_config.router_track_output_blocks,
router_assume_kv_reuse=router_config.router_assume_kv_reuse,
router_snapshot_threshold=router_config.router_snapshot_threshold,
router_reset_states=router_config.router_reset_states,
router_ttl_secs=router_config.router_ttl_secs,
router_max_tree_size=router_config.router_max_tree_size,
router_prune_target_ratio=router_config.router_prune_target_ratio,
router_event_threads=router_config.router_event_threads,
)
def validate(self) -> None:
    """Validate the worker endpoint and derive ``namespace`` from it.

    The endpoint must consist of exactly three non-empty, dot-separated
    segments (``namespace.component.endpoint``). On success,
    ``self.namespace`` is set to the first segment.

    Raises:
        ValueError: If ``endpoint`` is unset or empty, does not have exactly
            three dot-separated segments, or contains an empty segment.
    """
    if not self.endpoint:
        raise ValueError(
            "endpoint is required (set --endpoint or DYN_ROUTER_ENDPOINT)"
        )

    parts = self.endpoint.split(".")
    # Counting segments alone lets inputs such as "ns..ep" or ".a.b" through
    # and would leave an empty namespace/component/endpoint, so every
    # segment must also be non-empty.
    if len(parts) != 3 or not all(parts):
        raise ValueError(
            f"Invalid endpoint format: {self.endpoint!r}. "
            "Expected format: namespace.component.endpoint"
        )

    self.namespace = parts[0]
class DynamoRouterArgGroup(ArgGroup):
    """Argument group for the options owned by the standalone router.

    Registers ``--endpoint`` and ``--router-block-size`` directly, then
    delegates to ``KvRouterArgGroup`` for the KV router options.
    """

    name = "dynamo-router"

    def add_arguments(self, parser) -> None:
        """Register the standalone-router flags, then the KV router flags.

        Args:
            parser: Parser that receives the "Dynamo Router Options" argument
                group and is then handed to ``KvRouterArgGroup``.
        """
        router_group = parser.add_argument_group("Dynamo Router Options")

        endpoint_help = (
            "Full endpoint path for workers in the format "
            "namespace.component.endpoint (e.g., dynamo.prefill.generate "
            "for prefill workers)"
        )
        add_argument(
            router_group,
            flag_name="--endpoint",
            env_var="DYN_ROUTER_ENDPOINT",
            default=None,
            help=endpoint_help,
            arg_type=str,
        )

        block_size_help = "KV cache block size for routing decisions"
        add_argument(
            router_group,
            flag_name="--router-block-size",
            env_var="DYN_ROUTER_BLOCK_SIZE",
            default=128,
            help=block_size_help,
            arg_type=int,
            obsolete_flag="--block-size",
        )

        # The KV router options are shared with dynamo.frontend, so they are
        # registered by the common group rather than redefined here.
        kv_router_group = KvRouterArgGroup()
        kv_router_group.add_arguments(parser)
def build_kv_router_config(router_config: DynamoRouterConfig) -> KvRouterConfig:
    """Create a ``KvRouterConfig`` from a standalone-router config.

    Args:
        router_config: Config whose ``kv_router_kwargs()`` supplies the
            constructor keyword arguments.

    Returns:
        The ``KvRouterConfig`` built from those keyword arguments.
    """
    kwargs = router_config.kv_router_kwargs()
    return KvRouterConfig(**kwargs)
def parse_args(argv=None) -> DynamoRouterConfig:
......
......@@ -7,7 +7,7 @@ subtitle: Priority scheduling, KV cache eviction policies, and cache pinning for
# SGLang for Agentic Workloads
This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/router/agent-hints.md) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations.
This guide covers SGLang-specific configuration for agentic serving with Dynamo. It explains which SGLang engine flags to enable, how Dynamo's [agent hints](../../components/frontend/nvext.md#agent-hints) map to SGLang behavior, and how to use experimental cache pinning to protect KV cache for high-value conversations.
## Overview
......@@ -301,7 +301,6 @@ A high `cached_tokens / prompt_tokens` ratio on subsequent turns confirms that t
## See Also
- **[Agent Hints](../../components/router/agent-hints.md)**: Per-request hint reference
- **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference
- **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference including agent hints
- **[Router Guide](../../components/router/router-guide.md)**: Router configuration and CLI arguments
- **[SGLang HiCache](../../integrations/sglang-hicache.md)**: Enabling hierarchical KV cache
......@@ -783,7 +783,7 @@ VllmPrefillWorker:
## Conclusion
This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit.
This guide provides a complete methodology for A/B testing Dynamo's KV Smart Router. The KV router's effectiveness depends heavily on workload characteristics—datasets with high prefix overlap will show the most benefit. For further details on tuning the KV router, see the [Tuning Guidelines](../components/router/router-guide.md#tuning-guidelines).
For questions or issues, consult the [Dynamo documentation](https://github.com/ai-dynamo/dynamo) or open an issue on GitHub.
......
......@@ -64,7 +64,7 @@ The `agent_hints` sub-object carries per-request hints that the router uses for
### `latency_sensitivity`
When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. Has no effect when queueing is disabled.
When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. A recommended default is `1.2` for latency-sensitive agentic requests. Has no effect when queueing is disabled.
```json
{
......@@ -195,3 +195,4 @@ When the client requests response metadata via `extra_fields`, the response incl
|----------|-------------|
| [Frontend Guide](frontend-guide.md) | KServe gRPC configuration and integration |
| [Router Guide](../router/router-guide.md) | Full router configuration and CLI arguments |
| [SGLang for Agentic Workloads](../../backends/sglang/agents.md) | SGLang engine flags for priority scheduling, eviction policies, and cache pinning |
......@@ -8,75 +8,22 @@ The Dynamo KV Router intelligently routes requests by evaluating their computati
## Quick Start
### Python / CLI Deployment
To launch the Dynamo frontend with the KV Router:
```bash
python -m dynamo.frontend --router-mode kv --http-port 8000
```
This command:
- Launches the Dynamo frontend service with KV routing enabled
- Exposes the service on port 8000 (configurable)
- Automatically handles all backend workers registered to the Dynamo endpoint
Backend workers register themselves using the `register_model` API, after which the KV Router automatically tracks worker state and makes routing decisions based on KV cache overlap.
#### CLI Arguments
For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automatically report KV cache events — no worker-side configuration changes needed.
| Argument | Default | Description |
|----------|---------|-------------|
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-temperature <float>` | `0.0` | Controls routing randomness (0.0 = deterministic, higher = more random) |
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
For all available options: `python -m dynamo.frontend --help`
### Kubernetes Deployment
To enable the KV Router in Kubernetes, add the `DYN_ROUTER_MODE` environment variable to your frontend service:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: my-deployment
spec:
services:
Frontend:
dynamoNamespace: my-namespace
componentType: frontend
replicas: 1
envs:
- name: DYN_ROUTER_MODE
value: kv # Enable KV Smart Router
```
**Key Points:**
- Set `DYN_ROUTER_MODE=kv` on the **Frontend** service only
- Workers automatically report KV cache events to the router
- No worker-side configuration changes needed
#### Environment Variables
All CLI arguments can be configured via environment variables using the `DYN_` prefix:
| CLI Argument | Environment Variable | Default |
|--------------|---------------------|---------|
| `--router-mode kv` | `DYN_ROUTER_MODE=kv` | `round_robin` |
| `--router-temperature` | `DYN_ROUTER_TEMPERATURE` | `0.0` |
| `--kv-cache-block-size` | `DYN_KV_CACHE_BLOCK_SIZE` | Backend-specific |
| `--no-router-kv-events` | `DYN_ROUTER_USE_KV_EVENTS=false` | `true` |
| `--router-kv-overlap-score-weight` | `DYN_ROUTER_KV_OVERLAP_SCORE_WEIGHT` | `1.0` |
For complete K8s examples and advanced configuration, see [K8s Examples](router-examples.md#k8s-examples).
For A/B testing and advanced K8s setup, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
| `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) |
| `--router-queue-threshold` | disabled | Enable backpressure queue under high concurrency; also enables priority scheduling via `nvext.agent_hints.latency_sensitivity` |
For more configuration options and tuning guidelines, see the [Router Guide](router-guide.md).
For all CLI arguments, environment variables, K8s deployment examples, and tuning guidelines, see the [Router Guide](router-guide.md). For A/B benchmarking, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
## Prerequisites and Limitations
......@@ -99,4 +46,5 @@ For basic model registration without KV routing, use `--router-mode round-robin`
- **[Router Guide](router-guide.md)**: Deep dive into KV cache routing, configuration, disaggregated serving, and tuning
- **[Router Examples](router-examples.md)**: Python API usage, K8s examples, and custom routing patterns
- **[Standalone Indexer](standalone-indexer.md)**: Run the KV indexer as a separate service for independent scaling
- **[Router Design](../../design-docs/router-design.md)**: Architecture details, algorithms, and event transport modes
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: Agent Hints
subtitle: Per-request hints for scheduling, load balancing, and KV cache optimization
---
Agent hints are optional per-request hints passed via the `nvext.agent_hints` field in the request body. They allow the calling agent or application to communicate request-level metadata that the router uses to improve scheduling, load balancing, and KV cache utilization.
```json
{
"nvext": {
"agent_hints": {
"latency_sensitivity": 5.0,
"osl": 512,
"speculative_prefill": true
}
}
}
```
All three fields are optional and independent — you can use any combination.
## `latency_sensitivity`
Priority scheduling hint, specified in seconds. When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. Has no effect when queueing is disabled.
- **Type**: `f64` (optional)
- **Recommended default**: `1.2` for latency-sensitive agentic requests
- **Requires**: `--router-queue-threshold` to be set
### Example
```json
{
"nvext": {
"agent_hints": {
"latency_sensitivity": 5.0
}
}
}
```
A request with `latency_sensitivity: 5.0` arriving at time `T` is treated as if it arrived at `T - 5s`, so it will be scheduled ahead of requests that arrived within the last 5 seconds (unless they have even higher sensitivity).
## `osl`
Expected output sequence length — the estimated number of output tokens the request will generate. The router uses this hint in two ways:
1. **Output block tracking**: When output block tracking is enabled (`--router-track-output-blocks`), the router adds placeholder blocks during generation and applies fractional decay based on progress toward `osl`. This gives the router a more accurate picture of each worker's KV cache utilization for long-running requests.
2. **Resource estimation**: Helps the router estimate total resource requirements when making routing decisions.
- **Type**: `u32` (optional)
- **Requires**: `--router-track-output-blocks` (frontend or standalone router) for output block tracking behavior
### Example
```json
{
"nvext": {
"agent_hints": {
"osl": 1024
}
}
}
```
If the request is expected to generate ~1024 tokens, providing `osl: 1024` lets the router account for the output-side KV cache growth when balancing load across workers.
## `speculative_prefill`
When set to `true`, the system speculatively prefills the predicted next-turn prompt after the current assistant turn completes. This is designed for multi-turn agentic workloads where the next request's prefix is predictable.
- **Type**: `bool` (optional, defaults to `false`)
- **No additional CLI flags required**; works automatically when the hint is set in the request
### How it works
1. As the assistant response streams, the system accumulates the full response text.
2. Once the response finishes (indicated by `finish_reason`), a background task constructs the next-turn prompt by appending the assistant response to the conversation history (with thinking content stripped by the chat template for non-last assistant turns).
3. The constructed prompt is tokenized and sent through the pipeline as a `max_tokens=1` request to warm the KV cache on a worker.
4. When the actual next request arrives, it benefits from the already-warm KV cache, reducing TTFT.
### Example
```json
{
"nvext": {
"agent_hints": {
"speculative_prefill": true
}
}
}
```
This is most effective for reasoning models in agentic loops, where the conversation grows incrementally and the next turn's prefix (everything up to the new user message) is the same as the current conversation.
## See Also
- **[SGLang for Agentic Workloads](../../backends/sglang/agents.md)**: SGLang engine flags for priority scheduling, eviction policies, and cache pinning
- **[NVIDIA Request Extensions (nvext)](../frontend/nvext.md)**: Full `nvext` field reference including `cache_control`
- **[Router Guide](router-guide.md)**: Full router configuration and CLI arguments
- **[Router Examples](router-examples.md)**: Usage patterns and benchmarking
......@@ -40,7 +40,7 @@ Backend workers register themselves using the `register_model` API, after which
For all available options: `python -m dynamo.frontend --help`
For detailed configuration options and tuning parameters, see [Using the KV Cache Router](#using-the-kv-cache-router).
For detailed configuration options and tuning parameters, see [Advanced Router Usage](#advanced-router-usage).
### Kubernetes Deployment
......@@ -140,130 +140,87 @@ When KV blocks are created or removed, the engine notifies the Dynamo router, wh
To evaluate the benefits of KV-aware routing, compare your workload's performance using `--router-mode random|round-robin` against KV-aware routing.
The main KV-aware routing arguments (frontend uses the same `--router-*` flag names as the standalone router; legacy names without the prefix are obsolete):
- `--router-kv-overlap-score-weight`: Controls the importance of prefix cache overlaps in prefill cost calculations. Higher values improve Time To First Token (TTFT) at the cost of Inter-Token Latency (ITL). When set to 0, the router ignores prefix caches and uses pure load balancing. Defaults to 1.
- `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.
- `--no-router-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events).
- `--router-durable-kv-events`: **(Deprecated — will be removed in a future release.)** Enables JetStream mode for KV event transport. The event-plane subscriber (local_indexer mode) is now the recommended path. When enabled, workers publish to JetStream instead of the local indexer, and the frontend consumes from JetStream as a durable consumer. Without this flag (default), workers use the local indexer with NATS Core or ZMQ event plane.
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly.
- `--router-reset-states`: Only applies in JetStream mode (`--router-durable-kv-events`). When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate.
- `--router-snapshot-threshold`: Only applies in JetStream mode (`--router-durable-kv-events`). Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart.
- `--no-router-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management.
- `--router-track-output-blocks`: Enables tracking of output blocks during generation (default: disabled). When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward the expected output sequence length (`agent_hints.osl` in nvext). This improves load balancing accuracy for long-running generation requests by accounting for output-side KV cache growth.
- `--no-router-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist.
For detailed CLI arguments and advanced configuration options, see [Advanced Router Usage](#advanced-router-usage).
- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity. When set, the router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `latency_sensitivity` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. If not set (default), queueing is disabled and requests are dispatched immediately.
### Basic Routing
- `--active-decode-blocks-threshold`: Initial threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache block utilization. When a worker's KV cache active blocks exceed this percentage of total blocks, it will be marked as busy and excluded from routing. If not set, blocks-based busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines publish load metrics. The threshold can be dynamically updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)).
Dynamo supports several routing strategies when sending requests from one component to another component's endpoint.
- `--active-prefill-tokens-threshold`: Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled.
First, we must create a client tied to a component's endpoint; we can do this using the labels defined above. Here we are getting a client tied to the `generate` endpoint of the `VllmWorker` component.
- `--active-prefill-tokens-threshold-frac`: Fraction of `max_num_batched_tokens` for busy detection. A worker is marked busy when `active_prefill_tokens > frac * max_num_batched_tokens`. Uses OR logic with `--active-prefill-tokens-threshold` (worker is busy if either threshold is exceeded). If not set, fractional busy detection is disabled.
```python
client = runtime.endpoint("dynamo.VllmWorker.generate").client()
```
- `--router-ttl-secs`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-router-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
We can then use the default routing methods exposed by the client class to send requests to the `VllmWorker` component.
- `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1048576 (2^20 blocks) when `--no-router-kv-events` is used. This prevents unbounded memory growth in long-running deployments.
- **Random routing**: Default strategy, available via `client.generate()` or `client.random()`
- **Round-robin routing**: Cycles through available workers via `client.round_robin()`
- **Direct routing**: Explicitly targets a specific worker via `client.direct(input, component_id)`
- `--router-prune-target-ratio`: Target size ratio to prune down to when `--router-max-tree-size` is exceeded. For example, with a value of 0.8 (default) and max tree size of 1048576, the router will prune down to approximately 838860 blocks when the threshold is exceeded. Defaults to 0.8 when `--no-router-kv-events` is used. This creates headroom before the next pruning cycle.
KV Cache routing uses direct routing with a special worker selection algorithm.
- `--router-event-threads`: Number of event processing threads for the KV indexer (default: 4). When set to 1, the router uses a single-threaded radix tree with channel-based event processing. When set to a value greater than 1 (the default), the router uses a concurrent radix tree with a thread pool of the specified size for higher event throughput. This setting only applies when KV events are enabled (the default). When `--no-router-kv-events` is set (approximate mode), the router always uses a single-threaded indexer with TTL-based expiration and pruning regardless of this setting. Can be set via `DYN_ROUTER_EVENT_THREADS` env var. For details on the underlying index data structures (`RadixTree`, `ConcurrentRadixTree`, `PositionalIndexer`) and their concurrency model (inline reads, sticky-routed writes via thread pool), see the [KV Router Index documentation](../../../lib/kv-router/README.md).
For benchmarking KV router performance, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
<Note>
For custom routing logic and advanced patterns, see [Routing Patterns](router-examples.md#routing-patterns) in the examples documentation.
**State persistence** depends on the event transport mode:
- **NATS Core / Event Plane mode** (default): State persists on workers—router rebuilds state by querying workers on startup. This is the default when workers have `local_indexer` enabled (which is the default). Works with both NATS Core and ZMQ event planes.
- **JetStream mode** (`--router-durable-kv-events` on **both** frontend **and** workers): State persists across router restarts via JetStream and NATS object store snapshots.
- **No KV events** (`--no-router-kv-events`): State persistence is not supported.
## Advanced Router Usage
**Request plane is independent of KV event transport.**
The request plane (`DYN_REQUEST_PLANE` / `--request-plane`) controls how requests reach workers (TCP/HTTP/NATS), while KV events travel over a separate path. KV events use NATS in JetStream or NATS Core modes, or ZMQ when `--event-plane zmq` is set. With `--event-plane zmq` and `--discovery-backend file` or `mem`, the router can run entirely without etcd or NATS. When using a NATS-based event plane (the default), NATS is initialized automatically; set `NATS_SERVER=nats://...` to override the default `localhost:4222`. Use `--no-router-kv-events` to disable KV event transport entirely.
The main KV-aware routing arguments (frontend uses the same `--router-*` flag names as the standalone router; legacy names without the prefix are obsolete):
When `--router-kv-overlap-score-weight` is set to 0, no KVIndexer is created and prefix matching is disabled (pure load balancing). When `--no-router-kv-events` is set, a KVIndexer is still created but no event subscriber is launched to consume KV events from workers. Instead, the router predicts cache state based on its own routing decisions with TTL-based expiration and pruning.
### Routing Behavior
**Backend Configuration:** When using `--no-router-kv-events`, no additional backend flags are needed — SGLang and TRT-LLM disable KV events by default. For vLLM, KV events are currently enabled by default when prefix caching is active (deprecated — will change in a future release). Use `--kv-events-config` explicitly to control behavior:
- **vLLM**: Use `--kv-events-config '{"enable_kv_cache_events": false}'` to disable, or omit (auto-enabled, deprecated)
- **SGLang**: Do not use `--kv-events-config`
- **TRT-LLM**: Do not use `--publish-events-and-metrics`
- `--router-kv-overlap-score-weight`: Controls the importance of prefix cache overlaps in prefill cost calculations. Higher values improve Time To First Token (TTFT) at the cost of Inter-Token Latency (ITL). When set to 0, the router ignores prefix caches and uses pure load balancing. Defaults to 1.
The CLI arguments `--router-ttl-secs`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. When workers are configured to publish KV events (via `--kv-events-config`), the router relies on worker-side eviction events and these parameters are ignored.
- `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.
**Queue threshold vs. busy rejection thresholds:** `--router-queue-threshold` and the busy thresholds (`--active-decode-blocks-threshold`, `--active-prefill-tokens-threshold`, `--active-prefill-tokens-threshold-frac`) serve different purposes. The busy thresholds **reject** a worker entirely from the candidate set when it exceeds a utilization limit — no traffic is sent until it drops below the threshold. In contrast, `--router-queue-threshold` does not reject workers; it **defers the entire routing decision** until at least one worker has capacity, so the request is routed with the freshest load metrics. The queue also enables priority scheduling via `nvext.agent_hints.latency_sensitivity`.
- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity. When set, the router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `latency_sensitivity` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. If not set (default), queueing is disabled and requests are dispatched immediately.
</Note>
### KV Event Transport and Persistence
To implement KV event publishing for custom inference engines, enabling them to participate in Dynamo's KV cache-aware routing, see [KV Event Publishing for Custom Engines](../../integrations/kv-events-custom-engines.md).
- `--no-router-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events).
For details on per-request agent hints (`latency_sensitivity`, `osl`, `speculative_prefill`), see the [Agent Hints Guide](agent-hints.md).
- `--router-durable-kv-events`: **(Deprecated — will be removed in a future release.)** Enables JetStream mode for KV event transport. The event-plane subscriber (local_indexer mode) is now the recommended path. When enabled, workers publish to JetStream instead of the local indexer, and the frontend consumes from JetStream as a durable consumer. Without this flag (default), workers use the local indexer with NATS Core or ZMQ event plane.
## Basic Routing
- `--router-reset-states`: Only applies in JetStream mode (`--router-durable-kv-events`). When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate.
Dynamo supports several routing strategies when sending requests from one component to another component's endpoint.
- `--router-snapshot-threshold`: Only applies in JetStream mode (`--router-durable-kv-events`). Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart.
First, we must create a client tied to a component's endpoint; we can do this using the labels defined above. Here we are getting a client tied to the `generate` endpoint of the `VllmWorker` component.
### Block Tracking
```python
client = runtime.endpoint("dynamo.VllmWorker.generate").client()
```
- `--no-router-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management.
We can then use the default routing methods exposed by the client class to send requests to the `VllmWorker` component.
- `--router-track-output-blocks`: Enables tracking of output blocks during generation (default: disabled). When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward the expected output sequence length (`agent_hints.osl` in nvext). This improves load balancing accuracy for long-running generation requests by accounting for output-side KV cache growth.
- **Random routing**: Default strategy, available via `client.generate()` or `client.random()`
- **Round-robin routing**: Cycles through available workers via `client.round_robin()`
- **Direct routing**: Explicitly targets a specific worker via `client.direct(input, component_id)`
- `--no-router-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist.
KV Cache routing uses direct routing with a special worker selection algorithm.
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly.
For benchmarking KV router performance, see the [KV Router A/B Benchmarking Guide](../../benchmarks/kv-router-ab-testing.md).
### KV Indexer / Approx KV Indexer
For custom routing logic and advanced patterns, see [Routing Patterns](router-examples.md#routing-patterns) in the examples documentation.
- `--router-ttl-secs`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-router-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
## Tuning Guidelines
- `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1048576 (2^20 blocks) when `--no-router-kv-events` is used. This prevents unbounded memory growth in long-running deployments.
### 1. Understand Your Workload Characteristics
- `--router-prune-target-ratio`: Target size ratio to prune down to when `--router-max-tree-size` is exceeded. For example, with a value of 0.8 (default) and max tree size of 1048576, the router will prune down to approximately 838860 blocks when the threshold is exceeded. Defaults to 0.8 when `--no-router-kv-events` is used. This creates headroom before the next pruning cycle.
- **Prefill-heavy workloads** (long prompts, short generations): Increase `--router-kv-overlap-score-weight`
- **Decode-heavy workloads** (short prompts, long generations): Decrease `--router-kv-overlap-score-weight`
- `--router-event-threads`: Number of event processing threads for the KV indexer (default: 4). When set to 1, the router uses a single-threaded radix tree with channel-based event processing. When set to a value greater than 1 (the default), the router uses a concurrent radix tree with a thread pool of the specified size for higher event throughput. This setting only applies when KV events are enabled (the default). When `--no-router-kv-events` is set (approximate mode), the router always uses a single-threaded indexer with TTL-based expiration and pruning regardless of this setting. Can be set via `DYN_ROUTER_EVENT_THREADS` env var. For details on the underlying index data structures (`RadixTree`, `ConcurrentRadixTree`, `PositionalIndexer`) and their concurrency model (inline reads, sticky-routed writes via thread pool), see the [KV Router Index documentation](../../../lib/kv-router/README.md).
### 2. Monitor Key Metrics
To implement KV event publishing for custom inference engines, enabling them to participate in Dynamo's KV cache-aware routing, see [KV Event Publishing for Custom Engines](../../integrations/kv-events-custom-engines.md).
The router logs the cost calculation for each worker:
```text
Formula for worker_1: 125.3 = 1.0 * 100.5 + 25.0 (cached_blocks: 15)
```
For details on per-request agent hints (`latency_sensitivity`, `osl`, `speculative_prefill`), see the [NVIDIA Request Extensions (`nvext`)](../frontend/nvext.md#agent-hints) documentation.
This shows:
- Total cost (125.3)
- Overlap weight × prefill blocks (1.0 × 100.5)
- Active blocks (25.0)
- Cached blocks that contribute to overlap (15)
### Tuning Guidelines
### 3. Temperature-Based Routing
The `--router-kv-overlap-score-weight` parameter is the primary knob for balancing prefill efficiency against decode load. Prefill-heavy workloads (long prompts, short generations) benefit from a higher weight, which steers requests toward workers with better cache overlap and reduces TTFT. Decode-heavy workloads (short prompts, long generations) benefit from a lower weight, which distributes decode load more evenly and reduces ITL. The default of 1.0 is a reasonable starting point; monitor TTFT and ITL metrics and adjust from there. This weight can also be overridden per request via `nvext.agent_hints.kv_overlap_score_weight`, which is useful when different request types in the same deployment have different latency profiles.
The `router_temperature` parameter controls routing randomness:
- **0.0 (default)**: Deterministic selection of the best worker
- **> 0.0**: Probabilistic selection, higher values increase randomness
- Useful for preventing worker saturation and improving load distribution
Use `--no-router-kv-events` when you are not confident that your backend engine emits KV events correctly — for example, with hybrid models or custom engines that haven't been validated for event accuracy. In this mode the router falls back to approximate routing, predicting cache state from its own routing decisions with TTL-based expiration and pruning, rather than relying on real-time block creation/deletion events from workers.
### 4. Iterative Optimization
Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worker does not reuse transferred KV cache blocks. By default the router assumes KV blocks transferred from prefill to decode will be deduplicated on the decode side, but vLLM and SGLang decode workers currently do not support this — only TensorRT-LLM does. Without this flag, the router undercounts decode blocks when duplicates exist, leading to inaccurate load estimates.
1. Begin with default settings
2. Monitor TTFT and ITL metrics
3. Adjust `--router-kv-overlap-score-weight` to meet your performance goals:
- To reduce TTFT: Increase the weight
- To reduce ITL: Decrease the weight
4. If you observe severe load imbalance, increase the temperature setting
Set `--router-queue-threshold` (e.g. `1.5`) to enable backpressure under very high concurrency workloads. When set, the router holds incoming requests in a priority queue while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.latency_sensitivity`.
## Prometheus Metrics
### Prometheus Metrics
The router exposes Prometheus metrics on the frontend's HTTP port (default 8000) at `/metrics`:
......@@ -407,39 +364,26 @@ If you need to start with a fresh state in JetStream mode, you have two options:
</Note>
## Dynamic Threshold Configuration
## Additional Notes
Dynamic threshold configuration allows you to adjust worker busy thresholds at runtime without restarting the frontend, enabling real-time tuning of load balancing behavior based on observed system performance.
**State persistence** depends on the event transport mode:
- **NATS Core / Event Plane mode** (default): State persists on workers—router rebuilds state by querying workers on startup. This is the default when workers have `local_indexer` enabled (which is the default). Works with both NATS Core and ZMQ event planes.
- **JetStream mode** (`--router-durable-kv-events` on **both** frontend **and** workers): State persists across router restarts via JetStream and NATS object store snapshots.
- **No KV events** (`--no-router-kv-events`): State persistence is not supported.
The busy thresholds can be updated at runtime without restarting the frontend. The frontend exposes HTTP endpoints at `/busy_threshold`:
**Request plane is independent of KV event transport.**
The request plane (`DYN_REQUEST_PLANE` / `--request-plane`) controls how requests reach workers (TCP/HTTP/NATS), while KV events travel over a separate path. KV events use NATS in JetStream or NATS Core modes, or ZMQ when `--event-plane zmq` is set. With `--event-plane zmq` and `--discovery-backend file` or `mem`, the router can run entirely without etcd or NATS. When using a NATS-based event plane (the default), NATS is initialized automatically; set `NATS_SERVER=nats://...` to override the default `localhost:4222`. Use `--no-router-kv-events` to disable KV event transport entirely.
**Get or set a model's thresholds (POST):**
```bash
# Set both thresholds for a model
curl -X POST http://localhost:8000/busy_threshold \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}'
# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}
# Set only active decode blocks threshold
curl -X POST http://localhost:8000/busy_threshold \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85}'
# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": <current_value>}
# Get current thresholds (omit threshold fields)
curl -X POST http://localhost:8000/busy_threshold \
-H "Content-Type: application/json" \
-d '{"model": "meta-llama/Llama-2-7b-hf"}'
# Response: {"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}
# Or if not configured: {"model": "...", "active_decode_blocks_threshold": null, "active_prefill_tokens_threshold": null}
```
When `--router-kv-overlap-score-weight` is set to 0, no KVIndexer is created and prefix matching is disabled (pure load balancing). When `--no-router-kv-events` is set, a KVIndexer is still created but no event subscriber is launched to consume KV events from workers. Instead, the router predicts cache state based on its own routing decisions with TTL-based expiration and pruning.
**List all configured thresholds (GET):**
```bash
curl http://localhost:8000/busy_threshold
# Response: {"thresholds": [{"model": "meta-llama/Llama-2-7b-hf", "active_decode_blocks_threshold": 0.85, "active_prefill_tokens_threshold": 1000}]}
```
**Backend KV event publishing** is independent of the frontend's `--no-router-kv-events` flag. The frontend flag controls whether the router *consumes* events; the backend flags control whether workers *publish* them. If the router is not consuming events, workers that still publish will waste resources but cause no harm. By default, SGLang and TRT-LLM do not publish KV events. vLLM auto-enables publishing when prefix caching is active (deprecated — will default to off in a future release). To explicitly control publishing:
- **vLLM**: Pass `--kv-events-config '{"enable_kv_cache_events": false}'` to disable, or `'{"enable_kv_cache_events": true, "publisher": "zmq", "endpoint": "tcp://*:5557"}'` to enable.
- **SGLang**: Pass `--kv-events-config` with a JSON config to enable; omit to keep disabled.
- **TRT-LLM**: Pass `--publish-events-and-metrics` to enable; omit to keep disabled.
The CLI arguments `--router-ttl-secs`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. When workers are configured to publish KV events (via `--kv-events-config`), the router relies on worker-side eviction events and these parameters are ignored.
**Queue threshold vs. busy rejection thresholds:** `--router-queue-threshold` and the busy thresholds (`--active-decode-blocks-threshold`, `--active-prefill-tokens-threshold`, `--active-prefill-tokens-threshold-frac`) serve different purposes. The busy thresholds **reject** a worker entirely from the candidate set when it exceeds a utilization limit — no traffic is sent until it drops below the threshold. In contrast, `--router-queue-threshold` does not reject workers; it **defers the entire routing decision** until at least one worker has capacity, so the request is routed with the freshest load metrics. The queue also enables priority scheduling via `nvext.agent_hints.latency_sensitivity`. The busy thresholds can be updated at runtime without restarting the frontend via the `/busy_threshold` HTTP endpoint. For details on busy detection, threshold tuning, and the runtime API, see [Request Rejection](../../fault-tolerance/request-rejection.md).
## See Also
......
......@@ -101,4 +101,4 @@ request arrives.
4. The KV router routes the speculative request to the same worker, warming its cache.
5. When the real next-turn request arrives, the KV router sees high cache overlap on that worker and routes there, yielding a much lower TTFT.
See also: [Agent Hints documentation](../../../../docs/components/router/agent-hints.md)
See also: [Agent Hints documentation](../../../../docs/components/frontend/nvext.md#agent-hints)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment