help="KV Router: Enable/disable KV events. Use --kv-events to enable (default, router receives cache state events from workers) or --no-kv-events to disable (router predicts cache state based on routing decisions).",
help="KV Router: Time-to-live in seconds for blocks when KV events are disabled. Only used when --no-kv-events is set. Can be set via DYN_ROUTER_TTL env var (default: 120.0).",
help="KV Router: Maximum tree size before pruning when KV events are disabled. Only used when --no-kv-events is set. Can be set via DYN_ROUTER_MAX_TREE_SIZE env var (default: 1048576, which is 2^20).",
help="KV Router: Target size ratio after pruning when KV events are disabled. Only used when --no-kv-events is set. Can be set via DYN_ROUTER_PRUNE_TARGET_RATIO env var (default: 0.8).",
)
# --namespace: optional string scoping model discovery. The default is read
# from the environment variable named by DYN_NAMESPACE_ENV_VAR and is None
# when that variable is unset.
parser.add_argument(
    "--namespace",
    help=(
        "Dynamo namespace for model discovery scoping. "
        "If specified, models will only be discovered from this namespace. "
        "If not specified, discovers models from all namespaces (global discovery)."
    ),
    default=os.environ.get(DYN_NAMESPACE_ENV_VAR),
    type=str,
)
# --router-replica-sync: boolean switch, False unless the flag is passed.
parser.add_argument(
    "--router-replica-sync",
    help=(
        "KV Router: Enable replica synchronization across multiple router instances. "
        "When true, routers will publish and subscribe to events to maintain consistent state."
    ),
    default=False,
    action="store_true",
)
# --router-snapshot-threshold: integer message count, one million by default.
parser.add_argument(
    "--router-snapshot-threshold",
    help=(
        "KV Router: Number of messages in stream before triggering a snapshot. "
        "Defaults to 1000000."
    ),
    default=1_000_000,
    type=int,
)
# --router-reset-states: boolean switch stored as `router_reset_states`;
# False unless the flag is passed.
parser.add_argument(
    "--router-reset-states",
    help=(
        "KV Router: Reset router state on startup, purging stream and object store. "
        "By default, states are persisted. "
        "WARNING: This can affect existing router replicas."
    ),
    dest="router_reset_states",
    default=False,
    action="store_true",
)
# --durable-kv-events: boolean switch stored as `durable_kv_events`;
# False unless the flag is passed.
parser.add_argument(
    "--durable-kv-events",
    help=(
        "KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. "
        "By default, the router uses the generic event plane (NATS Core or ZMQ) with local_indexer mode. "
        "Use this flag when you need durability and multi-replica consistency. "
        "Requires NATS with JetStream enabled."
    ),
    dest="durable_kv_events",
    default=False,
    action="store_true",
)
# --no-track-active-blocks: negative switch. The stored attribute is
# `router_track_active_blocks`, which is True by default and becomes False
# when the flag is passed.
parser.add_argument(
    "--no-track-active-blocks",
    help=(
        "KV Router: Disable tracking of active blocks (blocks being used for ongoing generation). "
        "By default, active blocks are tracked for load balancing."
    ),
    dest="router_track_active_blocks",
    default=True,
    action="store_false",
)
# --no-assume-kv-reuse: negative switch. The stored attribute is
# `router_assume_kv_reuse`, which is True by default and becomes False when
# the flag is passed.
parser.add_argument(
    "--no-assume-kv-reuse",
    help=(
        "KV Router: When tracking active blocks, do not assume KV cache reuse (generate random hashes instead of computing actual block hashes). "
        "Useful when KV cache reuse is not expected. "
        "By default, KV cache reuse is assumed."
    ),
    dest="router_assume_kv_reuse",
    default=True,
    action="store_false",
)
# --track-output-blocks: boolean switch stored as `router_track_output_blocks`;
# False unless the flag is passed.
parser.add_argument(
    "--track-output-blocks",
    action="store_true",
    dest="router_track_output_blocks",
    default=False,
    help=(
        "KV Router: Track output blocks during generation. "
        "When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward expected_output_tokens. "
        "By default, output blocks are not tracked."
    ),
)
# The original call carried two `help=` keywords, which Python rejects as a
# repeated keyword argument. The second help text describes a separate
# thread-count option, so it is given its own add_argument call here.
# NOTE(review): the flag name `--router-event-threads` is inferred from the
# DYN_ROUTER_EVENT_THREADS env var named in the help text — confirm it matches
# the attribute the router configuration code reads.
# The default is left as a string on purpose: argparse runs string defaults
# through `type`, so a malformed env value is reported as a normal parser error.
parser.add_argument(
    "--router-event-threads",
    type=int,
    default=os.environ.get("DYN_ROUTER_EVENT_THREADS", "1"),
    help=(
        "KV Router: Number of event processing threads. "
        "When > 1, uses a concurrent radix tree with a thread pool for higher throughput. "
        "Can be set via DYN_ROUTER_EVENT_THREADS env var (default: 1)."
    ),
)
# --enforce-disagg: boolean switch, False unless the flag is passed.
parser.add_argument(
    "--enforce-disagg",
    help=(
        "Enforce disaggregated prefill-decode. "
        "When set, unactivated prefill router will return an error instead of falling back to decode-only mode."
    ),
    default=False,
    action="store_true",
)
# --migration-limit: integer option, 0 by default.
parser.add_argument(
    "--migration-limit",
    help=(
        "Maximum number of times a request may be migrated to a different engine worker. "
        "When > 0, enables request migration on worker disconnect (default: 0)."
    ),
    default=0,
    type=int,
)
# --active-decode-blocks-threshold: optional float; None when not supplied.
# The parser itself does not enforce the 0.0-1.0 range named in the help text.
parser.add_argument(
    "--active-decode-blocks-threshold",
    help=(
        "Threshold percentage (0.0-1.0) for determining when a worker is considered busy based on KV cache block utilization. "
        "If not set, blocks-based busy detection is disabled."
    ),
    default=None,
    type=float,
)
# --active-prefill-tokens-threshold: optional integer; None when not supplied.
parser.add_argument(
    "--active-prefill-tokens-threshold",
    help=(
        "Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. "
        "When active prefill tokens exceed this threshold, the worker is marked as busy. "
        "If not set, tokens-based busy detection is disabled."
    ),
    default=None,
    type=int,
)
# --active-prefill-tokens-threshold-frac: optional float; None when not
# supplied.
# NOTE(review): the help text mentions "Default 1.5" while the parser default
# is None — presumably a fallback is applied by the consumer of this value;
# confirm and reconcile the wording if not.
parser.add_argument(
    "--active-prefill-tokens-threshold-frac",
    help=(
        "Fraction of max_num_batched_tokens for busy detection. "
        "Worker is busy when active_prefill_tokens > frac * max_num_batched_tokens. "
        "Default 1.5 (disabled). "
        "Uses OR logic with --active-prefill-tokens-threshold."
    ),
    default=None,
    type=float,
)
# --model-name: the raw value is passed through validate_model_name, which is
# defined outside this section; no default is given, so it is None when absent.
parser.add_argument(
    "--model-name",
    help="Model name as a string (e.g., 'Llama-3.2-1B-Instruct')",
    type=validate_model_name,
)
# --model-path: the raw value is passed through validate_model_path, which is
# defined outside this section; no default is given, so it is None when absent.
parser.add_argument(
    "--model-path",
    help="Path to model directory on disk (e.g., /tmp/model_cache/llama3.2_1B/)",
    type=validate_model_path,
)
# --metrics-prefix: optional string; None when not supplied. The env-var and
# literal fallbacks named in the help text are not applied by this parser.
parser.add_argument(
    "--metrics-prefix",
    help=(
        "Prefix for Dynamo frontend metrics. "
        "If unset, uses DYN_METRICS_PREFIX env var or 'dynamo_frontend'."
    ),
    default=None,
    type=str,
)
# --kserve-grpc-server: boolean switch, False unless the flag is passed.
parser.add_argument(
    "--kserve-grpc-server",
    help="Start KServe gRPC server.",
    default=False,
    action="store_true",
)
# --grpc-metrics-port: integer port, 8788 by default. The parser accepts any
# int; the u16 range named in the help text is not checked here.
parser.add_argument(
    "--grpc-metrics-port",
    help=(
        "HTTP metrics port for gRPC service (u16). "
        "Only used with --kserve-grpc-server. "
        "Defaults to 8788."
    ),
    default=8788,
    type=int,
)
# Hand the parser to add_config_dump_args (defined outside this section);
# presumably it registers the config-dump CLI options — verify in the helper.
add_config_dump_args(parser)
parser.add_argument(
"--store-kv",
type=str,
choices=["etcd","file","mem"],
default=os.environ.get("DYN_STORE_KV","etcd"),
help="Which key-value backend to use: etcd, mem, file. Etcd uses the ETCD_* env vars (e.g. ETCD_ENDPOINTS) for connection details. File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv.",