Unverified commit 6783bdca, authored by Yan Ru Pei, committed by GitHub
Browse files

chore: enable local indexers by default, and use normal event plane by default...


chore: enable local indexers by default, and use normal event plane by default (not jetstream) (#5941)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 3d7182b8
......@@ -14,6 +14,6 @@ Submodules:
- prometheus: Prometheus metrics collection and logging utilities
"""
from dynamo.common.utils import endpoint_types, otel_tracing, paths, prometheus
from dynamo.common.utils import endpoint_types, otel_tracing, paths, prometheus, runtime
__all__ = ["endpoint_types", "otel_tracing", "paths", "prometheus"]
__all__ = ["endpoint_types", "otel_tracing", "paths", "prometheus", "runtime"]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Common runtime utilities shared across Dynamo engine backends.
Provides:
- parse_endpoint: Parse 'dyn://namespace.component.endpoint' strings
- graceful_shutdown: Shutdown DistributedRuntime with optional event signaling
- create_runtime: Create DistributedRuntime with signal handlers
"""
import asyncio
import logging
import os
import signal
from typing import Optional, Tuple
from dynamo.runtime import DistributedRuntime
def parse_endpoint(endpoint: str) -> Tuple[str, str, str]:
    """Parse a Dynamo endpoint string into its components.

    The optional ``dyn://`` scheme is recognised only as a leading prefix.
    What remains must be exactly three non-empty, dot-separated parts.

    Args:
        endpoint: Endpoint string in format 'namespace.component.endpoint'
            or 'dyn://namespace.component.endpoint'.

    Returns:
        Tuple of (namespace, component, endpoint_name).

    Raises:
        ValueError: If endpoint format is invalid (wrong number of parts,
            an empty part, or a scheme separator left over after removing
            the leading 'dyn://').
    """
    scheme = "dyn://"
    # Strip the scheme only when it really is a prefix; a blanket
    # str.replace() would also swallow a 'dyn://' buried mid-string.
    if endpoint.startswith(scheme):
        endpoint_str = endpoint[len(scheme):]
    else:
        endpoint_str = endpoint

    endpoint_parts = endpoint_str.split(".")
    is_valid = (
        len(endpoint_parts) == 3
        # Reject inputs such as 'a..c' or '..' that split into empty names.
        and all(endpoint_parts)
        # Reject a foreign or repeated scheme, e.g. 'http://a.b.c'.
        and "://" not in endpoint_str
    )
    if not is_valid:
        raise ValueError(
            f"Invalid endpoint format: '{endpoint}'. "
            "Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )

    namespace, component, endpoint_name = endpoint_parts
    return namespace, component, endpoint_name
async def graceful_shutdown(
    runtime: DistributedRuntime,
    shutdown_event: Optional[asyncio.Event] = None,
) -> None:
    """Shut down a DistributedRuntime, optionally raising a shutdown event first.

    Args:
        runtime: The DistributedRuntime instance whose ``shutdown()`` is called.
        shutdown_event: When provided, this event is set *before* the runtime
            is shut down so that in-flight handlers watching it can finish.
    """
    logging.info("Received shutdown signal, shutting down DistributedRuntime")

    # Signal first, then tear down: observers of the event get their
    # notification before the runtime goes away.
    event_supplied = shutdown_event is not None
    if event_supplied:
        shutdown_event.set()

    runtime.shutdown()

    logging.info("DistributedRuntime shutdown complete")
def create_runtime(
    store_kv: str,
    request_plane: str,
    event_plane: str,
    use_kv_events: bool,
    shutdown_event: Optional[asyncio.Event] = None,
) -> Tuple[DistributedRuntime, asyncio.AbstractEventLoop]:
    """Create a DistributedRuntime and register signal handlers for graceful shutdown.

    Sets DYN_EVENT_PLANE in the environment, computes whether NATS is needed,
    creates the runtime, and installs SIGTERM/SIGINT handlers.

    Must be called from code running inside an asyncio event loop, because it
    binds to the loop returned by ``asyncio.get_running_loop()``.

    Args:
        store_kv: Key-value backend type (etcd, file, mem).
        request_plane: Request distribution method (nats, http, tcp).
        event_plane: Event publishing method (nats, zmq).
        use_kv_events: Whether KV events are enabled.
        shutdown_event: Optional event to set on shutdown signal.

    Returns:
        Tuple of (runtime, event_loop).

    Raises:
        RuntimeError: If there is no running event loop.
    """
    loop = asyncio.get_running_loop()

    # This mutates the process environment as a side effect.
    os.environ["DYN_EVENT_PLANE"] = event_plane

    # NATS is needed when the request plane is NATS, or when the event plane
    # is NATS and KV events are actually going to be published.
    enable_nats = request_plane == "nats" or (event_plane == "nats" and use_kv_events)
    runtime = DistributedRuntime(loop, store_kv, request_plane, enable_nats)

    # The event loop keeps only weak references to tasks. Hold a strong
    # reference until each shutdown task completes so it cannot be
    # garbage-collected mid-flight.
    shutdown_tasks = set()

    def signal_handler():
        # Signal callbacks are plain callables, so the coroutine is
        # scheduled as a task rather than awaited.
        task = asyncio.create_task(graceful_shutdown(runtime, shutdown_event))
        shutdown_tasks.add(task)
        task.add_done_callback(shutdown_tasks.discard)

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    logging.debug("Signal handlers set up for graceful shutdown")
    return runtime, loop
......@@ -198,6 +198,13 @@ def parse_args():
default=False,
help="KV Router: Reset router state on startup, purging stream and object store. By default, states are persisted. WARNING: This can affect existing router replicas.",
)
parser.add_argument(
"--durable-kv-events",
action="store_true",
dest="durable_kv_events",
default=False,
help="KV Router: Enable durable KV events using NATS JetStream instead of NATS Core. By default, the router uses the generic event plane (NATS Core or ZMQ) with local_indexer mode. Use this flag when you need durability and multi-replica consistency. Requires NATS with JetStream enabled.",
)
parser.add_argument(
"--no-track-active-blocks",
action="store_false",
......@@ -354,12 +361,19 @@ async def async_main():
# NATS is needed when:
# 1. Request plane is NATS, OR
# 2. Event plane is NATS AND KV router mode AND (KV events OR replica sync enabled)
# 2. Durable KV events (JetStream) is explicitly requested, OR
# 3. Event plane is NATS AND KV router mode AND (KV events OR replica sync enabled)
# Note: NATS Core (without JetStream) is the default for KV events when durable_kv_events=False
enable_nats = flags.request_plane == "nats" or (
flags.router_mode == "kv"
and (
flags.durable_kv_events
or (
flags.event_plane == "nats"
and flags.router_mode == "kv"
and (flags.use_kv_events or flags.router_replica_sync)
)
)
)
loop = asyncio.get_running_loop()
runtime = DistributedRuntime(loop, flags.store_kv, flags.request_plane, enable_nats)
......@@ -376,6 +390,7 @@ async def async_main():
overlap_score_weight=flags.kv_overlap_score_weight,
router_temperature=flags.router_temperature,
use_kv_events=flags.use_kv_events,
durable_kv_events=flags.durable_kv_events,
router_replica_sync=flags.router_replica_sync,
router_track_active_blocks=flags.router_track_active_blocks,
router_track_output_blocks=flags.router_track_output_blocks,
......
......@@ -115,7 +115,7 @@ def create_temp_engine_args_file(args) -> Path:
),
"is_prefill": getattr(args, "is_prefill_worker", None),
"is_decode": getattr(args, "is_decode_worker", None),
"enable_local_indexer": getattr(args, "enable_local_indexer", None),
"enable_local_indexer": not getattr(args, "durable_kv_events", False),
# Note: bootstrap_port is NOT included here - it's set per-worker in launch_workers()
}
......@@ -301,10 +301,10 @@ def parse_args():
help="Mark this as a decode worker which does not publish KV events and skips prefill cost estimation (default: False)",
)
parser.add_argument(
"--enable-local-indexer",
"--durable-kv-events",
action="store_true",
default=False,
help="Enable worker-local KV indexer for tracking this worker's own KV cache state (default: False)",
default=os.environ.get("DYN_DURABLE_KV_EVENTS", "false").lower() == "true",
help="Enable durable KV events using NATS JetStream instead of the local indexer. By default, local indexer is enabled for lower latency. Use this flag when you need durability and multi-replica router consistency. Requires NATS with JetStream enabled. Can also be set via DYN_DURABLE_KV_EVENTS=true env var.",
)
parser.add_argument(
"--bootstrap-ports",
......
......@@ -20,6 +20,7 @@ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
from dynamo._core import get_reasoning_parser_names, get_tool_parser_names
from dynamo.common.config_dump import register_encoder
from dynamo.common.utils.runtime import parse_endpoint
from dynamo.llm import fetch_llm
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sglang import __version__
......@@ -118,12 +119,11 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
"default": os.environ.get("DYN_EVENT_PLANE", "nats"),
"help": "Determines how events are published [nats|zmq]",
},
"enable-local-indexer": {
"flags": ["--enable-local-indexer"],
"type": str,
"choices": ["true", "false"],
"default": os.environ.get("DYN_LOCAL_INDEXER", "false"),
"help": "Enable worker-local KV indexer for tracking this worker's own KV cache state (can also be toggled with env var DYN_LOCAL_INDEXER).",
"durable-kv-events": {
"flags": ["--durable-kv-events"],
"action": "store_true",
"default": os.environ.get("DYN_DURABLE_KV_EVENTS", "false").lower() == "true",
"help": "Enable durable KV events using NATS JetStream instead of the local indexer. By default, local indexer is enabled for lower latency. Use this flag when you need durability and multi-replica router consistency. Requires NATS with JetStream enabled. Can also be set via DYN_DURABLE_KV_EVENTS=true env var.",
},
"image-diffusion-worker": {
"flags": ["--image-diffusion-worker"],
......@@ -182,7 +182,7 @@ class DynamoArgs:
# config dump options
dump_config_to: Optional[str] = None
# local indexer option
enable_local_indexer: bool = False
enable_local_indexer: bool = True
# Whether to enable NATS for KV events (derived from server_args.kv_events_config)
use_kv_events: bool = False
......@@ -335,7 +335,12 @@ async def parse_args(args: list[str]) -> Config:
if "choices" in info:
kwargs["choices"] = info["choices"]
if "action" in info:
kwargs["action"] = info["action"]
action = info["action"]
# Handle string "BooleanOptionalAction" for dict-based config
if action == "BooleanOptionalAction":
kwargs["action"] = argparse.BooleanOptionalAction
else:
kwargs["action"] = action
parser.add_argument(*info["flags"], **kwargs)
......@@ -447,15 +452,9 @@ async def parse_args(args: list[str]) -> Config:
endpoint = f"dyn://{namespace}.backend.generate"
# Always parse the endpoint (whether auto-generated or user-provided)
endpoint_str = endpoint.replace("dyn://", "", 1)
endpoint_parts = endpoint_str.split(".")
if len(endpoint_parts) != 3:
logging.error(
f"Invalid endpoint format: '{endpoint}'. Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
parsed_namespace, parsed_component_name, parsed_endpoint_name = parse_endpoint(
endpoint
)
sys.exit(1)
parsed_namespace, parsed_component_name, parsed_endpoint_name = endpoint_parts
# Validate parser flags: error if both --{name} and --dyn-{name} are set.
# --dyn-{name} choices are validated by argparse; --{name} by SGLang.
......@@ -598,7 +597,7 @@ async def parse_args(args: list[str]) -> Config:
image_diffusion_fs_url=getattr(parsed_args, "image_diffusion_fs_url", None),
image_diffusion_base_url=getattr(parsed_args, "image_diffusion_base_url", None),
dump_config_to=parsed_args.dump_config_to,
enable_local_indexer=str(parsed_args.enable_local_indexer).lower() == "true",
enable_local_indexer=not parsed_args.durable_kv_events,
use_kv_events=use_kv_events,
)
logging.debug(f"Dynamo args: {dynamo_args}")
......@@ -625,31 +624,6 @@ def reserve_free_port(host: str = "localhost") -> Generator[int, None, None]:
sock.close()
def parse_endpoint(endpoint: str) -> List[str]:
    """Parse endpoint string into namespace, component, and endpoint parts.

    The optional ``dyn://`` scheme is recognised only as a leading prefix.
    What remains must be exactly three non-empty, dot-separated parts.

    Args:
        endpoint: Endpoint string in 'dyn://namespace.component.endpoint'
            or 'namespace.component.endpoint' format.

    Returns:
        List of [namespace, component, endpoint] strings.

    Raises:
        ValueError: If endpoint format is invalid. The message is also
            logged at error level before raising.
    """
    scheme = "dyn://"
    # Strip the scheme only when it really is a prefix; a blanket
    # str.replace() would also swallow a 'dyn://' buried mid-string.
    if endpoint.startswith(scheme):
        endpoint_str = endpoint[len(scheme):]
    else:
        endpoint_str = endpoint

    endpoint_parts = endpoint_str.split(".")
    is_valid = (
        len(endpoint_parts) == 3
        # Reject inputs such as 'a..c' or '..' that split into empty names.
        and all(endpoint_parts)
        # Reject a foreign or repeated scheme, e.g. 'http://a.b.c'.
        and "://" not in endpoint_str
    )
    if not is_valid:
        error_msg = (
            f"Invalid endpoint format: '{endpoint}'. "
            "Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )
        logging.error(error_msg)
        raise ValueError(error_msg)

    return endpoint_parts
def _reserve_disaggregation_bootstrap_port() -> int:
"""Reserve a unique port for disaggregation bootstrap.
......
......@@ -4,7 +4,6 @@
import asyncio
import logging
import os
import signal
import sys
import sglang as sgl
......@@ -13,6 +12,7 @@ import uvloop
from dynamo.common.config_dump import dump_config
from dynamo.common.storage import get_fs
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.common.utils.runtime import create_runtime
from dynamo.llm import ModelInput, ModelType
from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging
......@@ -92,33 +92,14 @@ async def worker():
config.server_args.load_format = setup_gms(config.server_args)
loop = asyncio.get_running_loop()
# Set DYN_EVENT_PLANE environment variable based on config
os.environ["DYN_EVENT_PLANE"] = config.dynamo_args.event_plane
# NATS is needed when:
# 1. Request plane is NATS, OR
# 2. Event plane is NATS AND use_kv_events is True
enable_nats = config.dynamo_args.request_plane == "nats" or (
config.dynamo_args.event_plane == "nats" and config.dynamo_args.use_kv_events
dynamo_args = config.dynamo_args
runtime, _ = create_runtime(
store_kv=dynamo_args.store_kv,
request_plane=dynamo_args.request_plane,
event_plane=dynamo_args.event_plane,
use_kv_events=dynamo_args.use_kv_events,
)
runtime = DistributedRuntime(
loop,
config.dynamo_args.store_kv,
config.dynamo_args.request_plane,
enable_nats,
)
def signal_handler():
asyncio.create_task(graceful_shutdown(runtime))
for sig in (signal.SIGTERM, signal.SIGINT):
loop.add_signal_handler(sig, signal_handler)
logging.info("Signal handlers will trigger a graceful shutdown of the runtime")
if config.dynamo_args.image_diffusion_worker:
await init_image_diffusion(runtime, config)
elif config.dynamo_args.embedding_worker:
......@@ -738,12 +719,6 @@ async def _warmup_prefill_engine(engine: sgl.Engine, server_args) -> None:
logging.warning(f"Prefill warmup failed: {e}")
async def graceful_shutdown(runtime):
    """Log the shutdown request, call ``runtime.shutdown()``, then log completion.

    Args:
        runtime: Object exposing a ``shutdown()`` method (the DistributedRuntime).
    """
    starting_msg = "Received shutdown signal, shutting down DistributedRuntime"
    finished_msg = "DistributedRuntime shutdown complete"

    logging.info(starting_msg)
    runtime.shutdown()
    logging.info(finished_msg)
def main():
    """Entry point: run the ``worker`` coroutine to completion via ``uvloop.run``."""
    worker_coro = worker()
    uvloop.run(worker_coro)
......
......@@ -160,7 +160,11 @@ async def _get_runtime_config(
# set reasoning parser and tool call parser
runtime_config.reasoning_parser = dynamo_args.reasoning_parser
runtime_config.tool_call_parser = dynamo_args.tool_call_parser
runtime_config.enable_local_indexer = dynamo_args.enable_local_indexer
# Decode workers don't create the WorkerKvQuery endpoint, so don't advertise local indexer
is_decode_worker = server_args.disaggregation_mode == "decode"
runtime_config.enable_local_indexer = (
dynamo_args.enable_local_indexer and not is_decode_worker
)
# Set data_parallel_size for DP attention mode
# This enables the router to correctly track per-(worker_id, dp_rank) pairs
......
......@@ -5,7 +5,6 @@ import asyncio
import json
import logging
import os
import signal
import sys
# Configure TLLM_LOG_LEVEL before importing tensorrt_llm
......@@ -39,6 +38,7 @@ import dynamo.nixl_connect as nixl_connect
from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.common.utils.prometheus import register_engine_metrics_callback
from dynamo.common.utils.runtime import create_runtime, parse_endpoint
from dynamo.llm import (
KvEventPublisher,
ModelInput,
......@@ -58,12 +58,7 @@ from dynamo.trtllm.request_handlers.handlers import (
RequestHandlerConfig,
RequestHandlerFactory,
)
from dynamo.trtllm.utils.trtllm_utils import (
Config,
cmd_line_args,
deep_update,
parse_endpoint,
)
from dynamo.trtllm.utils.trtllm_utils import Config, cmd_line_args, deep_update
# Default buffer size for kv cache events.
DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024
......@@ -71,13 +66,6 @@ DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024
configure_dynamo_logging()
async def graceful_shutdown(runtime, shutdown_event):
    """Set the shutdown event, then shut down the DistributedRuntime.

    Args:
        runtime: Object exposing a ``shutdown()`` method (the DistributedRuntime).
        shutdown_event: Event that is set before the runtime is shut down.
    """
    starting_msg = "Received shutdown signal, shutting down DistributedRuntime"
    finished_msg = "DistributedRuntime shutdown complete"

    logging.info(starting_msg)
    # The event is raised before the runtime is torn down.
    shutdown_event.set()
    runtime.shutdown()
    logging.info(finished_msg)
async def get_engine_runtime_config(
engine: TensorRTLLMEngine, config: Config
) -> ModelRuntimeConfig:
......@@ -128,34 +116,15 @@ def build_kv_connector_config(config: Config):
async def worker():
config = cmd_line_args()
loop = asyncio.get_running_loop()
# Create shutdown event
shutdown_event = asyncio.Event()
# Set DYN_EVENT_PLANE environment variable based on config
os.environ["DYN_EVENT_PLANE"] = config.event_plane
# NATS is needed when:
# 1. Request plane is NATS, OR
# 2. Event plane is NATS AND use_kv_events is True
enable_nats = config.request_plane == "nats" or (
config.event_plane == "nats" and config.use_kv_events
)
runtime = DistributedRuntime(
loop, config.store_kv, config.request_plane, enable_nats
runtime, _ = create_runtime(
store_kv=config.store_kv,
request_plane=config.request_plane,
event_plane=config.event_plane,
use_kv_events=config.use_kv_events,
shutdown_event=shutdown_event,
)
# Set up signal handler for graceful shutdown
def signal_handler():
# Schedule the shutdown coroutine instead of calling it directly
asyncio.create_task(graceful_shutdown(runtime, shutdown_event))
for sig in (signal.SIGTERM, signal.SIGINT):
loop.add_signal_handler(sig, signal_handler)
logging.info("Signal handlers set up for graceful shutdown")
await init(runtime, config, shutdown_event)
......@@ -391,7 +360,11 @@ async def init(
runtime_config.max_num_batched_tokens = config.max_num_tokens
runtime_config.reasoning_parser = config.reasoning_parser
runtime_config.tool_call_parser = config.tool_call_parser
runtime_config.enable_local_indexer = config.enable_local_indexer
# Decode workers don't create the WorkerKvQuery endpoint, so don't advertise local indexer
runtime_config.enable_local_indexer = (
config.enable_local_indexer
and config.disaggregation_mode != DisaggregationMode.DECODE
)
# Set data_parallel_size for attention DP mode
# This enables the router's scheduler to correctly iterate over all dp_ranks
# Need to name ADP as `data_parallel_size` for parity with other frameworks
......
......@@ -9,6 +9,7 @@ from tensorrt_llm.llmapi import BuildConfig
from dynamo._core import get_reasoning_parser_names, get_tool_parser_names
from dynamo.common.config_dump import add_config_dump_args, register_encoder
from dynamo.common.utils.runtime import parse_endpoint
from dynamo.trtllm import __version__
from dynamo.trtllm.request_handlers.handler_base import DisaggregationMode
......@@ -63,7 +64,7 @@ class Config:
self.store_kv: str = ""
self.request_plane: str = ""
self.event_plane: str = ""
self.enable_local_indexer: bool = False
self.enable_local_indexer: bool = True
# Whether to enable NATS for KV events (derived from publish_events_and_metrics)
self.use_kv_events: bool = False
......@@ -114,30 +115,6 @@ def _preprocess_for_encode_config(
return obj.__dict__
def parse_endpoint(endpoint: str) -> tuple[str, str, str]:
    """Parse a Dynamo endpoint string into its components.

    The optional ``dyn://`` scheme is recognised only as a leading prefix.
    What remains must be exactly three non-empty, dot-separated parts.

    Args:
        endpoint: Endpoint string in format 'namespace.component.endpoint'
            or 'dyn://namespace.component.endpoint'.

    Returns:
        Tuple of (namespace, component, endpoint_name).

    Raises:
        ValueError: If endpoint format is invalid (wrong number of parts,
            an empty part, or a scheme separator left over after removing
            the leading 'dyn://').
    """
    scheme = "dyn://"
    # Strip the scheme only when it really is a prefix; a blanket
    # str.replace() would also swallow a 'dyn://' buried mid-string.
    if endpoint.startswith(scheme):
        endpoint_str = endpoint[len(scheme):]
    else:
        endpoint_str = endpoint

    endpoint_parts = endpoint_str.split(".")
    is_valid = (
        len(endpoint_parts) == 3
        # Reject inputs such as 'a..c' or '..' that split into empty names.
        and all(endpoint_parts)
        # Reject a foreign or repeated scheme, e.g. 'http://a.b.c'.
        and "://" not in endpoint_str
    )
    if not is_valid:
        raise ValueError(
            f"Invalid endpoint format: '{endpoint}'. "
            "Expected 'dyn://namespace.component.endpoint' or 'namespace.component.endpoint'."
        )

    namespace, component, endpoint_name = endpoint_parts
    return namespace, component, endpoint_name
def cmd_line_args():
"""Parse command-line arguments for the TensorRT-LLM backend.
......@@ -350,11 +327,10 @@ def cmd_line_args():
help="Determines how events are published [nats|zmq]",
)
parser.add_argument(
"--enable-local-indexer",
type=str,
choices=["true", "false"],
default=os.environ.get("DYN_LOCAL_INDEXER", "false"),
help="Enable worker-local KV indexer for tracking this worker's own KV cache state (can also be toggled with env var DYN_LOCAL_INDEXER).",
"--durable-kv-events",
action="store_true",
default=os.environ.get("DYN_DURABLE_KV_EVENTS", "false").lower() == "true",
help="Enable durable KV events using NATS JetStream instead of the local indexer. By default, local indexer is enabled for lower latency. Use this flag when you need durability and multi-replica router consistency. Requires NATS with JetStream enabled. Can also be set via DYN_DURABLE_KV_EVENTS=true env var.",
)
args = parser.parse_args()
......@@ -420,7 +396,7 @@ def cmd_line_args():
config.store_kv = args.store_kv
config.request_plane = args.request_plane
config.event_plane = args.event_plane
config.enable_local_indexer = str(args.enable_local_indexer).lower() == "true"
config.enable_local_indexer = not args.durable_kv_events
# Derive use_kv_events from publish_events_and_metrics
config.use_kv_events = config.publish_events_and_metrics
config.connector = args.connector
......
......@@ -40,7 +40,7 @@ class Config:
store_kv: str
request_plane: str
event_plane: str
enable_local_indexer: bool = False
enable_local_indexer: bool = True
# mirror vLLM
model: str
......@@ -289,11 +289,11 @@ def parse_args() -> Config:
help="Determines how events are published [nats|zmq]",
)
parser.add_argument(
"--enable-local-indexer",
type=str,
choices=["true", "false"],
default=os.environ.get("DYN_LOCAL_INDEXER", "false"),
help="Enable worker-local KV indexer for tracking this worker's own KV cache state (can also be toggled with env var DYN_LOCAL_INDEXER).",
"--durable-kv-events",
action="store_true",
dest="durable_kv_events",
default=os.environ.get("DYN_DURABLE_KV_EVENTS", "false").lower() == "true",
help="Enable durable KV events using NATS JetStream instead of the local indexer. By default, local indexer is enabled for lower latency. Use this flag when you need durability and multi-replica router consistency. Requires NATS with JetStream enabled. Can also be set via DYN_DURABLE_KV_EVENTS=true env var.",
)
parser.add_argument(
"--use-vllm-tokenizer",
......@@ -312,7 +312,6 @@ def parse_args() -> Config:
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
args.enable_local_indexer = str(args.enable_local_indexer).lower() == "true"
engine_args = AsyncEngineArgs.from_cli_args(args)
if hasattr(engine_args, "stream_interval") and engine_args.stream_interval != 1:
......@@ -452,7 +451,7 @@ def parse_args() -> Config:
config.store_kv = args.store_kv
config.request_plane = args.request_plane
config.event_plane = args.event_plane
config.enable_local_indexer = args.enable_local_indexer
config.enable_local_indexer = not args.durable_kv_events
config.use_vllm_tokenizer = args.use_vllm_tokenizer
config.sleep_mode_level = args.sleep_mode_level
# use_kv_events is set later in overwrite_args() based on kv_events_config
......
......@@ -4,7 +4,6 @@
import asyncio
import logging
import os
import signal
import tempfile
from typing import Optional
......@@ -18,6 +17,7 @@ from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
from dynamo.common.config_dump import dump_config
from dynamo.common.utils.endpoint_types import parse_endpoint_types
from dynamo.common.utils.prometheus import register_engine_metrics_callback
from dynamo.common.utils.runtime import create_runtime
from dynamo.llm import (
KvEventPublisher,
ModelInput,
......@@ -118,23 +118,9 @@ async def await_checkpoint_and_was_restored(signal_file: str) -> bool:
await asyncio.sleep(1)
async def graceful_shutdown(runtime, shutdown_event):
    """
    Shut down the dynamo distributed runtime.

    Endpoints are invalidated immediately, so no new requests are accepted.
    An endpoint served with graceful_shutdown=True keeps its serving function
    alive until every in-flight request has finished; one served with
    graceful_shutdown=False returns from its serving function right away.

    Args:
        runtime: Object exposing a ``shutdown()`` method (the DistributedRuntime).
        shutdown_event: Event that is set before the runtime is shut down.
    """
    starting_msg = "Received shutdown signal, shutting down DistributedRuntime"
    finished_msg = "DistributedRuntime shutdown complete"

    logging.info(starting_msg)
    # The event is raised before the runtime is torn down.
    shutdown_event.set()
    runtime.shutdown()
    logging.info(finished_msg)
async def worker():
config = parse_args()
loop = asyncio.get_running_loop()
overwrite_args(config)
dump_config(config.dump_config_to, config)
......@@ -217,32 +203,15 @@ async def worker():
logger.info("Exiting after checkpoint completion")
return
# Create shutdown event
shutdown_event = asyncio.Event()
# Set DYN_EVENT_PLANE environment variable based on config
os.environ["DYN_EVENT_PLANE"] = config.event_plane
# NATS is needed when:
# 1. Request plane is NATS, OR
# 2. Event plane is NATS AND use_kv_events is True
enable_nats = config.request_plane == "nats" or (
config.event_plane == "nats" and config.use_kv_events
)
runtime = DistributedRuntime(
loop, config.store_kv, config.request_plane, enable_nats
runtime, _ = create_runtime(
store_kv=config.store_kv,
request_plane=config.request_plane,
event_plane=config.event_plane,
use_kv_events=config.use_kv_events,
shutdown_event=shutdown_event,
)
# Set up signal handler for graceful shutdown
def signal_handler():
asyncio.create_task(graceful_shutdown(runtime, shutdown_event))
for sig in (signal.SIGTERM, signal.SIGINT):
loop.add_signal_handler(sig, signal_handler)
logging.debug("Signal handlers set up for graceful shutdown")
# Route to appropriate initialization based on config flags
if config.vllm_native_encoder_worker:
await init_vllm_native_encoder(runtime, config, shutdown_event)
......@@ -521,7 +490,10 @@ async def register_vllm_model(
runtime_config.total_kv_blocks = runtime_values["num_gpu_blocks"]
runtime_config.max_num_seqs = runtime_values["max_num_seqs"]
runtime_config.max_num_batched_tokens = runtime_values["max_num_batched_tokens"]
runtime_config.enable_local_indexer = config.enable_local_indexer
# Decode workers don't create the WorkerKvQuery endpoint, so don't advertise local indexer
runtime_config.enable_local_indexer = (
config.enable_local_indexer and not config.is_decode_worker
)
# Add tool/reasoning parsers for decode models
if model_type != ModelType.Prefill:
......
......@@ -147,20 +147,26 @@ The main KV-aware routing arguments:
- `--no-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events).
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - those are synchronized through JetStream events
- `--durable-kv-events`: Enables JetStream mode for KV event transport. Must be specified on **both** the frontend **and** all workers. When enabled, workers publish to JetStream instead of the local indexer, and the frontend consumes from JetStream as a durable consumer. Without this flag (default), workers use the local indexer with NATS Core or ZMQ event plane.
- `--router-reset-states`: When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. By default (when this flag is not provided), the router persists state across restarts, downloading any available snapshot from NATS object store and continuing to consume events from where it left off. This enables routers to maintain KV cache awareness across restarts. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate.
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly.
- `--router-snapshot-threshold`: Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart.
- `--router-reset-states`: Only applies in JetStream mode (`--durable-kv-events`). When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate.
- `--router-snapshot-threshold`: Only applies in JetStream mode (`--durable-kv-events`). Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart.
- `--no-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management.
- `--track-output-blocks`: Enables tracking of output blocks during generation (default: disabled). When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward `expected_output_tokens`. This improves load balancing accuracy for long-running generation requests by accounting for output-side KV cache growth.
- `--no-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist.
- `--active-decode-blocks-threshold`: Initial threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache block utilization. When a worker's KV cache active blocks exceed this fraction of total blocks, it will be marked as busy and excluded from routing. If not set, blocks-based busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines publish load metrics. The threshold can be dynamically updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)).
- `--active-prefill-tokens-threshold`: Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled.
- `--active-prefill-tokens-threshold-frac`: Fraction of `max_num_batched_tokens` for busy detection. A worker is marked busy when `active_prefill_tokens > frac * max_num_batched_tokens`. Uses OR logic with `--active-prefill-tokens-threshold` (worker is busy if either threshold is exceeded). If not set, fractional busy detection is disabled.
- `--router-ttl`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
- `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1048576 (2^20 blocks) when `--no-kv-events` is used. This prevents unbounded memory growth in long-running deployments.
......@@ -169,14 +175,15 @@ The main KV-aware routing arguments:
>[!Note]
> **State persistence** depends on the event transport mode:
> - **JetStream mode** (default): State persists across router restarts via JetStream and NATS object store snapshots.
> - **NATS Core with Local Indexer mode** (`--enable-local-indexer` on workers): State persists on workers—router rebuilds state by querying workers on startup.
> - **NATS Core / Event Plane mode** (default): State persists on workers—the router rebuilds state by querying workers on startup. This mode is used whenever workers have `local_indexer` enabled, which is the default. Works with both NATS Core and ZMQ event planes.
> - **JetStream mode** (`--durable-kv-events` on **both** frontend **and** workers): State persists across router restarts via JetStream and NATS object store snapshots.
> - **No KV events** (`--no-kv-events`): State persistence is not supported.
>
> **Request plane is independent of KV event transport.**
> `DYN_REQUEST_PLANE` controls how **requests** are sent (TCP/HTTP/NATS), but KV-aware routing still uses **NATS** for KV events in both JetStream and NATS Core + Local Indexer modes.
> When KV events are enabled (default), NATS is automatically initialized. You can optionally set `NATS_SERVER=nats://...` to specify a custom NATS server; otherwise, it defaults to `localhost:4222`.
> Use `--no-kv-events` to disable KV events and remove the NATS requirement entirely (with request plane being `tcp` or `http`).
> The router can run without etcd or NATS when using ZMQ event plane (`--event-plane zmq`) and file/mem store (`--store-kv file` or `--store-kv mem`); in this case, KV events use ZMQ transport instead of NATS.
> `DYN_REQUEST_PLANE` controls how **requests** are sent (TCP/HTTP/NATS), but KV-aware routing uses **NATS** for KV events only in JetStream or NATS Core modes (not ZMQ mode).
> When KV events are enabled (default) with a NATS-based event plane, NATS is automatically initialized. You can optionally set `NATS_SERVER=nats://...` to specify a custom NATS server; otherwise, it defaults to `localhost:4222`.
> `--no-kv-events` disables KV event transport entirely.
>
> When `--kv-overlap-score-weight` is set to 0, no KVIndexer is created and prefix matching is disabled (pure load balancing). When `--no-kv-events` is set, a KVIndexer is still created but no event subscriber is launched to consume KV events from workers. Instead, the router predicts cache state based on its own routing decisions with TTL-based expiration and pruning.
>
......@@ -334,7 +341,7 @@ For improved fault tolerance, you can launch multiple frontend + router replicas
The KV Router tracks two types of state (see [Router Design](../../design_docs/router_design.md) for details):
1. **Prefix blocks (cached KV blocks)**: Maintained in a radix tree, tracking which blocks are cached on each worker. This state is **persistent** - backed by NATS JetStream events and object store snapshots. New router replicas automatically sync this state on startup, ensuring consistent cache awareness across restarts.
1. **Prefix blocks (cached KV blocks)**: Maintained in a radix tree, tracking which blocks are cached on each worker. This state is **persistent** - in local indexer mode (default) state is rebuilt from workers on startup; in JetStream mode (`--durable-kv-events`) it is backed by JetStream events and object store snapshots.
2. **Active blocks (decoding blocks)**: Tracks blocks currently being used for active generation requests. This state is **ephemeral** - when a new router replica starts, it begins with zero active block knowledge but becomes eventually consistent as it handles requests.
......@@ -359,7 +366,13 @@ Without this flag, each replica maintains its own isolated view of active blocks
Persistence behavior depends on which event transport mode is active:
**JetStream Mode (default):**
**NATS Core / Event Plane with Local Indexer Mode (default):**
- State persists on workers—events are fire-and-forget but workers retain their local indexer state
- On startup, the router queries each worker's local indexer to rebuild state
- Recovery depends on workers being available; if a worker is down, its blocks cannot be recovered
- Simpler infrastructure (no JetStream required)
**JetStream Mode** (`--durable-kv-events` on **both** frontend **and** workers):
- Prefix blocks are stored in NATS JetStream with 1-hour retention
- Snapshots saved to NATS object store at configurable thresholds
- New replicas automatically restore this state on startup
......@@ -369,12 +382,6 @@ Persistence behavior depends on which event transport mode is active:
python -m dynamo.frontend --router-mode kv --http-port 8002 --router-replica-sync
```
**NATS Core with Local Indexer Mode:**
- State persists on workers—events are fire-and-forget but workers retain their local indexer state
- On startup, the router queries each worker's local indexer to rebuild state
- Recovery depends on workers being available; if a worker is down, its blocks cannot be recovered
- Simpler infrastructure (no JetStream required) but less resilient
>[!Note]
> If you need to start with a fresh state in JetStream mode, you have two options:
> 1. **Recommended**: Use a different namespace/component (see [Distributed Runtime](/docs/design_docs/distributed_runtime.md)) which will start a new stream and NATS object store path
......
......@@ -146,16 +146,20 @@ Each event carries a unique router ID to prevent self-event processing. This asy
The router supports two event transport modes for KV cache state synchronization:
- **JetStream (default)**: Persistent event stream with durable consumers. State persists across router restarts via snapshots in NATS object store. Best for production with multi-replica consistency.
- **NATS Core / Event Plane with Local Indexer (default)**: Fire-and-forget pub/sub where workers maintain local radix trees (enabled by default). Router rebuilds state by querying workers on startup. Lower latency, simpler setup. Works with both NATS Core and ZMQ event planes.
- **NATS Core with Local Indexer** (`--enable-local-indexer` on workers): Fire-and-forget pub/sub where workers maintain local radix trees. Router rebuilds state by querying workers on startup. Lower latency, simpler setup.
- **JetStream** (`--durable-kv-events` on **both** frontend **and** workers): Persistent event stream with durable consumers. State persists across router restarts via snapshots in NATS object store. Best for production with multi-replica consistency. **Important:** Both the frontend and all workers must specify `--durable-kv-events` for JetStream mode to work correctly.
### JetStream Mode
### JetStream Mode (Opt-in)
KV events are sent to a persistent NATS JetStream. Each KV router/indexer replica acts as a durable consumer, pulling messages from this shared stream. This architecture ensures consistency across router replicas and persistence across restarts.
- **Best for**: Production deployments requiring durability and multi-replica router consistency
- **Tradeoffs**: Requires JetStream setup; slightly higher latency due to persistence guarantees
- **Enable with**: `--durable-kv-events` flag on **both** the frontend **and** all workers
> [!Note]
> **Both frontend and workers must specify `--durable-kv-events`** for JetStream mode to work correctly. The frontend uses this flag to consume from JetStream, while workers use it to publish to JetStream instead of the local indexer.
```mermaid
graph TD
......@@ -197,13 +201,13 @@ graph TD
linkStyle 0,1,2,3,4,5 stroke:#2196f3,stroke-width:2px
```
### NATS Core with Local Indexer
### NATS Core / Event Plane with Local Indexer (Default)
When workers are started with `--enable-local-indexer`, each worker maintains its own local radix tree (local indexer) and publishes events over NATS Core (fire-and-forget pub/sub) instead of JetStream. Each worker assigns monotonically increasing event IDs to its events. The router detects gaps in event sequences and recovers missed events by querying the worker's local indexer directly.
By default, workers have local indexer enabled. Each worker maintains its own local radix tree (local indexer) and publishes events over the generic event plane (NATS Core or ZMQ, depending on `--event-plane`). Each worker assigns monotonically increasing event IDs to its events. The router detects gaps in event sequences and recovers missed events by querying the worker's local indexer directly.
- **Best for**: Lower-latency setups; simpler deployments without JetStream; single-router scenarios
- **Best for**: Lower-latency setups; simpler deployments without JetStream; single-router scenarios; deployments without NATS (using ZMQ event plane)
- **Tradeoffs**: State persists on workers (not centralized); recovery depends on workers being available
- **Enable with**: `--enable-local-indexer` flag on workers (vLLM, mocker)
- **Switch to JetStream**: Use `--durable-kv-events` flag on **both** workers (vLLM, SGLang, TRT-LLM, mocker) **and** frontend
```mermaid
graph TD
......@@ -251,7 +255,7 @@ graph TD
- When a worker is removed, the router removes all its blocks from the global radix tree
>[!Note]
> The router automatically selects the transport mode based on worker configuration. If all connected workers have `enable_local_indexer=true`, the router uses NATS Core mode. Otherwise, it uses JetStream mode.
> By default, all workers have `enable_local_indexer=true`, so the router uses NATS Core / Event Plane mode with local indexer. To use JetStream mode instead, specify `--durable-kv-events` on **both** the frontend and all workers.
### Local Active Block Management with Replica Sync
......
......@@ -89,7 +89,6 @@ class CustomEnginePublisher:
worker_id=worker_id,
kv_block_size=block_size,
dp_rank=dp_rank,
enable_local_indexer=False,
)
def on_blocks_stored(self, token_ids: list[int], block_hashes: list[int],
......@@ -196,7 +195,6 @@ config = ZmqKvEventPublisherConfig(
kv_block_size=block_size,
zmq_endpoint="tcp://127.0.0.1:5557", # Where your engine publishes
zmq_topic="", # Subscribe to all topics
enable_local_indexer=False,
)
# Create publisher - it automatically subscribes to ZMQ and forwards to NATS
......
......@@ -90,7 +90,7 @@ python -m dynamo.mocker \
| `--stagger-delay` | -1 (auto) | Delay between worker launches (seconds). 0 disables, -1 enables auto mode |
| `--is-prefill-worker` | False | Prefill-only mode |
| `--is-decode-worker` | False | Decode-only mode |
| `--enable-local-indexer` | False | Enable local KV indexer |
| `--durable-kv-events` | False | Enable durable KV events via JetStream (disables local indexer) |
| `--bootstrap-ports` | None | Ports for P/D rendezvous |
## Architecture
......
......@@ -24,18 +24,15 @@ python -m dynamo.frontend \
--router-mode kv \
--http-port ${DYN_HTTP_PORT_R2:-8001} &
# run workers (enable local indexer so routers can query on restart)
DYN_LOCAL_INDEXER=true \
# run workers (local indexer is enabled by default, so routers can query on restart)
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm \
--model $MODEL \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--enable-local-indexer true \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080","enable_kv_cache_events":true}' &
DYN_LOCAL_INDEXER=true \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
......@@ -43,5 +40,4 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--block-size $BLOCK_SIZE \
--enforce-eager \
--connector none \
--enable-local-indexer true \
--kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081","enable_kv_cache_events":true}'
\ No newline at end of file
......@@ -147,20 +147,26 @@ The main KV-aware routing arguments:
- `--no-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events).
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - those are synchronized through JetStream events
- `--durable-kv-events`: Enables JetStream mode for KV event transport. Must be specified on **both** the frontend **and** all workers. When enabled, workers publish to JetStream instead of the local indexer, and the frontend consumes from JetStream as a durable consumer. Without this flag (default), workers use the local indexer with NATS Core or ZMQ event plane.
- `--router-reset-states`: When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. By default (when this flag is not provided), the router persists state across restarts, downloading any available snapshot from NATS object store and continuing to consume events from where it left off. This enables routers to maintain KV cache awareness across restarts. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate.
- `--router-replica-sync`: Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - in JetStream mode those are synchronized through JetStream events; in local indexer mode (default) each router queries workers directly.
- `--router-snapshot-threshold`: Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart.
- `--router-reset-states`: Only applies in JetStream mode (`--durable-kv-events`). When specified, resets the router state on startup by clearing both the JetStream event stream and NATS object store, starting with a fresh state. **Warning**: Using `--router-reset-states` can bring existing router replicas into an inconsistent state. Only use this flag when launching the first router replica in a component, or consider using a different namespace/component for a clean slate.
- `--router-snapshot-threshold`: Only applies in JetStream mode (`--durable-kv-events`). Sets the number of messages in the JetStream before triggering a snapshot. When the message count exceeds this threshold, a router will attempt to purge acknowledged messages from the stream and create a snapshot of the current radix tree state in NATS object store. Defaults to 1000000. This helps manage stream size and provides faster initialization for routers that restart.
- `--no-track-active-blocks`: Disables tracking of active blocks (blocks being used for ongoing generation/decode phases). By default, the router tracks active blocks for load balancing. Disable this when routing to workers that only perform prefill (no decode phase), as tracking decode load is not relevant. This reduces router overhead and simplifies state management.
- `--track-output-blocks`: Enables tracking of output blocks during generation (default: disabled). When enabled, the router adds placeholder blocks as tokens are generated and applies fractional decay based on progress toward `expected_output_tokens`. This improves load balancing accuracy for long-running generation requests by accounting for output-side KV cache growth.
- `--no-assume-kv-reuse`: When tracking active blocks, disables the assumption of KV cache reuse. By default (`router_assume_kv_reuse=true`), the router computes actual block hashes for sequence tracking to deduplicate blocks and optimize load balancing. When disabled via this flag, the router generates random hashes for sequence blocks, treating each request's blocks as unique. This is useful in disaggregated setups where prefill transfers blocks to decode workers that may already have those blocks cached, but the engine cannot coordinate transfers to avoid duplication. Without this flag, the router's load balancing heuristics would undercount decode blocks when duplicates exist.
- `--active-decode-blocks-threshold`: Initial threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache block utilization. When a worker's KV cache active blocks exceed this fraction of total blocks, it will be marked as busy and excluded from routing. If not set, blocks-based busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines publish load metrics. The threshold can be dynamically updated at runtime via the `/busy_threshold` HTTP endpoint (see [Dynamic Threshold Configuration](#dynamic-threshold-configuration)).
- `--active-prefill-tokens-threshold`: Literal token count threshold for determining when a worker is considered busy based on prefill token utilization. When active prefill tokens exceed this threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled.
- `--active-prefill-tokens-threshold-frac`: Fraction of `max_num_batched_tokens` for busy detection. A worker is marked busy when `active_prefill_tokens > frac * max_num_batched_tokens`. Uses OR logic with `--active-prefill-tokens-threshold` (worker is busy if either threshold is exceeded). If not set, fractional busy detection is disabled.
- `--router-ttl`: Time-to-live in seconds for blocks in the router's local cache predictions. Blocks older than this duration will be automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds when `--no-kv-events` is used. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
- `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1048576 (2^20 blocks) when `--no-kv-events` is used. This prevents unbounded memory growth in long-running deployments.
......@@ -169,14 +175,15 @@ The main KV-aware routing arguments:
>[!Note]
> **State persistence** depends on the event transport mode:
> - **JetStream mode** (default): State persists across router restarts via JetStream and NATS object store snapshots.
> - **NATS Core with Local Indexer mode** (`--enable-local-indexer` on workers): State persists on workers—router rebuilds state by querying workers on startup.
> - **NATS Core / Event Plane mode** (default): State persists on workers—the router rebuilds state by querying workers on startup. This mode is used whenever workers have `local_indexer` enabled, which is the default. Works with both NATS Core and ZMQ event planes.
> - **JetStream mode** (`--durable-kv-events` on **both** frontend **and** workers): State persists across router restarts via JetStream and NATS object store snapshots.
> - **No KV events** (`--no-kv-events`): State persistence is not supported.
>
> **Request plane is independent of KV event transport.**
> `DYN_REQUEST_PLANE` controls how **requests** are sent (TCP/HTTP/NATS), but KV-aware routing still uses **NATS** for KV events in both JetStream and NATS Core + Local Indexer modes.
> When KV events are enabled (default), NATS is automatically initialized. You can optionally set `NATS_SERVER=nats://...` to specify a custom NATS server; otherwise, it defaults to `localhost:4222`.
> Use `--no-kv-events` to disable KV events and remove the NATS requirement entirely (with request plane being `tcp` or `http`).
> The router can run without etcd or NATS when using ZMQ event plane (`--event-plane zmq`) and file/mem store (`--store-kv file` or `--store-kv mem`); in this case, KV events use ZMQ transport instead of NATS.
> `DYN_REQUEST_PLANE` controls how **requests** are sent (TCP/HTTP/NATS), but KV-aware routing uses **NATS** for KV events only in JetStream or NATS Core modes (not ZMQ mode).
> When KV events are enabled (default) with a NATS-based event plane, NATS is automatically initialized. You can optionally set `NATS_SERVER=nats://...` to specify a custom NATS server; otherwise, it defaults to `localhost:4222`.
> `--no-kv-events` disables KV event transport entirely.
>
> When `--kv-overlap-score-weight` is set to 0, no KVIndexer is created and prefix matching is disabled (pure load balancing). When `--no-kv-events` is set, a KVIndexer is still created but no event subscriber is launched to consume KV events from workers. Instead, the router predicts cache state based on its own routing decisions with TTL-based expiration and pruning.
>
......@@ -295,7 +302,7 @@ await prefill_endpoint.serve_endpoint(prefill_handler.generate)
```
> [!Note]
> The unified frontend with automatic prefill routing is currently enabled for vLLM and TensorRT-LLM backends. For SGLang (work in progress), you need to launch a separate standalone router as the prefill router targeting the prefill endpoints. See example script: [`examples/backends/sglang/launch/disagg_router.sh`](https://github.com/ai-dynamo/dynamo/tree/main/examples/backends/sglang/launch/disagg_router.sh).
> The unified frontend with automatic prefill routing is currently enabled for vLLM and TensorRT-LLM backends. For SGLang (work in progress), you need to launch a separate standalone router as the prefill router targeting the prefill endpoints. See example script: [`examples/backends/sglang/launch/disagg_router.sh`](../../examples/backends/sglang/launch/disagg_router.sh).
### Request Flow
......@@ -334,7 +341,7 @@ For improved fault tolerance, you can launch multiple frontend + router replicas
The KV Router tracks two types of state (see [Router Design](../../design-docs/router-design.md) for details):
1. **Prefix blocks (cached KV blocks)**: Maintained in a radix tree, tracking which blocks are cached on each worker. This state is **persistent** - backed by NATS JetStream events and object store snapshots. New router replicas automatically sync this state on startup, ensuring consistent cache awareness across restarts.
1. **Prefix blocks (cached KV blocks)**: Maintained in a radix tree, tracking which blocks are cached on each worker. This state is **persistent** - in local indexer mode (default) state is rebuilt from workers on startup; in JetStream mode (`--durable-kv-events`) it is backed by JetStream events and object store snapshots.
2. **Active blocks (decoding blocks)**: Tracks blocks currently being used for active generation requests. This state is **ephemeral** - when a new router replica starts, it begins with zero active block knowledge but becomes eventually consistent as it handles requests.
......@@ -359,7 +366,13 @@ Without this flag, each replica maintains its own isolated view of active blocks
Persistence behavior depends on which event transport mode is active:
**JetStream Mode (default):**
**NATS Core / Event Plane with Local Indexer Mode (default):**
- State persists on workers—events are fire-and-forget but workers retain their local indexer state
- On startup, the router queries each worker's local indexer to rebuild state
- Recovery depends on workers being available; if a worker is down, its blocks cannot be recovered
- Simpler infrastructure (no JetStream required)
**JetStream Mode** (`--durable-kv-events` on **both** frontend **and** workers):
- Prefix blocks are stored in NATS JetStream with 1-hour retention
- Snapshots saved to NATS object store at configurable thresholds
- New replicas automatically restore this state on startup
......@@ -369,12 +382,6 @@ Persistence behavior depends on which event transport mode is active:
python -m dynamo.frontend --router-mode kv --http-port 8002 --router-replica-sync
```
**NATS Core with Local Indexer Mode:**
- State persists on workers—events are fire-and-forget but workers retain their local indexer state
- On startup, the router queries each worker's local indexer to rebuild state
- Recovery depends on workers being available; if a worker is down, its blocks cannot be recovered
- Simpler infrastructure (no JetStream required) but less resilient
>[!Note]
> If you need to start with a fresh state in JetStream mode, you have two options:
> 1. **Recommended**: Use a different namespace/component (see [Distributed Runtime](../../design-docs/distributed-runtime.md)) which will start a new stream and NATS object store path
......
......@@ -89,7 +89,7 @@ python -m dynamo.mocker \
| `--stagger-delay` | -1 (auto) | Delay between worker launches (seconds). 0 disables, -1 enables auto mode |
| `--is-prefill-worker` | False | Prefill-only mode |
| `--is-decode-worker` | False | Decode-only mode |
| `--enable-local-indexer` | False | Enable local KV indexer |
| `--durable-kv-events` | False | Enable durable KV events via JetStream (disables local indexer) |
| `--bootstrap-ports` | None | Ports for P/D rendezvous |
## Architecture
......
......@@ -1764,6 +1764,7 @@ dependencies = [
"tokio-util",
"tracing",
"uuid",
"validator",
]
[[package]]
......
......@@ -51,12 +51,13 @@ impl KvRouterConfig {
#[pymethods]
impl KvRouterConfig {
#[new]
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8))]
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8))]
#[allow(clippy::too_many_arguments)]
fn new(
overlap_score_weight: f64,
router_temperature: f64,
use_kv_events: bool,
durable_kv_events: bool,
router_replica_sync: bool,
router_track_active_blocks: bool,
router_track_output_blocks: bool,
......@@ -72,6 +73,7 @@ impl KvRouterConfig {
overlap_score_weight,
router_temperature,
use_kv_events,
durable_kv_events,
router_replica_sync,
router_track_active_blocks,
router_track_output_blocks,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment