feat: default router_event_threads to 4 (#6672)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat: default router_event_threads to 4 (#6672)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
55c0a769 · Yan Ru Pei · GitHub · a110abfb · 55c0a769 · 55c0a769
Unverified Commit 55c0a769 authored Feb 27, 2026 by Yan Ru Pei Committed by GitHub Feb 27, 2026
13 changed files
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -357,9 +357,10 @@ class FrontendArgGroup(ArgGroup):
            g,
            flag_name="--router-event-threads",
            env_var="DYN_ROUTER_EVENT_THREADS",
-            default=1,
+            default=4,
            help=(
-                "KV Router: Number of event processing threads. When > 1, uses a concurrent radix tree with a thread pool for higher throughput."
+                "KV Router: Number of event processing threads. When > 1, uses a concurrent radix tree with a thread pool for higher throughput. "
+                "Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning)."
            ),
            arg_type=int,
        )

--- a/components/src/dynamo/router/backend_args.py
+++ b/components/src/dynamo/router/backend_args.py
@@ -195,7 +195,7 @@ class DynamoRouterArgGroup(ArgGroup):
            g,
            flag_name="--router-event-threads",
            env_var="DYN_ROUTER_EVENT_THREADS",
-            default=1,
+            default=4,
-            help="KV Router: Number of event processing threads. >1 uses concurrent radix tree and thread pool for higher throughput.",
+            help="KV Router: Number of event processing threads. >1 uses concurrent radix tree and thread pool for higher throughput. Ignored when --no-router-kv-events is set (approximate mode always uses single-threaded indexer with TTL/pruning).",
            arg_type=int,
        )
--- a/docs/pages/components/router/router-guide.md
+++ b/docs/pages/components/router/router-guide.md
@@ -176,7 +176,7 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
 - `--router-prune-target-ratio`: Target size ratio to prune down to when `--router-max-tree-size` is exceeded. For example, with a value of 0.8 (default) and max tree size of 1048576, the router will prune down to approximately 838860 blocks when the threshold is exceeded. Defaults to 0.8 when `--no-router-kv-events` is used. This creates headroom before the next pruning cycle.
- `--router-event-threads`: Number of event processing threads for the KV indexer. When set to 1 (default), the router uses a single-threaded radix tree with channel-based event processing, which supports TTL-based expiration and pruning. When set to a value greater than 1, the router uses a concurrent radix tree with a thread pool of the specified size for higher event throughput. Note: the concurrent indexer does not support TTL/pruning (`--router-ttl-secs`, `--router-max-tree-size`, `--router-prune-target-ratio` are ignored when `--router-event-threads > 1`). Can be set via `DYN_ROUTER_EVENT_THREADS` env var. For details on the underlying index data structures (`RadixTree`, `ConcurrentRadixTree`, `PositionalIndexer`) and their concurrency model (inline reads, sticky-routed writes via thread pool), see the [KV Router Index documentation](../../../../lib/kv-router/README.md).
+- `--router-event-threads`: Number of event processing threads for the KV indexer (default: 4). When set to 1, the router uses a single-threaded radix tree with channel-based event processing. When set to a value greater than 1 (the default), the router uses a concurrent radix tree with a thread pool of the specified size for higher event throughput. This setting only applies when KV events are enabled (the default). When `--no-router-kv-events` is set (approximate mode), the router always uses a single-threaded indexer with TTL-based expiration and pruning regardless of this setting. Can be set via `DYN_ROUTER_EVENT_THREADS` env var. For details on the underlying index data structures (`RadixTree`, `ConcurrentRadixTree`, `PositionalIndexer`) and their concurrency model (inline reads, sticky-routed writes via thread pool), see the [KV Router Index documentation](../../../../lib/kv-router/README.md).
 <Note>

--- a/docs/pages/design-docs/router-design.md
+++ b/docs/pages/design-docs/router-design.md
@@ -131,9 +131,9 @@ The KVIndexer has a method `find_matches_for_request`, which takes in tokens and
 The KVIndexer supports two backend implementations, selected via `--router-event-threads`:
- **Single-threaded RadixTree** (default, `--router-event-threads 1`): Events are processed in a dedicated single-threaded tokio runtime via channel-based dispatch. Supports TTL-based expiration and size-based pruning (for `--no-kv-events` approximate mode).
+- **Single-threaded RadixTree** (`--router-event-threads 1`): Events are processed in a dedicated single-threaded tokio runtime via channel-based dispatch. Supports TTL-based expiration and size-based pruning (for `--no-kv-events` approximate mode).
- **ConcurrentRadixTree** (`--router-event-threads N` where N > 1): A thread-safe radix tree with a pool of N worker threads for event processing. Uses sticky worker routing (events for the same worker always go to the same thread) to ensure per-worker event serialization. Read operations (`find_matches`) execute concurrently with writes. Does not support TTL/pruning.
+- **ConcurrentRadixTree** (default, `--router-event-threads N` where N > 1): A thread-safe radix tree with a pool of N worker threads for event processing (default: 4). Uses sticky worker routing (events for the same worker always go to the same thread) to ensure per-worker event serialization. Read operations (`find_matches`) execute concurrently with writes. Does not support TTL/pruning.
 ### Inter-Router Communication

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -52,7 +52,7 @@ impl KvRouterConfig {
 #[pymethods]
 impl KvRouterConfig {
    #[new]
-    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=None, router_event_threads=1, router_enable_cache_control=false))]
+    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=None, router_event_threads=4, router_enable_cache_control=false))]
    #[allow(clippy::too_many_arguments)]
    fn new(
        overlap_score_weight: f64,

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -981,7 +981,7 @@ class KvRouterConfig:
        router_max_tree_size: int = 1048576,
        router_prune_target_ratio: float = 0.8,
        router_queue_threshold: Optional[float] = None,
-        router_event_threads: int = 1,
+        router_event_threads: int = 4,
    ) -> None:
        """
        Create a KV router configuration.
@@ -1010,7 +1010,7 @@ class KvRouterConfig:
                When set, requests are queued if all workers exceed this fraction of
                max_num_batched_tokens. Enables priority scheduling via latency_sensitivity hints.
                If None, queueing is disabled and all requests go directly to the scheduler.
-            router_event_threads: Number of event processing threads (default: 1).
+            router_event_threads: Number of event processing threads (default: 4).
                When > 1, uses a concurrent radix tree with a thread pool.
        """
        ...

--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
@@ -155,6 +155,26 @@ impl Indexer {
            return Indexer::None;
        }
+        // Approximate mode (--no-kv-events): always use single-threaded KvIndexer
+        // with TTL/pruning regardless of event_threads, since updates come from
+        // routing decisions only, not live KV events from workers.
+        if !kv_router_config.use_kv_events {
+            let kv_indexer_metrics = indexer::KvIndexerMetrics::from_component(component);
+            let cancellation_token = component.drt().primary_token();
+            let prune_config = Some(PruneConfig {
+                ttl: Duration::from_secs_f64(kv_router_config.router_ttl_secs),
+                max_tree_size: kv_router_config.router_max_tree_size,
+                prune_target_ratio: kv_router_config.router_prune_target_ratio,
+            });
+            return Indexer::KvIndexer(KvIndexer::new_with_frequency(
+                cancellation_token,
+                None,
+                block_size,
+                kv_indexer_metrics,
+                prune_config,
+            ));
+        }
        if kv_router_config.router_event_threads > 1 {
            return Indexer::Concurrent(Arc::new(ThreadPoolIndexer::new(
                ConcurrentRadixTree::new(),
@@ -166,23 +186,12 @@ impl Indexer {
        let kv_indexer_metrics = indexer::KvIndexerMetrics::from_component(component);
        let cancellation_token = component.drt().primary_token();
-        // If use_kv_events is false, enable TTL and pruning for approximate behavior
-        let prune_config = if !kv_router_config.use_kv_events {
-            Some(PruneConfig {
-                ttl: Duration::from_secs_f64(kv_router_config.router_ttl_secs),
-                max_tree_size: kv_router_config.router_max_tree_size,
-                prune_target_ratio: kv_router_config.router_prune_target_ratio,
-            })
-        } else {
-            None
-        };
        Indexer::KvIndexer(KvIndexer::new_with_frequency(
            cancellation_token,
            None, // expiration_duration for frequency tracking
            block_size,
            kv_indexer_metrics,
-            prune_config,
+            None,
        ))
    }

--- a/lib/llm/src/kv_router/config.rs
+++ b/lib/llm/src/kv_router/config.rs
@@ -82,7 +82,7 @@ pub struct KvRouterConfig {
    /// Number of event processing threads for the KV indexer.
    /// When > 1, uses ConcurrentRadixTree with a thread pool instead of the
-    /// single-threaded RadixTree. Default: 1.
+    /// single-threaded RadixTree. Default: 4.
    #[validate(range(min = 1))]
    pub router_event_threads: u32,
@@ -110,7 +110,7 @@ impl Default for KvRouterConfig {
            router_max_tree_size: 2usize.pow(20), // 2^20 = 1048576, matches PruneConfig::default()
            router_prune_target_ratio: 0.8,
            router_queue_threshold: None,
-            router_event_threads: 1,
+            router_event_threads: 4,
            router_enable_cache_control: false,
        }
    }
@@ -128,11 +128,6 @@ fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationEr
            "durable_kv_events requires use_kv_events=true",
        ));
    }
-    if !config.use_kv_events && config.router_event_threads > 1 {
-        return Err(ValidationError::new(
-            "router_event_threads > 1 requires use_kv_events=true",
-        ));
-    }
    if config.router_track_output_blocks && !config.router_track_active_blocks {
        return Err(ValidationError::new(
            "router_track_output_blocks requires router_track_active_blocks=true",

--- a/tests/router/common.py
+++ b/tests/router/common.py
@@ -1353,7 +1353,7 @@ def _test_router_indexers_sync(
    test_nats_interruption: bool = False,
    nats_server: Optional["NatsServer"] = None,
    durable_kv_events: bool = False,
-    router_event_threads: int = 1,
+    router_event_threads: int = 4,
 ):
    """Test that two KV routers have synchronized indexer states after processing requests.
@@ -1920,7 +1920,7 @@ def _test_router_decisions(
    block_size: int = 8,
    use_kv_events: bool = True,
    durable_kv_events: bool = False,
-    router_event_threads: int = 1,
+    router_event_threads: int = 4,
 ):
    """Validate cross-worker routing decisions based on longest prefix match and tree-size tiebreaking.

--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -548,18 +548,16 @@ def test_kv_router_bindings(
 @pytest.mark.parametrize(
-    "store_backend,durable_kv_events,request_plane,router_event_threads",
+    "store_backend,durable_kv_events,request_plane",
    [
-        ("etcd", True, "nats", 1),  # JetStream mode - uses JetStream
+        ("etcd", True, "nats"),  # JetStream mode - uses JetStream
-        ("etcd", False, "tcp", 1),  # NATS core mode (with gap detection) - no JetStream
+        ("etcd", False, "tcp"),  # NATS core mode (with gap detection) - no JetStream
-        ("file", True, "nats", 1),  # File backend - uses JetStream
+        ("file", True, "nats"),  # File backend - uses JetStream
-        ("etcd", False, "tcp", 2),  # NATS core mode - multi-threaded indexer
    ],
    ids=[
        "jetstream",
        "nats_core",
        "file",
-        "nats_core_multi_thread",
    ],
    indirect=["request_plane", "durable_kv_events"],
 )
@@ -572,7 +570,6 @@ def test_indexers_sync(
    store_backend,
    durable_kv_events,
    request_plane,
-    router_event_threads,
 ):
    """
    Test that two KV routers have synchronized indexer states after processing requests.
@@ -625,7 +622,6 @@ def test_indexers_sync(
            test_nats_interruption=not durable_kv_events,
            nats_server=nats_process if not durable_kv_events else None,
            durable_kv_events=durable_kv_events,
-            router_event_threads=router_event_threads,
        )
        logger.info("Indexers sync test completed successfully")
@@ -670,15 +666,14 @@ def test_query_instance_id_returns_worker_and_tokens(
 @pytest.mark.timeout(90)  # bumped for xdist contention (was 29s; ~9.55s serial avg)
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
 @pytest.mark.parametrize(
-    "durable_kv_events,use_kv_events,router_event_threads,zmq_kv_events",
+    "durable_kv_events,use_kv_events,zmq_kv_events",
    [
-        (True, True, 1, False),  # JetStream mode with KV events
+        (True, True, False),  # JetStream mode with KV events
-        (False, True, 1, False),  # NATS Core mode with local indexer (default)
+        (False, True, False),  # NATS Core mode with local indexer (default)
-        (False, False, 1, False),  # Approximate mode (--no-kv-events) - no KV events
+        (False, False, False),  # Approximate mode (--no-kv-events) - no KV events
-        (False, True, 2, False),  # NATS Core mode - multi-threaded indexer
+        (False, True, True),  # ZMQ mode: mocker → ZMQ PUB → relay → NATS
-        (False, True, 1, True),  # ZMQ mode: mocker → ZMQ PUB → relay → NATS
    ],
-    ids=["jetstream", "nats_core", "no_kv_events", "nats_core_multi_thread", "zmq"],
+    ids=["jetstream", "nats_core", "no_kv_events", "zmq"],
    indirect=["durable_kv_events"],
 )
 def test_router_decisions(
@@ -688,7 +683,6 @@ def test_router_decisions(
    durable_kv_events,
    use_kv_events,
    request_plane,
-    router_event_threads,
    zmq_kv_events,
 ):
    """Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes.
@@ -736,7 +730,6 @@ def test_router_decisions(
            test_dp_rank=True,
            use_kv_events=use_kv_events,
            durable_kv_events=durable_kv_events,
-            router_event_threads=router_event_threads,
        )

--- a/tests/router/test_router_e2e_with_sglang.py
+++ b/tests/router/test_router_e2e_with_sglang.py
@@ -387,18 +387,12 @@ def test_sglang_kv_router_basic(
 @pytest.mark.pre_merge
 @pytest.mark.gpu_1
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
-@pytest.mark.parametrize(
-    "router_event_threads",
-    [1, 2],
-    ids=["single_thread", "multi_thread"],
-)
 def test_router_decisions_sglang_multiple_workers(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    set_ucx_tls_no_mm,
    request_plane,
-    router_event_threads,
 ):
    # runtime_services starts etcd and nats
    logger.info("Starting SGLang router prefix reuse test with two workers")
@@ -425,7 +419,6 @@ def test_router_decisions_sglang_multiple_workers(
            request,
            test_dp_rank=False,
            block_size=PAGE_SIZE,
-            router_event_threads=router_event_threads,
        )

--- a/tests/router/test_router_e2e_with_trtllm.py
+++ b/tests/router/test_router_e2e_with_trtllm.py
@@ -423,11 +423,6 @@ def test_router_decisions_trtllm_attention_dp(
 @pytest.mark.pre_merge
 @pytest.mark.gpu_1
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
-@pytest.mark.parametrize(
-    "router_event_threads",
-    [1, 2],
-    ids=["single_thread", "multi_thread"],
-)
 @pytest.mark.timeout(150)  # ~3x average (~45s/test), rounded up
 def test_router_decisions_trtllm_multiple_workers(
    request,
@@ -435,7 +430,6 @@ def test_router_decisions_trtllm_multiple_workers(
    predownload_models,
    set_ucx_tls_no_mm,
    request_plane,
-    router_event_threads,
 ):
    # runtime_services starts etcd and nats
    logger.info("Starting TRT-LLM router prefix reuse test with two workers")
@@ -464,7 +458,6 @@ def test_router_decisions_trtllm_multiple_workers(
            request,
            test_dp_rank=False,
            block_size=TRTLLM_BLOCK_SIZE,
-            router_event_threads=router_event_threads,
        )

--- a/tests/router/test_router_e2e_with_vllm.py
+++ b/tests/router/test_router_e2e_with_vllm.py
@@ -415,18 +415,12 @@ def test_vllm_kv_router_basic(
 @pytest.mark.gpu_1
 @pytest.mark.timeout(150)  # ~3x average (~43s/test), rounded up
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
-@pytest.mark.parametrize(
-    "router_event_threads",
-    [1, 2],
-    ids=["single_thread", "multi_thread"],
-)
 def test_router_decisions_vllm_multiple_workers(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    set_ucx_tls_no_mm,
    request_plane,
-    router_event_threads,
 ):
    # runtime_services starts etcd and nats
    logger.info("Starting vLLM router prefix reuse test with two workers")
@@ -454,7 +448,6 @@ def test_router_decisions_vllm_multiple_workers(
            request,
            test_dp_rank=False,
            block_size=BLOCK_SIZE,
-            router_event_threads=router_event_threads,
        )