feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)

Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
e5850e23 · Yan Ru Pei · GitHub · b302ec41 · e5850e23 · e5850e23
Unverified Commit e5850e23 authored Feb 27, 2026 by Yan Ru Pei Committed by GitHub Feb 28, 2026
11 changed files
--- a/lib/kv-router/src/test_utils.rs
+++ b/lib/kv-router/src/test_utils.rs
@@ -3,10 +3,14 @@
 //! Shared test utilities for radix tree tests.
+use std::future;
 use crate::protocols::{
-    ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData,
+    ActiveLoad, ActiveSequenceEvent, ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData,
-    KvCacheStoredBlockData, LocalBlockHash, RouterEvent, WorkerId,
+    KvCacheRemoveData, KvCacheStoreData, KvCacheStoredBlockData, LocalBlockHash, RouterEvent,
+    WorkerConfigLike, WorkerId, WorkerWithDpRank,
 };
+use crate::sequences::SequencePublisher;
 /// Creates blocks with artificial hash mapping (hash * 100) for testing.
 pub fn make_blocks(hashes: Vec<u64>) -> Vec<KvCacheStoredBlockData> {
@@ -61,3 +65,51 @@ pub fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>)
        },
    }
 }
+/// No-op [`SequencePublisher`] for tests and benchmarks that don't need event transport.
+///
+/// Every method discards its arguments; `publish_event` always reports success.
+pub struct NoopSequencePublisher;
+impl SequencePublisher for NoopSequencePublisher {
+    /// Ignores `_event` and returns an already-completed future that resolves to `Ok(())`.
+    fn publish_event(
+        &self,
+        _event: &ActiveSequenceEvent,
+    ) -> impl future::Future<Output = anyhow::Result<()>> + Send {
+        // `future::ready` builds a future that is immediately ready with the given value.
+        future::ready(Ok(()))
+    }
+    /// Drops `_load` without publishing it anywhere.
+    fn publish_load(&self, _load: ActiveLoad) {}
+    /// Ignores all arguments; nothing is recorded.
+    fn observe_load(&self, _: &WorkerWithDpRank, _: &str, _: usize, _: usize) {}
+}
+/// Minimal [`WorkerConfigLike`] for scheduler/queue tests and benchmarks.
+///
+/// A plain data holder: each public field is returned verbatim by the
+/// `WorkerConfigLike` accessor of the same name.
+#[derive(Debug, Clone)]
+pub struct SimpleWorkerConfig {
+    /// Value reported by `WorkerConfigLike::data_parallel_size`.
+    pub data_parallel_size: u32,
+    /// Value reported by `WorkerConfigLike::max_num_batched_tokens`; `None` when unset.
+    pub max_num_batched_tokens: Option<u64>,
+    /// Value reported by `WorkerConfigLike::total_kv_blocks`; `None` when unset.
+    pub total_kv_blocks: Option<u64>,
+}
+impl Default for SimpleWorkerConfig {
+    /// Builds a config with `data_parallel_size` set to 1 and both optional
+    /// limits (`max_num_batched_tokens`, `total_kv_blocks`) left as `None`.
+    fn default() -> Self {
+        Self {
+            // Hand-written rather than derived: a derived `Default` would yield 0 here.
+            data_parallel_size: 1,
+            max_num_batched_tokens: None,
+            total_kv_blocks: None,
+        }
+    }
+}
+// Each accessor returns the matching public field unchanged; no derivation or validation.
+impl WorkerConfigLike for SimpleWorkerConfig {
+    /// Returns the stored `data_parallel_size` field.
+    fn data_parallel_size(&self) -> u32 {
+        self.data_parallel_size
+    }
+    /// Returns the stored `max_num_batched_tokens` field (`None` if it was never set).
+    fn max_num_batched_tokens(&self) -> Option<u64> {
+        self.max_num_batched_tokens
+    }
+    /// Returns the stored `total_kv_blocks` field (`None` if it was never set).
+    fn total_kv_blocks(&self) -> Option<u64> {
+        self.total_kv_blocks
+    }
+}
--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
@@ -27,6 +26,8 @@ use validator::Validate;
 pub use dynamo_kv_router::approx;
 pub use dynamo_kv_router::indexer;
 pub use dynamo_kv_router::protocols;
+pub use dynamo_kv_router::scheduling;
+pub use dynamo_kv_router::selector;
 pub mod cache_control;
 pub mod config;
@@ -56,10 +57,10 @@ use crate::{
        indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface, KvRouterError},
        protocols::{
            BlockExtraInfo, DpRank, LocalBlockHash, OverlapScores, RouterEvent, RouterRequest,
-            RouterResponse, TokensWithHashes, WorkerId, WorkerSelectionResult, WorkerWithDpRank,
+            RouterResponse, TokensWithHashes, WorkerId, WorkerWithDpRank,
            compute_block_hash_for_seq,
        },
-        scheduler::{KvScheduler, KvSchedulerError, PotentialLoad, SchedulingRequest},
+        scheduler::{KvScheduler, PotentialLoad},
        sequence::{SequenceError, SequenceRequest},
    },
    local_model::runtime_config::ModelRuntimeConfig,
@@ -118,15 +119,9 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
    }
 }
-/// A trait that users can implement to define custom selection logic
+/// Concrete `WorkerSelector` bound to the runtime config type.
-pub trait WorkerSelector {
+pub type WorkerSelector =
-    fn select_worker(
+    dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync;
-        &self,
-        workers: &HashMap<protocols::WorkerId, ModelRuntimeConfig>,
-        request: &SchedulingRequest,
-        block_size: u32,
-    ) -> Result<WorkerSelectionResult, KvSchedulerError>;
-}
 #[derive(Clone)]
 pub enum Indexer {
@@ -297,7 +292,7 @@ impl KvRouter {
        client: Client,
        mut workers_with_configs: RuntimeConfigWatch,
        block_size: u32,
-        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
+        selector: Option<Box<WorkerSelector>>,
        kv_router_config: Option<KvRouterConfig>,
        worker_type: &'static str,
    ) -> Result<Self> {

--- a/lib/llm/src/kv_router/config.rs
+++ b/lib/llm/src/kv_router/config.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-use derive_builder::Builder;
+pub use dynamo_kv_router::config::{KvRouterConfig, RouterConfigOverride};
-use rand::Rng;
-use serde::{Deserialize, Serialize};
-use validator::{Validate, ValidationError};
-use crate::kv_router::protocols::{compute_block_hash_for_seq, compute_seq_hash_for_block};
-/// Override configuration for router settings that can be specified per-request
-#[derive(Debug, Clone, Default, Builder, Serialize, Deserialize, Validate)]
-pub struct RouterConfigOverride {
-    #[builder(default)]
-    pub overlap_score_weight: Option<f64>,
-    #[builder(default)]
-    #[validate(range(min = 0.0))]
-    pub router_temperature: Option<f64>,
-    #[builder(default)]
-    pub assume_kv_reuse: Option<bool>,
-}
-/// KV Router configuration parameters
-#[derive(Debug, Clone, Copy, Serialize, Deserialize, Validate)]
-#[validate(schema(function = "validate_kv_router_config"))]
-pub struct KvRouterConfig {
-    #[validate(range(min = 0.0))]
-    pub overlap_score_weight: f64,
-    #[validate(range(min = 0.0))]
-    pub router_temperature: f64,
-    pub use_kv_events: bool,
-    /// **Deprecated:** Enable durable KV events using NATS JetStream instead of the default event plane.
-    /// This option will be removed in a future release. The event-plane subscriber
-    /// (local_indexer mode) is now the recommended path.
-    pub durable_kv_events: bool,
-    pub router_replica_sync: bool,
-    /// Whether to track active blocks in the router (default: true)
-    pub router_track_active_blocks: bool,
-    /// Whether to track output blocks during generation (default: false)
-    /// When enabled, the router adds placeholder blocks as tokens are generated
-    /// and applies fractional decay based on progress toward agent_hints.osl.
-    pub router_track_output_blocks: bool,
-    /// Whether to assume KV cache reuse when tracking active blocks (default: true).
-    /// When true, computes actual block hashes for sequence tracking.
-    /// When false, generates random hashes (assuming no KV cache reuse).
-    pub router_assume_kv_reuse: bool,
-    /// Threshold for triggering snapshots. If None, no snapshots will be performed.
-    #[validate(range(min = 1))]
-    pub router_snapshot_threshold: Option<u32>,
-    /// Whether to reset the router state on startup (default: false)
-    pub router_reset_states: bool,
-    /// TTL for blocks in seconds (only used when use_kv_events is false, default: 120.0)
-    #[validate(range(min = 0.0))]
-    pub router_ttl_secs: f64,
-    /// Maximum tree size before pruning (only used when use_kv_events is false, default: 2^20 = 1048576)
-    #[validate(range(min = 1))]
-    pub router_max_tree_size: usize,
-    /// Target size ratio after pruning (only used when use_kv_events is false, default: 0.8)
-    #[validate(range(min = 0.0, max = 1.0))]
-    pub router_prune_target_ratio: f64,
-    /// Queue threshold fraction for prefill token capacity.
-    /// When set, requests are queued if all workers exceed this fraction of max_num_batched_tokens.
-    /// If None (default), queueing is disabled and all requests go directly to ready.
-    /// Must be > 0.
-    #[validate(range(min = 0.0))]
-    pub router_queue_threshold: Option<f64>,
-    /// Number of event processing threads for the KV indexer.
-    /// When > 1, uses ConcurrentRadixTree with a thread pool instead of the
-    /// single-threaded RadixTree. Default: 4.
-    #[validate(range(min = 1))]
-    pub router_event_threads: u32,
-    /// Enable cache control (PIN with TTL) via the worker's cache_control service mesh endpoint.
-    /// When true, the router creates a cache_control client and honors nvext.cache_control on
-    /// requests, firing a pin_prefix call (with TTL) to the worker after generation completes.
-    /// When false (default), cache_control is ignored and no cache_control client is created.
-    pub router_enable_cache_control: bool,
-}
-impl Default for KvRouterConfig {
-    fn default() -> Self {
-        Self {
-            overlap_score_weight: 1.0,
-            router_temperature: 0.0,
-            use_kv_events: true,
-            durable_kv_events: false, // default to NATS Core (local indexer mode)
-            router_replica_sync: false,
-            router_track_active_blocks: true,
-            router_track_output_blocks: false,
-            router_assume_kv_reuse: true,
-            router_snapshot_threshold: Some(1000000),
-            router_reset_states: false,
-            router_ttl_secs: 120.0,
-            router_max_tree_size: 2usize.pow(20), // 2^20 = 1048576, matches PruneConfig::default()
-            router_prune_target_ratio: 0.8,
-            router_queue_threshold: None,
-            router_event_threads: 4,
-            router_enable_cache_control: false,
-        }
-    }
-}
-fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationError> {
-    if config.durable_kv_events {
-        tracing::warn!(
-            "--durable-kv-events is deprecated and will be removed in a future release. \
-             The event-plane subscriber (local_indexer mode) is now the recommended path."
-        );
-    }
-    if config.durable_kv_events && !config.use_kv_events {
-        return Err(ValidationError::new(
-            "durable_kv_events requires use_kv_events=true",
-        ));
-    }
-    if config.router_track_output_blocks && !config.router_track_active_blocks {
-        return Err(ValidationError::new(
-            "router_track_output_blocks requires router_track_active_blocks=true",
-        ));
-    }
-    Ok(())
-}
-impl KvRouterConfig {
-    /// Compute sequence hashes for active block tracking based on configuration.
-    ///
-    /// Returns:
-    /// - `None` if `router_track_active_blocks` is false
-    /// - Random hashes if `router_track_active_blocks` is true but `router_assume_kv_reuse` is false
-    /// - Actual sequence hashes if both are true
-    pub fn compute_seq_hashes_for_tracking(
-        &self,
-        tokens: &[u32],
-        block_size: u32,
-        config_override: Option<&RouterConfigOverride>,
-        lora_name: Option<&str>,
-    ) -> Option<Vec<u64>> {
-        if !self.router_track_active_blocks {
-            return None;
-        }
-        let num_blocks = tokens.len() / block_size as usize;
-        if num_blocks == 0 {
-            return Some(Vec::new());
-        }
-        let assume_kv_reuse = config_override
-            .and_then(|cfg| cfg.assume_kv_reuse)
-            .unwrap_or(self.router_assume_kv_reuse);
-        if assume_kv_reuse {
-            let block_hashes = compute_block_hash_for_seq(tokens, block_size, None, lora_name);
-            Some(compute_seq_hash_for_block(&block_hashes))
-        } else {
-            let mut rng = rand::rng();
-            Some((0..num_blocks).map(|_| rng.random::<u64>()).collect())
-        }
-    }
-    /// Check if KV event subscription should be started.
-    ///
-    /// Returns false if:
-    /// - KV events are disabled (`use_kv_events=false`)
-    /// - Overlap scoring is disabled (`overlap_score_weight=0`)
-    ///
-    /// When false, the router skips starting the KV event subscription entirely,
-    /// avoiding the need to query workers for their local indexer state.
-    pub fn should_subscribe_to_kv_events(&self) -> bool {
-        self.use_kv_events && self.overlap_score_weight > 0.0
-    }
-}
--- a/lib/llm/src/kv_router/queue.rs
+++ b/lib/llm/src/kv_router/queue.rs
--- a/lib/llm/src/kv_router/scheduler.rs
+++ b/lib/llm/src/kv_router/scheduler.rs
--- a/lib/llm/src/local_model/runtime_config.rs
+++ b/lib/llm/src/local_model/runtime_config.rs
--- a/lib/llm/src/mocker.rs
+++ b/lib/llm/src/mocker.rs
--- a/lib/mocker/src/scheduler/vllm.rs
+++ b/lib/mocker/src/scheduler/vllm.rs
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py