feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)

Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>

feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
e5850e23 · Yan Ru Pei · GitHub · b302ec41 · e5850e23 · e5850e23
Unverified Commit e5850e23 authored Feb 27, 2026 by Yan Ru Pei Committed by GitHub Feb 28, 2026
11 changed files
--- a/lib/kv-router/src/test_utils.rs
+++ b/lib/kv-router/src/test_utils.rs
@@ -3,10 +3,14 @@

 //! Shared test utilities for radix tree tests.

+use std::future;
+
 use crate::protocols::{
-    ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData,
-    KvCacheStoredBlockData, LocalBlockHash, RouterEvent, WorkerId,
+    ActiveLoad, ActiveSequenceEvent, ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData,
+    KvCacheRemoveData, KvCacheStoreData, KvCacheStoredBlockData, LocalBlockHash, RouterEvent,
+    WorkerConfigLike, WorkerId, WorkerWithDpRank,
 };
+use crate::sequences::SequencePublisher;

 /// Creates blocks with artificial hash mapping (hash * 100) for testing.
 pub fn make_blocks(hashes: Vec<u64>) -> Vec<KvCacheStoredBlockData> {
@@ -61,3 +65,51 @@ pub fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>)
        },
    }
 }
+
+/// No-op [`SequencePublisher`] for tests and benchmarks that don't need event transport.
+pub struct NoopSequencePublisher; // unit struct: carries no state
+
+impl SequencePublisher for NoopSequencePublisher {
+    fn publish_event(
+        &self,
+        _event: &ActiveSequenceEvent, // ignored
+    ) -> impl future::Future<Output = anyhow::Result<()>> + Send {
+        future::ready(Ok(())) // already-completed future; always resolves to `Ok(())`
+    }
+
+    fn publish_load(&self, _load: ActiveLoad) {} // empty body: the load is dropped
+
+    fn observe_load(&self, _: &WorkerWithDpRank, _: &str, _: usize, _: usize) {} // empty body: all arguments ignored
+}
+
+/// Minimal [`WorkerConfigLike`] for scheduler/queue tests and benchmarks.
+#[derive(Debug, Clone)]
+pub struct SimpleWorkerConfig {
+    pub data_parallel_size: u32, // returned as-is by `WorkerConfigLike::data_parallel_size`; `Default` sets 1
+    pub max_num_batched_tokens: Option<u64>, // returned as-is by `max_num_batched_tokens`; `Default` sets `None`
+    pub total_kv_blocks: Option<u64>, // returned as-is by `total_kv_blocks`; `Default` sets `None`
+}
+
+impl Default for SimpleWorkerConfig {
+    fn default() -> Self { // baseline config: size 1, both optional limits unset
+        Self {
+            data_parallel_size: 1, // default data-parallel size is 1
+            max_num_batched_tokens: None, // no batched-token value configured
+            total_kv_blocks: None, // no KV-block count configured
+        }
+    }
+}
+
+impl WorkerConfigLike for SimpleWorkerConfig { // each getter returns the public field of the same name
+    fn data_parallel_size(&self) -> u32 {
+        self.data_parallel_size // plain field read (`u32` is `Copy`)
+    }
+
+    fn max_num_batched_tokens(&self) -> Option<u64> {
+        self.max_num_batched_tokens // `None` when the field was left unset
+    }
+
+    fn total_kv_blocks(&self) -> Option<u64> {
+        self.total_kv_blocks // `None` when the field was left unset
+    }
+}
--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, Instant};

@@ -27,6 +26,8 @@ use validator::Validate;
 pub use dynamo_kv_router::approx;
 pub use dynamo_kv_router::indexer;
 pub use dynamo_kv_router::protocols;
+pub use dynamo_kv_router::scheduling;
+pub use dynamo_kv_router::selector;

 pub mod cache_control;
 pub mod config;
@@ -56,10 +57,10 @@ use crate::{
        indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface, KvRouterError},
        protocols::{
            BlockExtraInfo, DpRank, LocalBlockHash, OverlapScores, RouterEvent, RouterRequest,
-            RouterResponse, TokensWithHashes, WorkerId, WorkerSelectionResult, WorkerWithDpRank,
+            RouterResponse, TokensWithHashes, WorkerId, WorkerWithDpRank,
            compute_block_hash_for_seq,
        },
-        scheduler::{KvScheduler, KvSchedulerError, PotentialLoad, SchedulingRequest},
+        scheduler::{KvScheduler, PotentialLoad},
        sequence::{SequenceError, SequenceRequest},
    },
    local_model::runtime_config::ModelRuntimeConfig,
@@ -118,15 +119,9 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
    }
 }

-/// A trait that users can implement to define custom selection logic
-pub trait WorkerSelector {
-    fn select_worker(
-        &self,
-        workers: &HashMap<protocols::WorkerId, ModelRuntimeConfig>,
-        request: &SchedulingRequest,
-        block_size: u32,
-    ) -> Result<WorkerSelectionResult, KvSchedulerError>;
-}
+/// Trait-object alias binding `dynamo_kv_router`'s `WorkerSelector` to [`ModelRuntimeConfig`], with `Send + Sync`.
+pub type WorkerSelector =
+    dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync; // unsized: use behind a pointer, e.g. `Box<WorkerSelector>`

 #[derive(Clone)]
 pub enum Indexer {
@@ -297,7 +292,7 @@ impl KvRouter {
        client: Client,
        mut workers_with_configs: RuntimeConfigWatch,
        block_size: u32,
-        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
+        selector: Option<Box<WorkerSelector>>,
        kv_router_config: Option<KvRouterConfig>,
        worker_type: &'static str,
    ) -> Result<Self> {

--- a/lib/llm/src/kv_router/config.rs
+++ b/lib/llm/src/kv_router/config.rs
--- a/lib/llm/src/kv_router/queue.rs
+++ b/lib/llm/src/kv_router/queue.rs
--- a/lib/llm/src/kv_router/scheduler.rs
+++ b/lib/llm/src/kv_router/scheduler.rs
--- a/lib/llm/src/local_model/runtime_config.rs
+++ b/lib/llm/src/local_model/runtime_config.rs
@@ -80,6 +80,20 @@ impl Default for ModelRuntimeConfig {
    }
 }

+impl dynamo_kv_router::WorkerConfigLike for ModelRuntimeConfig { // forwards each getter to the field of the same name
+    fn data_parallel_size(&self) -> u32 {
+        self.data_parallel_size // plain field read
+    }
+
+    fn max_num_batched_tokens(&self) -> Option<u64> {
+        self.max_num_batched_tokens // returned unchanged, including `None`
+    }
+
+    fn total_kv_blocks(&self) -> Option<u64> {
+        self.total_kv_blocks // returned unchanged, including `None`
+    }
+}
+
 impl ModelRuntimeConfig {
    pub fn new() -> Self {
        Self::default()

--- a/lib/llm/src/mocker.rs
+++ b/lib/llm/src/mocker.rs
@@ -214,6 +214,8 @@ pub struct MockVllmEngine {
    engine_args: MockEngineArgs,
    /// Bootstrap server for prefill workers in disaggregated mode
    bootstrap_server: Arc<OnceCell<Arc<BootstrapServer>>>,
+    /// Keep schedulers alive so their CancelGuards don't fire prematurely.
+    _schedulers: OnceCell<Vec<Scheduler>>,
 }

 impl MockVllmEngine {
@@ -225,6 +227,7 @@ impl MockVllmEngine {
            senders_ready: Notify::new(),
            engine_args,
            bootstrap_server: Arc::new(OnceCell::new()),
+            _schedulers: OnceCell::new(),
        }
    }

@@ -268,6 +271,8 @@ impl MockVllmEngine {

        Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?;

+        let _ = self._schedulers.set(schedulers);
+
        Ok(())
    }


--- a/lib/mocker/src/scheduler/vllm.rs
+++ b/lib/mocker/src/scheduler/vllm.rs
@@ -246,11 +246,22 @@ impl SchedulerState {
    }
 }

+/// Cancels its token when dropped. Shared via Arc so the background task is
+/// only cancelled when the last Scheduler clone is dropped.
+struct CancelGuard(CancellationToken); // newtype holding the token that `Drop` cancels
+
+impl Drop for CancelGuard {
+    fn drop(&mut self) {
+        self.0.cancel(); // NOTE(review): cancels the wrapped token itself, not a child; the constructor wraps the token it was given (`cancellation_token.unwrap_or_default()`), so presumably other holders of that token are cancelled too — confirm this is intended
+    }
+}
+
 /// Manages scheduling of requests using KvManager resources
 #[derive(Clone)]
 pub struct Scheduler {
     request_tx: mpsc::UnboundedSender<DirectRequest>,
     metrics_rx: tokio::sync::watch::Receiver<MockerMetrics>,
+    _cancel_guard: Arc<CancelGuard>, // shared by every clone; the guard's `Drop` cancels the token once the last clone is gone
 }

 impl Scheduler {
@@ -273,7 +284,9 @@ impl Scheduler {
        let (metrics_tx, metrics_rx) =
            tokio::sync::watch::channel::<MockerMetrics>(initial_metrics);

-        let cancel_token_clone = cancellation_token.unwrap_or_default().clone();
+        let cancel_token = cancellation_token.unwrap_or_default();
+        let cancel_token_clone = cancel_token.clone();
+        let cancel_guard = Arc::new(CancelGuard(cancel_token));

        // Spawn main background task with cancellation token
        tokio::spawn(async move {
@@ -330,6 +343,7 @@ impl Scheduler {
        Self {
            request_tx,
            metrics_rx,
+            _cancel_guard: cancel_guard,
        }
    }

@@ -360,13 +374,16 @@ async fn receive_requests(
    }

    if state.is_empty() {
-        // Fully idle - block until new request arrives
+        // Fully idle - block until new request arrives or shutdown
        tokio::select! {
            biased;
            _ = cancel_token.cancelled() => {
                return None;
            }
-            Some(request) = request_rx.recv() => {
+            result = request_rx.recv() => {
+                let Some(request) = result else {
+                    return None; // channel closed
+                };
                state.receive(request);
                return Some(());
            }

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -123,7 +123,7 @@ sglang_configs = {
        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        env={
-            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
+            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
        frontend_port=DefaultPort.FRONTEND.value,
        request_payloads=[

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -152,7 +152,7 @@ trtllm_configs = {
            )
        ],
        env={
-            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
+            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated_router": TRTLLMConfig(

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -204,7 +204,7 @@ vllm_configs = {
            )
        ],
        env={
-            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
+            "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "agg-router-approx": VLLMConfig(
@@ -235,7 +235,7 @@ vllm_configs = {
            ),
        ],
        env={
-            "DYN_LOG": "dynamo_llm::kv_router::scheduler=info",
+            "DYN_LOG": "dynamo_kv_router::scheduling::selector=info",
        },
    ),
    "disaggregated": VLLMConfig(