feat(mocker): add multi-worker replay and router startup fixes (#7553)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat(mocker): add multi-worker replay and router startup fixes (#7553)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
b7fe46b1 · Yan Ru Pei · GitHub · 82794761 · b7fe46b1 · b7fe46b1
Unverified Commit b7fe46b1 authored Mar 23, 2026 by Yan Ru Pei Committed by GitHub Mar 23, 2026
20 changed files
--- a/lib/kv-router/src/sequences/multi_worker.rs
+++ b/lib/kv-router/src/sequences/multi_worker.rs
@@ -689,6 +689,20 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
        results
    }

+    /// Return true if any worker satisfies the provided predicate on active token count.
+    pub fn any_worker_matches_active_tokens(
+        &self,
+        mut predicate: impl FnMut(WorkerWithDpRank, usize) -> bool,
+    ) -> bool {
+        let table = self.workers.read();
+        for (worker, lock) in &table.slots {
+            if predicate(*worker, lock.read().active_tokens()) {
+                return true;
+            }
+        }
+        false
+    }
+
    pub fn get_active_lora_counts(&self) -> HashMap<String, usize> {
        let mut counts: HashMap<String, usize> = HashMap::new();
        for entry in self.request_to_lora.iter() {

--- a/lib/kv-router/src/test_utils.rs
+++ b/lib/kv-router/src/test_utils.rs
@@ -117,7 +117,7 @@ impl SequencePublisher for NoopSequencePublisher {
 }

 /// Minimal [`WorkerConfigLike`] for scheduler/queue tests and benchmarks.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct SimpleWorkerConfig {
    pub data_parallel_start_rank: u32,
    pub data_parallel_size: u32,

--- a/lib/llm/src/discovery/model_manager.rs
+++ b/lib/llm/src/discovery/model_manager.rs
@@ -587,16 +587,13 @@ impl ModelManager {
        // Get or create runtime config watcher for this endpoint
        let workers_with_configs = self.get_or_create_runtime_config_watcher(endpoint).await?;

-        let selector = Box::new(DefaultWorkerSelector::new(
-            kv_router_config.clone(),
-            worker_type,
-        ));
+        let selector = DefaultWorkerSelector::new(kv_router_config.clone(), worker_type);
        let chooser = KvRouter::new(
            endpoint.clone(),
            client,
            workers_with_configs,
            kv_cache_block_size,
-            Some(selector),
+            selector,
            kv_router_config,
            worker_type,
            model_name,

--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
@@ -38,8 +38,6 @@ pub mod metrics;
 pub mod prefill_router;
 pub mod publisher;
 pub mod push_router;
-pub mod queue;
-pub mod recorder;
 pub mod remote_indexer;
 pub mod scheduler;
 pub mod sequence;
@@ -54,7 +52,7 @@ use crate::{
    discovery::RuntimeConfigWatch,
    kv_router::{
        remote_indexer::RemoteIndexer,
-        scheduler::{KvScheduler, PotentialLoad},
+        scheduler::{DefaultWorkerSelector, KvScheduler, PotentialLoad},
        sequence::{SequenceError, SequenceRequest},
    },
    local_model::runtime_config::ModelRuntimeConfig,
@@ -109,10 +107,6 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
    }
 }

-/// Concrete `WorkerSelector` bound to the runtime config type.
-pub type WorkerSelector =
-    dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync;
-
 #[derive(Clone)]
 pub enum Indexer {
    /// Single-threaded radix tree with channel-based event processing.
@@ -297,23 +291,29 @@ impl Indexer {

 /// A KvRouter only decides which worker you should use. It doesn't send you there.
 /// TODO: Rename this to indicate it only selects a worker, it does not route.
-pub struct KvRouter {
+pub struct KvRouter<Sel = DefaultWorkerSelector>
+where
+    Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig>,
+{
    indexer: Indexer,
-    scheduler: KvScheduler,
+    scheduler: KvScheduler<Sel>,
    block_size: u32,
    kv_router_config: KvRouterConfig,
    cancellation_token: tokio_util::sync::CancellationToken,
    client: Client,
 }

-impl KvRouter {
+impl<Sel> KvRouter<Sel>
+where
+    Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync + 'static,
+{
    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        endpoint: Endpoint,
        client: Client,
        mut workers_with_configs: RuntimeConfigWatch,
        block_size: u32,
-        selector: Option<Box<WorkerSelector>>,
+        selector: Sel,
        kv_router_config: Option<KvRouterConfig>,
        worker_type: &'static str,
        model_name: Option<String>,
@@ -327,10 +327,13 @@ impl KvRouter {

        if !kv_router_config.skip_initial_worker_wait {
            let _ = workers_with_configs
-                .wait_for(|m| !m.is_empty())
+                .wait_for(|m| m.len() >= kv_router_config.min_initial_workers)
                .await
                .map_err(|_| {
-                    anyhow::anyhow!("runtime config watch closed before any workers appeared")
+                    anyhow::anyhow!(
+                        "runtime config watch closed before {} workers appeared",
+                        kv_router_config.min_initial_workers
+                    )
                })?;
        }

@@ -596,7 +599,11 @@ impl KvRouter {
 // NOTE: KvRouter works like a PushRouter,
 // but without the reverse proxy functionality, and based on a contract of 3 request types
 #[async_trait]
-impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error> for KvRouter {
+impl<Sel> AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error>
+    for KvRouter<Sel>
+where
+    Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync + 'static,
+{
    async fn generate(
        &self,
        request: SingleIn<RouterRequest>,
@@ -649,7 +656,10 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er
    }
 }

-impl Drop for KvRouter {
+impl<Sel> Drop for KvRouter<Sel>
+where
+    Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig>,
+{
    fn drop(&mut self) {
        tracing::info!("Dropping KvRouter - cancelling background tasks");
        self.cancellation_token.cancel();

--- a/lib/llm/src/kv_router/publisher.rs
+++ b/lib/llm/src/kv_router/publisher.rs
--- a/lib/llm/src/kv_router/queue.rs
+++ b/lib/llm/src/kv_router/queue.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-pub use dynamo_kv_router::queue::DEFAULT_MAX_BATCHED_TOKENS;
-
-use crate::kv_router::sequence::RuntimeSequencePublisher;
-use crate::local_model::runtime_config::ModelRuntimeConfig;
-use dynamo_kv_router::scheduling::policy::RouterSchedulingPolicy;
-
-/// Concrete `SchedulerQueue` wired to the runtime publisher and config types.
-pub type SchedulerQueue = dynamo_kv_router::queue::SchedulerQueue<
-    RuntimeSequencePublisher,
-    ModelRuntimeConfig,
-    RouterSchedulingPolicy,
->;
--- a/lib/llm/src/kv_router/recorder.rs
+++ b/lib/llm/src/kv_router/recorder.rs
--- a/lib/llm/src/kv_router/scheduler.rs
+++ b/lib/llm/src/kv_router/scheduler.rs
--- a/lib/llm/src/mocker.rs
+++ b/lib/llm/src/mocker.rs
--- a/lib/mocker/Cargo.toml
+++ b/lib/mocker/Cargo.toml
@@ -13,7 +13,6 @@ repository.workspace = true
 [dependencies]
 # repo
 dynamo-kv-router = { workspace = true }
-dynamo-runtime = { workspace = true }
 dynamo-tokens = { workspace = true }

 # workspace
@@ -41,3 +40,4 @@ tokio-timerfd = "0.2"

 [dev-dependencies]
 rstest = "0.18.2"
+tempfile = { workspace = true }
--- a/lib/mocker/src/common/kv_cache_trace.rs
+++ b/lib/mocker/src/common/kv_cache_trace.rs
@@ -5,14 +5,15 @@
 //!
 //! Enabled by setting `DYN_MOCKER_KV_CACHE_TRACE=1` or `true`.

-use dynamo_runtime::config::environment_names::mocker;
 use std::env;
 use std::sync::LazyLock;
 use std::time::{SystemTime, UNIX_EPOCH};

+const DYN_MOCKER_KV_CACHE_TRACE: &str = "DYN_MOCKER_KV_CACHE_TRACE";
+
 /// Check the env var to enable KV cache allocation/eviction trace logs.
 pub static KV_CACHE_TRACE_ENABLED: LazyLock<bool> = LazyLock::new(|| {
-    env::var(mocker::DYN_MOCKER_KV_CACHE_TRACE)
+    env::var(DYN_MOCKER_KV_CACHE_TRACE)
        .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
        .unwrap_or(false)
 });

--- a/lib/mocker/src/common/protocols.rs
+++ b/lib/mocker/src/common/protocols.rs
--- a/lib/mocker/src/common/sequence.rs
+++ b/lib/mocker/src/common/sequence.rs
@@ -134,13 +134,12 @@ impl ActiveSequence {
        let hashes = self.block_hashes[hash_start..hash_end].to_vec();

        let token_ids = if self.emit_token_ids && hash_start < hash_end {
-            let all_token_ids: Vec<Vec<u32>> = self
-                .tokens
-                .blocks()
+            Some(
+                self.tokens.blocks()[hash_start..hash_end]
                    .iter()
                    .map(|b| b.tokens().to_vec())
-                .collect();
-            Some(all_token_ids[hash_start..hash_end].to_vec())
+                    .collect(),
+            )
        } else {
            None
        };
@@ -276,13 +275,15 @@ impl ActiveSequence {
        }

        // Free all blocks when we reach max tokens
-        signals.extend(self.free_signal());
+        signals.extend(self.free_signal_for_tokens(self.len()));
        signals
    }

-    /// Free all blocks, generating appropriate signals for each block type
-    pub fn free_signal(&self) -> Vec<MoveBlock> {
-        self.unique_blocks
+    fn free_signal_for_tokens(&self, active_tokens: usize) -> Vec<MoveBlock> {
+        let active_blocks = active_tokens
+            .div_ceil(self.block_size)
+            .min(self.unique_blocks.len());
+        self.unique_blocks[..active_blocks]
            .iter()
            .rev()
            .map(|block| match block {
@@ -296,6 +297,11 @@ impl ActiveSequence {
            .collect()
    }

+    /// Free the currently active allocation footprint.
+    pub fn free_signal(&self) -> Vec<MoveBlock> {
+        self.free_signal_for_tokens(self.num_allocated_tokens)
+    }
+
    /// Move the request to a preempted state and return the free signals from freeing current blocks.
    /// Upon preemption, the sequence retains the tokens generated during the decode phase (if any).
    /// Resets `num_allocated_tokens` so re-admission will re-allocate from scratch.

--- a/lib/mocker/src/engine.rs
+++ b/lib/mocker/src/engine.rs
--- a/lib/mocker/src/kv_manager/sglang_backend.rs
+++ b/lib/mocker/src/kv_manager/sglang_backend.rs
--- a/lib/mocker/src/kv_manager/vllm_backend.rs
+++ b/lib/mocker/src/kv_manager/vllm_backend.rs
--- a/lib/mocker/src/lib.rs
+++ b/lib/mocker/src/lib.rs
@@ -11,5 +11,5 @@ pub mod cache;
 pub mod common;
 pub mod engine;
 pub mod kv_manager;
+pub mod replay;
 pub mod scheduler;
-pub mod simulation;
--- a/lib/mocker/src/simulation.rs
+++ b/lib/mocker/src/simulation.rs
--- a/lib/mocker/src/replay/entrypoints.rs
+++ b/lib/mocker/src/replay/entrypoints.rs
--- a/lib/mocker/src/replay/loader.rs
+++ b/lib/mocker/src/replay/loader.rs