Unverified Commit b7fe46b1 authored by Yan Ru Pei, committed by GitHub
Browse files

feat(mocker): add multi-worker replay and router startup fixes (#7553)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 82794761
...@@ -689,6 +689,20 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> { ...@@ -689,6 +689,20 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
results results
} }
/// Report whether at least one worker's active token count satisfies `predicate`.
///
/// The worker table is acquired via `self.workers.read()` and held for the
/// duration of the scan. Slots are visited in the table's own iteration order;
/// for each one, the sequence state is accessed through `read()` and its
/// `active_tokens()` value is handed to `predicate` together with the worker
/// key. The scan stops at the first worker for which `predicate` returns
/// `true`.
///
/// Returns `false` when no worker matches (including when there are no slots).
pub fn any_worker_matches_active_tokens(
    &self,
    mut predicate: impl FnMut(WorkerWithDpRank, usize) -> bool,
) -> bool {
    let table = self.workers.read();
    // Bind the iterator to a local so it is dropped before the table guard.
    let mut slots = IntoIterator::into_iter(&table.slots);
    slots.any(|(worker, lock)| predicate(*worker, lock.read().active_tokens()))
}
pub fn get_active_lora_counts(&self) -> HashMap<String, usize> { pub fn get_active_lora_counts(&self) -> HashMap<String, usize> {
let mut counts: HashMap<String, usize> = HashMap::new(); let mut counts: HashMap<String, usize> = HashMap::new();
for entry in self.request_to_lora.iter() { for entry in self.request_to_lora.iter() {
......
...@@ -117,7 +117,7 @@ impl SequencePublisher for NoopSequencePublisher { ...@@ -117,7 +117,7 @@ impl SequencePublisher for NoopSequencePublisher {
} }
/// Minimal [`WorkerConfigLike`] for scheduler/queue tests and benchmarks. /// Minimal [`WorkerConfigLike`] for scheduler/queue tests and benchmarks.
#[derive(Debug, Clone)] #[derive(Debug, Clone, PartialEq, Eq)]
pub struct SimpleWorkerConfig { pub struct SimpleWorkerConfig {
pub data_parallel_start_rank: u32, pub data_parallel_start_rank: u32,
pub data_parallel_size: u32, pub data_parallel_size: u32,
......
...@@ -587,16 +587,13 @@ impl ModelManager { ...@@ -587,16 +587,13 @@ impl ModelManager {
// Get or create runtime config watcher for this endpoint // Get or create runtime config watcher for this endpoint
let workers_with_configs = self.get_or_create_runtime_config_watcher(endpoint).await?; let workers_with_configs = self.get_or_create_runtime_config_watcher(endpoint).await?;
let selector = Box::new(DefaultWorkerSelector::new( let selector = DefaultWorkerSelector::new(kv_router_config.clone(), worker_type);
kv_router_config.clone(),
worker_type,
));
let chooser = KvRouter::new( let chooser = KvRouter::new(
endpoint.clone(), endpoint.clone(),
client, client,
workers_with_configs, workers_with_configs,
kv_cache_block_size, kv_cache_block_size,
Some(selector), selector,
kv_router_config, kv_router_config,
worker_type, worker_type,
model_name, model_name,
......
...@@ -38,8 +38,6 @@ pub mod metrics; ...@@ -38,8 +38,6 @@ pub mod metrics;
pub mod prefill_router; pub mod prefill_router;
pub mod publisher; pub mod publisher;
pub mod push_router; pub mod push_router;
pub mod queue;
pub mod recorder;
pub mod remote_indexer; pub mod remote_indexer;
pub mod scheduler; pub mod scheduler;
pub mod sequence; pub mod sequence;
...@@ -54,7 +52,7 @@ use crate::{ ...@@ -54,7 +52,7 @@ use crate::{
discovery::RuntimeConfigWatch, discovery::RuntimeConfigWatch,
kv_router::{ kv_router::{
remote_indexer::RemoteIndexer, remote_indexer::RemoteIndexer,
scheduler::{KvScheduler, PotentialLoad}, scheduler::{DefaultWorkerSelector, KvScheduler, PotentialLoad},
sequence::{SequenceError, SequenceRequest}, sequence::{SequenceError, SequenceRequest},
}, },
local_model::runtime_config::ModelRuntimeConfig, local_model::runtime_config::ModelRuntimeConfig,
...@@ -109,10 +107,6 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery ...@@ -109,10 +107,6 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
} }
} }
/// Concrete `WorkerSelector` bound to the runtime config type.
pub type WorkerSelector =
dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync;
#[derive(Clone)] #[derive(Clone)]
pub enum Indexer { pub enum Indexer {
/// Single-threaded radix tree with channel-based event processing. /// Single-threaded radix tree with channel-based event processing.
...@@ -297,23 +291,29 @@ impl Indexer { ...@@ -297,23 +291,29 @@ impl Indexer {
/// A KvRouter only decides which worker you should use. It doesn't send you there. /// A KvRouter only decides which worker you should use. It doesn't send you there.
/// TODO: Rename this to indicate it only selects a worker, it does not route. /// TODO: Rename this to indicate it only selects a worker, it does not route.
pub struct KvRouter { pub struct KvRouter<Sel = DefaultWorkerSelector>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig>,
{
indexer: Indexer, indexer: Indexer,
scheduler: KvScheduler, scheduler: KvScheduler<Sel>,
block_size: u32, block_size: u32,
kv_router_config: KvRouterConfig, kv_router_config: KvRouterConfig,
cancellation_token: tokio_util::sync::CancellationToken, cancellation_token: tokio_util::sync::CancellationToken,
client: Client, client: Client,
} }
impl KvRouter { impl<Sel> KvRouter<Sel>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync + 'static,
{
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub async fn new( pub async fn new(
endpoint: Endpoint, endpoint: Endpoint,
client: Client, client: Client,
mut workers_with_configs: RuntimeConfigWatch, mut workers_with_configs: RuntimeConfigWatch,
block_size: u32, block_size: u32,
selector: Option<Box<WorkerSelector>>, selector: Sel,
kv_router_config: Option<KvRouterConfig>, kv_router_config: Option<KvRouterConfig>,
worker_type: &'static str, worker_type: &'static str,
model_name: Option<String>, model_name: Option<String>,
...@@ -327,10 +327,13 @@ impl KvRouter { ...@@ -327,10 +327,13 @@ impl KvRouter {
if !kv_router_config.skip_initial_worker_wait { if !kv_router_config.skip_initial_worker_wait {
let _ = workers_with_configs let _ = workers_with_configs
.wait_for(|m| !m.is_empty()) .wait_for(|m| m.len() >= kv_router_config.min_initial_workers)
.await .await
.map_err(|_| { .map_err(|_| {
anyhow::anyhow!("runtime config watch closed before any workers appeared") anyhow::anyhow!(
"runtime config watch closed before {} workers appeared",
kv_router_config.min_initial_workers
)
})?; })?;
} }
...@@ -596,7 +599,11 @@ impl KvRouter { ...@@ -596,7 +599,11 @@ impl KvRouter {
// NOTE: KVRouter works like a PushRouter, // NOTE: KVRouter works like a PushRouter,
// but without the reverse proxy functionality, but based on contract of 3 request types // but without the reverse proxy functionality, but based on contract of 3 request types
#[async_trait] #[async_trait]
impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error> for KvRouter { impl<Sel> AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error>
for KvRouter<Sel>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync + 'static,
{
async fn generate( async fn generate(
&self, &self,
request: SingleIn<RouterRequest>, request: SingleIn<RouterRequest>,
...@@ -649,7 +656,10 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er ...@@ -649,7 +656,10 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er
} }
} }
impl Drop for KvRouter { impl<Sel> Drop for KvRouter<Sel>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig>,
{
fn drop(&mut self) { fn drop(&mut self) {
tracing::info!("Dropping KvRouter - cancelling background tasks"); tracing::info!("Dropping KvRouter - cancelling background tasks");
self.cancellation_token.cancel(); self.cancellation_token.cancel();
......
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub use dynamo_kv_router::queue::DEFAULT_MAX_BATCHED_TOKENS;
use crate::kv_router::sequence::RuntimeSequencePublisher;
use crate::local_model::runtime_config::ModelRuntimeConfig;
use dynamo_kv_router::scheduling::policy::RouterSchedulingPolicy;
/// Concrete `SchedulerQueue` wired to the runtime publisher and config types.
///
/// Alias for `dynamo_kv_router::queue::SchedulerQueue` with its three type
/// parameters fixed, in order, to `RuntimeSequencePublisher`,
/// `ModelRuntimeConfig`, and `RouterSchedulingPolicy`, so the fully
/// instantiated queue type can be named without repeating the generics.
pub type SchedulerQueue = dynamo_kv_router::queue::SchedulerQueue<
RuntimeSequencePublisher,
ModelRuntimeConfig,
RouterSchedulingPolicy,
>;
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -13,7 +13,6 @@ repository.workspace = true ...@@ -13,7 +13,6 @@ repository.workspace = true
[dependencies] [dependencies]
# repo # repo
dynamo-kv-router = { workspace = true } dynamo-kv-router = { workspace = true }
dynamo-runtime = { workspace = true }
dynamo-tokens = { workspace = true } dynamo-tokens = { workspace = true }
# workspace # workspace
...@@ -41,3 +40,4 @@ tokio-timerfd = "0.2" ...@@ -41,3 +40,4 @@ tokio-timerfd = "0.2"
[dev-dependencies] [dev-dependencies]
rstest = "0.18.2" rstest = "0.18.2"
tempfile = { workspace = true }
...@@ -5,14 +5,15 @@ ...@@ -5,14 +5,15 @@
//! //!
//! Enabled by setting `DYN_MOCKER_KV_CACHE_TRACE=1` or `true`. //! Enabled by setting `DYN_MOCKER_KV_CACHE_TRACE=1` or `true`.
use dynamo_runtime::config::environment_names::mocker;
use std::env; use std::env;
use std::sync::LazyLock; use std::sync::LazyLock;
use std::time::{SystemTime, UNIX_EPOCH}; use std::time::{SystemTime, UNIX_EPOCH};
const DYN_MOCKER_KV_CACHE_TRACE: &str = "DYN_MOCKER_KV_CACHE_TRACE";
/// Check the env var to enable KV cache allocation/eviction trace logs. /// Check the env var to enable KV cache allocation/eviction trace logs.
pub static KV_CACHE_TRACE_ENABLED: LazyLock<bool> = LazyLock::new(|| { pub static KV_CACHE_TRACE_ENABLED: LazyLock<bool> = LazyLock::new(|| {
env::var(mocker::DYN_MOCKER_KV_CACHE_TRACE) env::var(DYN_MOCKER_KV_CACHE_TRACE)
.map(|v| v == "1" || v.eq_ignore_ascii_case("true")) .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false) .unwrap_or(false)
}); });
......
This diff is collapsed.
...@@ -134,13 +134,12 @@ impl ActiveSequence { ...@@ -134,13 +134,12 @@ impl ActiveSequence {
let hashes = self.block_hashes[hash_start..hash_end].to_vec(); let hashes = self.block_hashes[hash_start..hash_end].to_vec();
let token_ids = if self.emit_token_ids && hash_start < hash_end { let token_ids = if self.emit_token_ids && hash_start < hash_end {
let all_token_ids: Vec<Vec<u32>> = self Some(
.tokens self.tokens.blocks()[hash_start..hash_end]
.blocks()
.iter() .iter()
.map(|b| b.tokens().to_vec()) .map(|b| b.tokens().to_vec())
.collect(); .collect(),
Some(all_token_ids[hash_start..hash_end].to_vec()) )
} else { } else {
None None
}; };
...@@ -276,13 +275,15 @@ impl ActiveSequence { ...@@ -276,13 +275,15 @@ impl ActiveSequence {
} }
// Free all blocks when we reach max tokens // Free all blocks when we reach max tokens
signals.extend(self.free_signal()); signals.extend(self.free_signal_for_tokens(self.len()));
signals signals
} }
/// Free all blocks, generating appropriate signals for each block type fn free_signal_for_tokens(&self, active_tokens: usize) -> Vec<MoveBlock> {
pub fn free_signal(&self) -> Vec<MoveBlock> { let active_blocks = active_tokens
self.unique_blocks .div_ceil(self.block_size)
.min(self.unique_blocks.len());
self.unique_blocks[..active_blocks]
.iter() .iter()
.rev() .rev()
.map(|block| match block { .map(|block| match block {
...@@ -296,6 +297,11 @@ impl ActiveSequence { ...@@ -296,6 +297,11 @@ impl ActiveSequence {
.collect() .collect()
} }
/// Build the free signals for the sequence's currently allocated footprint.
///
/// Delegates to `free_signal_for_tokens`, passing `num_allocated_tokens` as the
/// token count whose blocks should be released, and returns the resulting
/// list of `MoveBlock` signals unchanged.
pub fn free_signal(&self) -> Vec<MoveBlock> {
    let allocated_tokens = self.num_allocated_tokens;
    self.free_signal_for_tokens(allocated_tokens)
}
/// Move the request to a preempted state and return the free signals from freeing current blocks. /// Move the request to a preempted state and return the free signals from freeing current blocks.
/// Upon preemption, the sequence retains the tokens generated during the decode phase (if any). /// Upon preemption, the sequence retains the tokens generated during the decode phase (if any).
/// Resets `num_allocated_tokens` so re-admission will re-allocate from scratch. /// Resets `num_allocated_tokens` so re-admission will re-allocate from scratch.
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -11,5 +11,5 @@ pub mod cache; ...@@ -11,5 +11,5 @@ pub mod cache;
pub mod common; pub mod common;
pub mod engine; pub mod engine;
pub mod kv_manager; pub mod kv_manager;
pub mod replay;
pub mod scheduler; pub mod scheduler;
pub mod simulation;
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment