Unverified commit b7fe46b1, authored by Yan Ru Pei, committed by GitHub
Browse files

feat(mocker): add multi-worker replay and router startup fixes (#7553)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 82794761
......@@ -689,6 +689,20 @@ impl<P: SequencePublisher + 'static> ActiveSequencesMultiWorker<P> {
results
}
/// Report whether at least one worker's active token count satisfies `predicate`.
///
/// A read guard on the worker table is held for the whole scan, and each
/// slot's read guard is held while its count is passed to `predicate`.
/// Scanning stops at the first worker for which `predicate` returns `true`;
/// if no worker matches (including an empty table) the result is `false`.
pub fn any_worker_matches_active_tokens(
    &self,
    mut predicate: impl FnMut(WorkerWithDpRank, usize) -> bool,
) -> bool {
    let table = self.workers.read();
    table
        .slots
        .iter()
        .any(|(worker, lock)| predicate(*worker, lock.read().active_tokens()))
}
pub fn get_active_lora_counts(&self) -> HashMap<String, usize> {
let mut counts: HashMap<String, usize> = HashMap::new();
for entry in self.request_to_lora.iter() {
......
......@@ -117,7 +117,7 @@ impl SequencePublisher for NoopSequencePublisher {
}
/// Minimal [`WorkerConfigLike`] for scheduler/queue tests and benchmarks.
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SimpleWorkerConfig {
pub data_parallel_start_rank: u32,
pub data_parallel_size: u32,
......
......@@ -587,16 +587,13 @@ impl ModelManager {
// Get or create the runtime config watcher for this endpoint
let workers_with_configs = self.get_or_create_runtime_config_watcher(endpoint).await?;
let selector = Box::new(DefaultWorkerSelector::new(
kv_router_config.clone(),
worker_type,
));
let selector = DefaultWorkerSelector::new(kv_router_config.clone(), worker_type);
let chooser = KvRouter::new(
endpoint.clone(),
client,
workers_with_configs,
kv_cache_block_size,
Some(selector),
selector,
kv_router_config,
worker_type,
model_name,
......
......@@ -38,8 +38,6 @@ pub mod metrics;
pub mod prefill_router;
pub mod publisher;
pub mod push_router;
pub mod queue;
pub mod recorder;
pub mod remote_indexer;
pub mod scheduler;
pub mod sequence;
......@@ -54,7 +52,7 @@ use crate::{
discovery::RuntimeConfigWatch,
kv_router::{
remote_indexer::RemoteIndexer,
scheduler::{KvScheduler, PotentialLoad},
scheduler::{DefaultWorkerSelector, KvScheduler, PotentialLoad},
sequence::{SequenceError, SequenceRequest},
},
local_model::runtime_config::ModelRuntimeConfig,
......@@ -109,10 +107,6 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
}
}
/// Concrete `WorkerSelector` bound to the runtime config type.
///
/// Names the `dynamo_kv_router::selector::WorkerSelector` trait object
/// specialized to `ModelRuntimeConfig`, with `Send + Sync` bounds, so callers
/// can write `Box<WorkerSelector>` instead of spelling out the full type.
pub type WorkerSelector =
dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync;
#[derive(Clone)]
pub enum Indexer {
/// Single-threaded radix tree with channel-based event processing.
......@@ -297,23 +291,29 @@ impl Indexer {
/// A KvRouter only decides which worker you should use. It doesn't send you there.
/// TODO: Rename this to indicate it only selects a worker, it does not route.
pub struct KvRouter {
pub struct KvRouter<Sel = DefaultWorkerSelector>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig>,
{
indexer: Indexer,
scheduler: KvScheduler,
scheduler: KvScheduler<Sel>,
block_size: u32,
kv_router_config: KvRouterConfig,
cancellation_token: tokio_util::sync::CancellationToken,
client: Client,
}
impl KvRouter {
impl<Sel> KvRouter<Sel>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync + 'static,
{
#[allow(clippy::too_many_arguments)]
pub async fn new(
endpoint: Endpoint,
client: Client,
mut workers_with_configs: RuntimeConfigWatch,
block_size: u32,
selector: Option<Box<WorkerSelector>>,
selector: Sel,
kv_router_config: Option<KvRouterConfig>,
worker_type: &'static str,
model_name: Option<String>,
......@@ -327,10 +327,13 @@ impl KvRouter {
if !kv_router_config.skip_initial_worker_wait {
let _ = workers_with_configs
.wait_for(|m| !m.is_empty())
.wait_for(|m| m.len() >= kv_router_config.min_initial_workers)
.await
.map_err(|_| {
anyhow::anyhow!("runtime config watch closed before any workers appeared")
anyhow::anyhow!(
"runtime config watch closed before {} workers appeared",
kv_router_config.min_initial_workers
)
})?;
}
......@@ -596,7 +599,11 @@ impl KvRouter {
// NOTE: KvRouter works like a PushRouter, but without the reverse proxy
// functionality; it is based on a contract of 3 request types.
#[async_trait]
impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error> for KvRouter {
impl<Sel> AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error>
for KvRouter<Sel>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync + 'static,
{
async fn generate(
&self,
request: SingleIn<RouterRequest>,
......@@ -649,7 +656,10 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er
}
}
impl Drop for KvRouter {
impl<Sel> Drop for KvRouter<Sel>
where
Sel: dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig>,
{
fn drop(&mut self) {
tracing::info!("Dropping KvRouter - cancelling background tasks");
self.cancellation_token.cancel();
......
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub use dynamo_kv_router::queue::DEFAULT_MAX_BATCHED_TOKENS;
use crate::kv_router::sequence::RuntimeSequencePublisher;
use crate::local_model::runtime_config::ModelRuntimeConfig;
use dynamo_kv_router::scheduling::policy::RouterSchedulingPolicy;
/// Concrete `SchedulerQueue` wired to the runtime publisher and config types.
///
/// Fixes the generic parameters of `dynamo_kv_router::queue::SchedulerQueue`
/// to `RuntimeSequencePublisher`, `ModelRuntimeConfig`, and
/// `RouterSchedulingPolicy`, so the rest of this crate can refer to the queue
/// without repeating them.
pub type SchedulerQueue = dynamo_kv_router::queue::SchedulerQueue<
RuntimeSequencePublisher,
ModelRuntimeConfig,
RouterSchedulingPolicy,
>;
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -13,7 +13,6 @@ repository.workspace = true
[dependencies]
# repo
dynamo-kv-router = { workspace = true }
dynamo-runtime = { workspace = true }
dynamo-tokens = { workspace = true }
# workspace
......@@ -41,3 +40,4 @@ tokio-timerfd = "0.2"
[dev-dependencies]
rstest = "0.18.2"
tempfile = { workspace = true }
......@@ -5,14 +5,15 @@
//!
//! Enabled by setting `DYN_MOCKER_KV_CACHE_TRACE=1` or `true`.
use dynamo_runtime::config::environment_names::mocker;
use std::env;
use std::sync::LazyLock;
use std::time::{SystemTime, UNIX_EPOCH};
/// Name of the environment variable that enables KV cache trace logging.
const DYN_MOCKER_KV_CACHE_TRACE: &str = "DYN_MOCKER_KV_CACHE_TRACE";
/// Check the env var to enable KV cache allocation/eviction trace logs.
pub static KV_CACHE_TRACE_ENABLED: LazyLock<bool> = LazyLock::new(|| {
env::var(mocker::DYN_MOCKER_KV_CACHE_TRACE)
env::var(DYN_MOCKER_KV_CACHE_TRACE)
.map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false)
});
......
This diff is collapsed.
......@@ -134,13 +134,12 @@ impl ActiveSequence {
let hashes = self.block_hashes[hash_start..hash_end].to_vec();
let token_ids = if self.emit_token_ids && hash_start < hash_end {
let all_token_ids: Vec<Vec<u32>> = self
.tokens
.blocks()
Some(
self.tokens.blocks()[hash_start..hash_end]
.iter()
.map(|b| b.tokens().to_vec())
.collect();
Some(all_token_ids[hash_start..hash_end].to_vec())
.collect(),
)
} else {
None
};
......@@ -276,13 +275,15 @@ impl ActiveSequence {
}
// Free all blocks when we reach max tokens
signals.extend(self.free_signal());
signals.extend(self.free_signal_for_tokens(self.len()));
signals
}
/// Free all blocks, generating appropriate signals for each block type
pub fn free_signal(&self) -> Vec<MoveBlock> {
self.unique_blocks
fn free_signal_for_tokens(&self, active_tokens: usize) -> Vec<MoveBlock> {
let active_blocks = active_tokens
.div_ceil(self.block_size)
.min(self.unique_blocks.len());
self.unique_blocks[..active_blocks]
.iter()
.rev()
.map(|block| match block {
......@@ -296,6 +297,11 @@ impl ActiveSequence {
.collect()
}
/// Produce the free signals for the sequence's currently allocated footprint.
///
/// Delegates to `free_signal_for_tokens`, passing `num_allocated_tokens` as
/// the number of active tokens.
pub fn free_signal(&self) -> Vec<MoveBlock> {
    let allocated_tokens = self.num_allocated_tokens;
    self.free_signal_for_tokens(allocated_tokens)
}
/// Move the request to a preempted state and return the free signals from freeing current blocks.
/// Upon preemption, the sequence retains the tokens generated during the decode phase (if any).
/// Resets `num_allocated_tokens` so re-admission will re-allocate from scratch.
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -11,5 +11,5 @@ pub mod cache;
pub mod common;
pub mod engine;
pub mod kv_manager;
pub mod replay;
pub mod scheduler;
pub mod simulation;
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment