"docs/vscode:/vscode.git/clone" did not exist on "52b460e4c66f846593aed293ffe5f964ce527e68"
Unverified commit e5850e23, authored by Yan Ru Pei, committed by GitHub
Browse files

feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent b302ec41
...@@ -3,10 +3,14 @@ ...@@ -3,10 +3,14 @@
//! Shared test utilities for radix tree tests. //! Shared test utilities for radix tree tests.
use std::future;
use crate::protocols::{ use crate::protocols::{
ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData, ActiveLoad, ActiveSequenceEvent, ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData,
KvCacheStoredBlockData, LocalBlockHash, RouterEvent, WorkerId, KvCacheRemoveData, KvCacheStoreData, KvCacheStoredBlockData, LocalBlockHash, RouterEvent,
WorkerConfigLike, WorkerId, WorkerWithDpRank,
}; };
use crate::sequences::SequencePublisher;
/// Creates blocks with artificial hash mapping (hash * 100) for testing. /// Creates blocks with artificial hash mapping (hash * 100) for testing.
pub fn make_blocks(hashes: Vec<u64>) -> Vec<KvCacheStoredBlockData> { pub fn make_blocks(hashes: Vec<u64>) -> Vec<KvCacheStoredBlockData> {
...@@ -61,3 +65,51 @@ pub fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>) ...@@ -61,3 +65,51 @@ pub fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>)
}, },
} }
} }
/// A [`SequencePublisher`] that discards everything handed to it.
///
/// Intended for tests and benchmarks that do not need any event transport.
pub struct NoopSequencePublisher;

impl SequencePublisher for NoopSequencePublisher {
    /// Ignores the event and returns a future that is already resolved to `Ok(())`.
    fn publish_event(
        &self,
        _event: &ActiveSequenceEvent,
    ) -> impl future::Future<Output = anyhow::Result<()>> + Send {
        let outcome: anyhow::Result<()> = Ok(());
        future::ready(outcome)
    }

    /// Ignores the load report.
    fn publish_load(&self, _load: ActiveLoad) {}

    /// Ignores the observation.
    fn observe_load(&self, _worker: &WorkerWithDpRank, _: &str, _: usize, _: usize) {}
}
/// Bare-bones [`WorkerConfigLike`] implementor for scheduler/queue tests and benchmarks.
#[derive(Debug, Clone)]
pub struct SimpleWorkerConfig {
    /// Reported through `WorkerConfigLike::data_parallel_size`.
    pub data_parallel_size: u32,
    /// Reported through `WorkerConfigLike::max_num_batched_tokens`.
    pub max_num_batched_tokens: Option<u64>,
    /// Reported through `WorkerConfigLike::total_kv_blocks`.
    pub total_kv_blocks: Option<u64>,
}

impl Default for SimpleWorkerConfig {
    /// Builds a config with `data_parallel_size` of 1 and both optional fields set to `None`.
    fn default() -> Self {
        let data_parallel_size = 1;
        SimpleWorkerConfig {
            data_parallel_size,
            max_num_batched_tokens: None,
            total_kv_blocks: None,
        }
    }
}
/// Exposes the stored fields unchanged through the [`WorkerConfigLike`] accessors.
impl WorkerConfigLike for SimpleWorkerConfig {
    /// Returns the `data_parallel_size` field.
    fn data_parallel_size(&self) -> u32 {
        let Self { data_parallel_size, .. } = self;
        *data_parallel_size
    }

    /// Returns the `max_num_batched_tokens` field.
    fn max_num_batched_tokens(&self) -> Option<u64> {
        let Self { max_num_batched_tokens, .. } = self;
        *max_num_batched_tokens
    }

    /// Returns the `total_kv_blocks` field.
    fn total_kv_blocks(&self) -> Option<u64> {
        let Self { total_kv_blocks, .. } = self;
        *total_kv_blocks
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
...@@ -27,6 +26,8 @@ use validator::Validate; ...@@ -27,6 +26,8 @@ use validator::Validate;
pub use dynamo_kv_router::approx; pub use dynamo_kv_router::approx;
pub use dynamo_kv_router::indexer; pub use dynamo_kv_router::indexer;
pub use dynamo_kv_router::protocols; pub use dynamo_kv_router::protocols;
pub use dynamo_kv_router::scheduling;
pub use dynamo_kv_router::selector;
pub mod cache_control; pub mod cache_control;
pub mod config; pub mod config;
...@@ -56,10 +57,10 @@ use crate::{ ...@@ -56,10 +57,10 @@ use crate::{
indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface, KvRouterError}, indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface, KvRouterError},
protocols::{ protocols::{
BlockExtraInfo, DpRank, LocalBlockHash, OverlapScores, RouterEvent, RouterRequest, BlockExtraInfo, DpRank, LocalBlockHash, OverlapScores, RouterEvent, RouterRequest,
RouterResponse, TokensWithHashes, WorkerId, WorkerSelectionResult, WorkerWithDpRank, RouterResponse, TokensWithHashes, WorkerId, WorkerWithDpRank,
compute_block_hash_for_seq, compute_block_hash_for_seq,
}, },
scheduler::{KvScheduler, KvSchedulerError, PotentialLoad, SchedulingRequest}, scheduler::{KvScheduler, PotentialLoad},
sequence::{SequenceError, SequenceRequest}, sequence::{SequenceError, SequenceRequest},
}, },
local_model::runtime_config::ModelRuntimeConfig, local_model::runtime_config::ModelRuntimeConfig,
...@@ -118,15 +119,9 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery ...@@ -118,15 +119,9 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
} }
} }
/// A trait that users can implement to define custom selection logic /// Concrete `WorkerSelector` bound to the runtime config type.
pub trait WorkerSelector { pub type WorkerSelector =
fn select_worker( dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync;
&self,
workers: &HashMap<protocols::WorkerId, ModelRuntimeConfig>,
request: &SchedulingRequest,
block_size: u32,
) -> Result<WorkerSelectionResult, KvSchedulerError>;
}
#[derive(Clone)] #[derive(Clone)]
pub enum Indexer { pub enum Indexer {
...@@ -297,7 +292,7 @@ impl KvRouter { ...@@ -297,7 +292,7 @@ impl KvRouter {
client: Client, client: Client,
mut workers_with_configs: RuntimeConfigWatch, mut workers_with_configs: RuntimeConfigWatch,
block_size: u32, block_size: u32,
selector: Option<Box<dyn WorkerSelector + Send + Sync>>, selector: Option<Box<WorkerSelector>>,
kv_router_config: Option<KvRouterConfig>, kv_router_config: Option<KvRouterConfig>,
worker_type: &'static str, worker_type: &'static str,
) -> Result<Self> { ) -> Result<Self> {
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -80,6 +80,20 @@ impl Default for ModelRuntimeConfig { ...@@ -80,6 +80,20 @@ impl Default for ModelRuntimeConfig {
} }
} }
/// Lets `ModelRuntimeConfig` be used wherever a `dynamo_kv_router::WorkerConfigLike` is
/// expected by returning the matching fields unchanged.
impl dynamo_kv_router::WorkerConfigLike for ModelRuntimeConfig {
    /// Returns the `data_parallel_size` field.
    fn data_parallel_size(&self) -> u32 {
        let Self { data_parallel_size, .. } = self;
        *data_parallel_size
    }

    /// Returns the `max_num_batched_tokens` field.
    fn max_num_batched_tokens(&self) -> Option<u64> {
        let Self { max_num_batched_tokens, .. } = self;
        *max_num_batched_tokens
    }

    /// Returns the `total_kv_blocks` field.
    fn total_kv_blocks(&self) -> Option<u64> {
        let Self { total_kv_blocks, .. } = self;
        *total_kv_blocks
    }
}
impl ModelRuntimeConfig { impl ModelRuntimeConfig {
pub fn new() -> Self { pub fn new() -> Self {
Self::default() Self::default()
......
...@@ -214,6 +214,8 @@ pub struct MockVllmEngine { ...@@ -214,6 +214,8 @@ pub struct MockVllmEngine {
engine_args: MockEngineArgs, engine_args: MockEngineArgs,
/// Bootstrap server for prefill workers in disaggregated mode /// Bootstrap server for prefill workers in disaggregated mode
bootstrap_server: Arc<OnceCell<Arc<BootstrapServer>>>, bootstrap_server: Arc<OnceCell<Arc<BootstrapServer>>>,
/// Keep schedulers alive so their CancelGuards don't fire prematurely.
_schedulers: OnceCell<Vec<Scheduler>>,
} }
impl MockVllmEngine { impl MockVllmEngine {
...@@ -225,6 +227,7 @@ impl MockVllmEngine { ...@@ -225,6 +227,7 @@ impl MockVllmEngine {
senders_ready: Notify::new(), senders_ready: Notify::new(),
engine_args, engine_args,
bootstrap_server: Arc::new(OnceCell::new()), bootstrap_server: Arc::new(OnceCell::new()),
_schedulers: OnceCell::new(),
} }
} }
...@@ -268,6 +271,8 @@ impl MockVllmEngine { ...@@ -268,6 +271,8 @@ impl MockVllmEngine {
Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?; Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?;
let _ = self._schedulers.set(schedulers);
Ok(()) Ok(())
} }
......
...@@ -246,11 +246,22 @@ impl SchedulerState { ...@@ -246,11 +246,22 @@ impl SchedulerState {
} }
} }
/// Drop guard around a `CancellationToken`: dropping the guard cancels the token.
///
/// Shared via `Arc` so the background task is only cancelled when the last `Scheduler`
/// clone is dropped.
struct CancelGuard(CancellationToken);

impl Drop for CancelGuard {
    /// Cancels the wrapped token.
    fn drop(&mut self) {
        let Self(token) = self;
        token.cancel();
    }
}
/// Manages scheduling of requests using KvManager resources /// Manages scheduling of requests using KvManager resources
#[derive(Clone)] #[derive(Clone)]
pub struct Scheduler { pub struct Scheduler {
request_tx: mpsc::UnboundedSender<DirectRequest>, request_tx: mpsc::UnboundedSender<DirectRequest>,
metrics_rx: tokio::sync::watch::Receiver<MockerMetrics>, metrics_rx: tokio::sync::watch::Receiver<MockerMetrics>,
_cancel_guard: Arc<CancelGuard>,
} }
impl Scheduler { impl Scheduler {
...@@ -273,7 +284,9 @@ impl Scheduler { ...@@ -273,7 +284,9 @@ impl Scheduler {
let (metrics_tx, metrics_rx) = let (metrics_tx, metrics_rx) =
tokio::sync::watch::channel::<MockerMetrics>(initial_metrics); tokio::sync::watch::channel::<MockerMetrics>(initial_metrics);
let cancel_token_clone = cancellation_token.unwrap_or_default().clone(); let cancel_token = cancellation_token.unwrap_or_default();
let cancel_token_clone = cancel_token.clone();
let cancel_guard = Arc::new(CancelGuard(cancel_token));
// Spawn main background task with cancellation token // Spawn main background task with cancellation token
tokio::spawn(async move { tokio::spawn(async move {
...@@ -330,6 +343,7 @@ impl Scheduler { ...@@ -330,6 +343,7 @@ impl Scheduler {
Self { Self {
request_tx, request_tx,
metrics_rx, metrics_rx,
_cancel_guard: cancel_guard,
} }
} }
...@@ -360,13 +374,16 @@ async fn receive_requests( ...@@ -360,13 +374,16 @@ async fn receive_requests(
} }
if state.is_empty() { if state.is_empty() {
// Fully idle - block until new request arrives // Fully idle - block until new request arrives or shutdown
tokio::select! { tokio::select! {
biased; biased;
_ = cancel_token.cancelled() => { _ = cancel_token.cancelled() => {
return None; return None;
} }
Some(request) = request_rx.recv() => { result = request_rx.recv() => {
let Some(request) = result else {
return None; // channel closed
};
state.receive(request); state.receive(request);
return Some(()); return Some(());
} }
......
...@@ -123,7 +123,7 @@ sglang_configs = { ...@@ -123,7 +123,7 @@ sglang_configs = {
marks=[pytest.mark.gpu_2, pytest.mark.post_merge], marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={ env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info", "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
}, },
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[ request_payloads=[
......
...@@ -152,7 +152,7 @@ trtllm_configs = { ...@@ -152,7 +152,7 @@ trtllm_configs = {
) )
], ],
env={ env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info", "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
}, },
), ),
"disaggregated_router": TRTLLMConfig( "disaggregated_router": TRTLLMConfig(
......
...@@ -204,7 +204,7 @@ vllm_configs = { ...@@ -204,7 +204,7 @@ vllm_configs = {
) )
], ],
env={ env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info", "DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
}, },
), ),
"agg-router-approx": VLLMConfig( "agg-router-approx": VLLMConfig(
...@@ -235,7 +235,7 @@ vllm_configs = { ...@@ -235,7 +235,7 @@ vllm_configs = {
), ),
], ],
env={ env={
"DYN_LOG": "dynamo_llm::kv_router::scheduler=info", "DYN_LOG": "dynamo_kv_router::scheduling::selector=info",
}, },
), ),
"disaggregated": VLLMConfig( "disaggregated": VLLMConfig(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment