Unverified commit e5850e23, authored by Yan Ru Pei, committed by GitHub
Browse files

feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent b302ec41
......@@ -3,10 +3,14 @@
//! Shared test utilities for radix tree tests.
use std::future;
use crate::protocols::{
ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData,
KvCacheStoredBlockData, LocalBlockHash, RouterEvent, WorkerId,
ActiveLoad, ActiveSequenceEvent, ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData,
KvCacheRemoveData, KvCacheStoreData, KvCacheStoredBlockData, LocalBlockHash, RouterEvent,
WorkerConfigLike, WorkerId, WorkerWithDpRank,
};
use crate::sequences::SequencePublisher;
/// Creates blocks with artificial hash mapping (hash * 100) for testing.
pub fn make_blocks(hashes: Vec<u64>) -> Vec<KvCacheStoredBlockData> {
......@@ -61,3 +65,51 @@ pub fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>)
},
}
}
/// No-op [`SequencePublisher`] for tests and benchmarks that don't need event transport.
///
/// Every method discards its arguments: nothing is sent, stored, or logged.
/// The type carries no state, so it derives `Debug`, `Clone`, and `Default`
/// to be usable wherever generic code asks for those bounds.
#[derive(Debug, Clone, Default)]
pub struct NoopSequencePublisher;

impl SequencePublisher for NoopSequencePublisher {
    /// Ignores `_event` and returns an already-completed future that resolves to `Ok(())`.
    fn publish_event(
        &self,
        _event: &ActiveSequenceEvent,
    ) -> impl future::Future<Output = anyhow::Result<()>> + Send {
        future::ready(Ok(()))
    }

    /// Drops the load report without publishing it.
    fn publish_load(&self, _load: ActiveLoad) {}

    /// Drops the observation without recording it.
    fn observe_load(&self, _: &WorkerWithDpRank, _: &str, _: usize, _: usize) {}
}
/// Minimal [`WorkerConfigLike`] for scheduler/queue tests and benchmarks.
///
/// A plain value type: all fields are public so tests can build one with a
/// struct literal or tweak [`Default::default`] via struct-update syntax.
/// `PartialEq`/`Eq` are derived so configs can be compared with `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SimpleWorkerConfig {
    /// Value reported as the worker's data-parallel size.
    pub data_parallel_size: u32,
    /// Value reported as the max batched-token limit; `None` when unset.
    pub max_num_batched_tokens: Option<u64>,
    /// Value reported as the total KV block count; `None` when unset.
    pub total_kv_blocks: Option<u64>,
}

impl Default for SimpleWorkerConfig {
    /// Returns a config with `data_parallel_size == 1` and both optional limits unset.
    fn default() -> Self {
        Self {
            data_parallel_size: 1,
            max_num_batched_tokens: None,
            total_kv_blocks: None,
        }
    }
}
impl WorkerConfigLike for SimpleWorkerConfig {
    /// Returns the stored `data_parallel_size` field unchanged.
    fn data_parallel_size(&self) -> u32 {
        let Self {
            data_parallel_size, ..
        } = self;
        *data_parallel_size
    }

    /// Returns the stored `max_num_batched_tokens` field unchanged.
    fn max_num_batched_tokens(&self) -> Option<u64> {
        let Self {
            max_num_batched_tokens,
            ..
        } = self;
        *max_num_batched_tokens
    }

    /// Returns the stored `total_kv_blocks` field unchanged.
    fn total_kv_blocks(&self) -> Option<u64> {
        let Self {
            total_kv_blocks, ..
        } = self;
        *total_kv_blocks
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
......@@ -27,6 +26,8 @@ use validator::Validate;
pub use dynamo_kv_router::approx;
pub use dynamo_kv_router::indexer;
pub use dynamo_kv_router::protocols;
pub use dynamo_kv_router::scheduling;
pub use dynamo_kv_router::selector;
pub mod cache_control;
pub mod config;
......@@ -56,10 +57,10 @@ use crate::{
indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface, KvRouterError},
protocols::{
BlockExtraInfo, DpRank, LocalBlockHash, OverlapScores, RouterEvent, RouterRequest,
RouterResponse, TokensWithHashes, WorkerId, WorkerSelectionResult, WorkerWithDpRank,
RouterResponse, TokensWithHashes, WorkerId, WorkerWithDpRank,
compute_block_hash_for_seq,
},
scheduler::{KvScheduler, KvSchedulerError, PotentialLoad, SchedulingRequest},
scheduler::{KvScheduler, PotentialLoad},
sequence::{SequenceError, SequenceRequest},
},
local_model::runtime_config::ModelRuntimeConfig,
......@@ -118,15 +119,9 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
}
}
/// A trait that users can implement to define custom selection logic
///
/// The single method receives the known workers (keyed by worker id, with each
/// worker's [`ModelRuntimeConfig`]), the request being scheduled, and a block
/// size, and yields either a `WorkerSelectionResult` or a `KvSchedulerError`.
pub trait WorkerSelector {
/// Chooses a worker for `request` out of `workers`.
///
/// * `workers` - runtime configuration per worker id.
/// * `request` - the scheduling request to place.
/// * `block_size` - presumably the KV block size used for this router — TODO confirm units.
///
/// # Errors
/// Returns `KvSchedulerError` when the implementation cannot produce a selection.
fn select_worker(
&self,
workers: &HashMap<protocols::WorkerId, ModelRuntimeConfig>,
request: &SchedulingRequest,
block_size: u32,
) -> Result<WorkerSelectionResult, KvSchedulerError>;
}
/// Concrete `WorkerSelector` bound to the runtime config type.
///
/// This is a trait-object alias: the generic
/// `dynamo_kv_router::selector::WorkerSelector` specialised to
/// [`ModelRuntimeConfig`], with `Send + Sync` bounds attached. Because it names
/// a `dyn` type it is unsized and is used behind a pointer, e.g.
/// `Box<WorkerSelector>`.
pub type WorkerSelector =
dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync;
#[derive(Clone)]
pub enum Indexer {
......@@ -297,7 +292,7 @@ impl KvRouter {
client: Client,
mut workers_with_configs: RuntimeConfigWatch,
block_size: u32,
selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
selector: Option<Box<WorkerSelector>>,
kv_router_config: Option<KvRouterConfig>,
worker_type: &'static str,
) -> Result<Self> {
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -80,6 +80,20 @@ impl Default for ModelRuntimeConfig {
}
}
/// Exposes the runtime config's capacity fields through the router's
/// `WorkerConfigLike` abstraction; each accessor returns the field of the same name.
impl dynamo_kv_router::WorkerConfigLike for ModelRuntimeConfig {
    /// Returns the `data_parallel_size` field.
    fn data_parallel_size(&self) -> u32 {
        let Self {
            data_parallel_size, ..
        } = self;
        *data_parallel_size
    }

    /// Returns the `max_num_batched_tokens` field.
    fn max_num_batched_tokens(&self) -> Option<u64> {
        let Self {
            max_num_batched_tokens,
            ..
        } = self;
        *max_num_batched_tokens
    }

    /// Returns the `total_kv_blocks` field.
    fn total_kv_blocks(&self) -> Option<u64> {
        let Self {
            total_kv_blocks, ..
        } = self;
        *total_kv_blocks
    }
}
impl ModelRuntimeConfig {
pub fn new() -> Self {
Self::default()
......
......@@ -214,6 +214,8 @@ pub struct MockVllmEngine {
engine_args: MockEngineArgs,
/// Bootstrap server for prefill workers in disaggregated mode
bootstrap_server: Arc<OnceCell<Arc<BootstrapServer>>>,
/// Keep schedulers alive so their CancelGuards don't fire prematurely.
_schedulers: OnceCell<Vec<Scheduler>>,
}
impl MockVllmEngine {
......@@ -225,6 +227,7 @@ impl MockVllmEngine {
senders_ready: Notify::new(),
engine_args,
bootstrap_server: Arc::new(OnceCell::new()),
_schedulers: OnceCell::new(),
}
}
......@@ -268,6 +271,8 @@ impl MockVllmEngine {
Self::start_metrics_publishing(&schedulers, component, cancel_token.clone()).await?;
let _ = self._schedulers.set(schedulers);
Ok(())
}
......
......@@ -246,11 +246,22 @@ impl SchedulerState {
}
}
/// Drop guard around a [`CancellationToken`]: dropping the guard cancels the token.
///
/// `Scheduler` holds this behind an `Arc`, so the token is cancelled only once
/// the last `Scheduler` clone (and with it the last `Arc` reference) goes away.
///
/// NOTE(review): the guard cancels whatever token it was given, including one
/// supplied by the caller — confirm callers do not share that token with work
/// that must outlive the scheduler.
struct CancelGuard(CancellationToken);

impl Drop for CancelGuard {
    /// Signals cancellation on the wrapped token.
    fn drop(&mut self) {
        let Self(token) = self;
        token.cancel();
    }
}
/// Manages scheduling of requests using KvManager resources
///
/// The handle is `Clone`; all clones share one `CancelGuard` through an `Arc`,
/// so the guard's `Drop` (which cancels its token) runs only when the last
/// clone is dropped.
#[derive(Clone)]
pub struct Scheduler {
/// Sending half of the unbounded channel that carries `DirectRequest`s.
request_tx: mpsc::UnboundedSender<DirectRequest>,
/// Watch-channel receiver for `MockerMetrics` values.
metrics_rx: tokio::sync::watch::Receiver<MockerMetrics>,
/// Never read; held only so the shared guard stays alive as long as any clone does.
_cancel_guard: Arc<CancelGuard>,
}
impl Scheduler {
......@@ -273,7 +284,9 @@ impl Scheduler {
let (metrics_tx, metrics_rx) =
tokio::sync::watch::channel::<MockerMetrics>(initial_metrics);
let cancel_token_clone = cancellation_token.unwrap_or_default().clone();
let cancel_token = cancellation_token.unwrap_or_default();
let cancel_token_clone = cancel_token.clone();
let cancel_guard = Arc::new(CancelGuard(cancel_token));
// Spawn main background task with cancellation token
tokio::spawn(async move {
......@@ -330,6 +343,7 @@ impl Scheduler {
Self {
request_tx,
metrics_rx,
_cancel_guard: cancel_guard,
}
}
......@@ -360,13 +374,16 @@ async fn receive_requests(
}
if state.is_empty() {
// Fully idle - block until new request arrives
// Fully idle - block until new request arrives or shutdown
tokio::select! {
biased;
_ = cancel_token.cancelled() => {
return None;
}
Some(request) = request_rx.recv() => {
result = request_rx.recv() => {
let Some(request) = result else {
return None; // channel closed
};
state.receive(request);
return Some(());
}
......
......@@ -123,7 +123,7 @@ sglang_configs = {
marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
model="Qwen/Qwen3-0.6B",
env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
},
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
......
......@@ -152,7 +152,7 @@ trtllm_configs = {
)
],
env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
},
),
"disaggregated_router": TRTLLMConfig(
......
......@@ -204,7 +204,7 @@ vllm_configs = {
)
],
env={
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_llm::kv_router::scheduler=info",
"DYN_LOG": "dynamo_llm::kv_router::publisher=trace,dynamo_kv_router::scheduling::selector=info",
},
),
"agg-router-approx": VLLMConfig(
......@@ -235,7 +235,7 @@ vllm_configs = {
),
],
env={
"DYN_LOG": "dynamo_llm::kv_router::scheduler=info",
"DYN_LOG": "dynamo_kv_router::scheduling::selector=info",
},
),
"disaggregated": VLLMConfig(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment