Unverified commit e5850e23, authored by Yan Ru Pei, committed by GitHub
Browse files

feat(kv-router): add ActiveSequences benchmark and extract common bench utils (#6633)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent b302ec41
...@@ -3,10 +3,14 @@ ...@@ -3,10 +3,14 @@
//! Shared test utilities for radix tree tests. //! Shared test utilities for radix tree tests.
use std::future;
use crate::protocols::{ use crate::protocols::{
ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData, KvCacheRemoveData, KvCacheStoreData, ActiveLoad, ActiveSequenceEvent, ExternalSequenceBlockHash, KvCacheEvent, KvCacheEventData,
KvCacheStoredBlockData, LocalBlockHash, RouterEvent, WorkerId, KvCacheRemoveData, KvCacheStoreData, KvCacheStoredBlockData, LocalBlockHash, RouterEvent,
WorkerConfigLike, WorkerId, WorkerWithDpRank,
}; };
use crate::sequences::SequencePublisher;
/// Creates blocks with artificial hash mapping (hash * 100) for testing. /// Creates blocks with artificial hash mapping (hash * 100) for testing.
pub fn make_blocks(hashes: Vec<u64>) -> Vec<KvCacheStoredBlockData> { pub fn make_blocks(hashes: Vec<u64>) -> Vec<KvCacheStoredBlockData> {
...@@ -61,3 +65,51 @@ pub fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>) ...@@ -61,3 +65,51 @@ pub fn create_remove_event(worker_id: WorkerId, event_id: u64, hashes: Vec<u64>)
}, },
} }
} }
/// A [`SequencePublisher`] that discards everything handed to it.
///
/// Meant for tests and benchmarks that have no need for an event transport.
pub struct NoopSequencePublisher;

impl SequencePublisher for NoopSequencePublisher {
    /// Ignores the event and hands back a future that is already resolved to `Ok(())`.
    fn publish_event(
        &self,
        _event: &ActiveSequenceEvent,
    ) -> impl future::Future<Output = anyhow::Result<()>> + Send {
        // Nothing is published, so the outcome is known up front.
        let outcome: anyhow::Result<()> = Ok(());
        future::ready(outcome)
    }

    /// Ignores the load report.
    fn publish_load(&self, _load: ActiveLoad) {}

    /// Ignores the observation; none of the arguments are read.
    fn observe_load(&self, _worker: &WorkerWithDpRank, _: &str, _: usize, _: usize) {}
}
/// Bare-bones [`WorkerConfigLike`] value for scheduler/queue tests and benchmarks.
///
/// All fields are public so a test can set exactly the values it cares about.
#[derive(Debug, Clone)]
pub struct SimpleWorkerConfig {
    /// Data-parallel size; `1` in the default configuration.
    pub data_parallel_size: u32,
    /// Optional batched-token limit; `None` in the default configuration.
    pub max_num_batched_tokens: Option<u64>,
    /// Optional KV block count; `None` in the default configuration.
    pub total_kv_blocks: Option<u64>,
}

impl Default for SimpleWorkerConfig {
    /// A data-parallel size of one with both optional limits left unset.
    fn default() -> Self {
        let (max_num_batched_tokens, total_kv_blocks) = (None, None);
        SimpleWorkerConfig {
            data_parallel_size: 1,
            max_num_batched_tokens,
            total_kv_blocks,
        }
    }
}
/// Exposes the public fields of [`SimpleWorkerConfig`] through the
/// `WorkerConfigLike` accessors; each accessor returns its field unchanged.
impl WorkerConfigLike for SimpleWorkerConfig {
    /// Returns the stored `data_parallel_size`.
    fn data_parallel_size(&self) -> u32 {
        let Self {
            data_parallel_size, ..
        } = self;
        *data_parallel_size
    }

    /// Returns the stored `max_num_batched_tokens`.
    fn max_num_batched_tokens(&self) -> Option<u64> {
        let Self {
            max_num_batched_tokens,
            ..
        } = self;
        *max_num_batched_tokens
    }

    /// Returns the stored `total_kv_blocks`.
    fn total_kv_blocks(&self) -> Option<u64> {
        let Self {
            total_kv_blocks, ..
        } = self;
        *total_kv_blocks
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
...@@ -27,6 +26,8 @@ use validator::Validate; ...@@ -27,6 +26,8 @@ use validator::Validate;
pub use dynamo_kv_router::approx; pub use dynamo_kv_router::approx;
pub use dynamo_kv_router::indexer; pub use dynamo_kv_router::indexer;
pub use dynamo_kv_router::protocols; pub use dynamo_kv_router::protocols;
pub use dynamo_kv_router::scheduling;
pub use dynamo_kv_router::selector;
pub mod cache_control; pub mod cache_control;
pub mod config; pub mod config;
...@@ -56,10 +57,10 @@ use crate::{ ...@@ -56,10 +57,10 @@ use crate::{
indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface, KvRouterError}, indexer::{GetWorkersRequest, KvIndexer, KvIndexerInterface, KvRouterError},
protocols::{ protocols::{
BlockExtraInfo, DpRank, LocalBlockHash, OverlapScores, RouterEvent, RouterRequest, BlockExtraInfo, DpRank, LocalBlockHash, OverlapScores, RouterEvent, RouterRequest,
RouterResponse, TokensWithHashes, WorkerId, WorkerSelectionResult, WorkerWithDpRank, RouterResponse, TokensWithHashes, WorkerId, WorkerWithDpRank,
compute_block_hash_for_seq, compute_block_hash_for_seq,
}, },
scheduler::{KvScheduler, KvSchedulerError, PotentialLoad, SchedulingRequest}, scheduler::{KvScheduler, PotentialLoad},
sequence::{SequenceError, SequenceRequest}, sequence::{SequenceError, SequenceRequest},
}, },
local_model::runtime_config::ModelRuntimeConfig, local_model::runtime_config::ModelRuntimeConfig,
...@@ -118,15 +119,9 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery ...@@ -118,15 +119,9 @@ pub fn router_discovery_query(namespace: String, component: String) -> Discovery
} }
} }
/// A trait that users can implement to define custom selection logic /// Concrete `WorkerSelector` bound to the runtime config type.
pub trait WorkerSelector { pub type WorkerSelector =
fn select_worker( dyn dynamo_kv_router::selector::WorkerSelector<ModelRuntimeConfig> + Send + Sync;
&self,
workers: &HashMap<protocols::WorkerId, ModelRuntimeConfig>,
request: &SchedulingRequest,
block_size: u32,
) -> Result<WorkerSelectionResult, KvSchedulerError>;
}
#[derive(Clone)] #[derive(Clone)]
pub enum Indexer { pub enum Indexer {
...@@ -297,7 +292,7 @@ impl KvRouter { ...@@ -297,7 +292,7 @@ impl KvRouter {
client: Client, client: Client,
mut workers_with_configs: RuntimeConfigWatch, mut workers_with_configs: RuntimeConfigWatch,
block_size: u32, block_size: u32,
selector: Option<Box<dyn WorkerSelector + Send + Sync>>, selector: Option<Box<WorkerSelector>>,
kv_router_config: Option<KvRouterConfig>, kv_router_config: Option<KvRouterConfig>,
worker_type: &'static str, worker_type: &'static str,
) -> Result<Self> { ) -> Result<Self> {
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use derive_builder::Builder; pub use dynamo_kv_router::config::{KvRouterConfig, RouterConfigOverride};
use rand::Rng;
use serde::{Deserialize, Serialize};
use validator::{Validate, ValidationError};
use crate::kv_router::protocols::{compute_block_hash_for_seq, compute_seq_hash_for_block};
/// Override configuration for router settings that can be specified per-request.
///
/// Every field is optional and marked `#[builder(default)]`, so a field that is
/// not set through the generated builder is left as `None`.
#[derive(Debug, Clone, Default, Builder, Serialize, Deserialize, Validate)]
pub struct RouterConfigOverride {
/// Per-request overlap score weight.
/// NOTE(review): presumably replaces `KvRouterConfig::overlap_score_weight` when
/// `Some`; the code consuming this field is not visible here — confirm.
#[builder(default)]
pub overlap_score_weight: Option<f64>,
/// Per-request router temperature, validated with `range(min = 0.0)`.
/// NOTE(review): presumably replaces `KvRouterConfig::router_temperature` when
/// `Some`; the code consuming this field is not visible here — confirm.
#[builder(default)]
#[validate(range(min = 0.0))]
pub router_temperature: Option<f64>,
/// Per-request KV-reuse assumption. When `Some`, it takes precedence over
/// `KvRouterConfig::router_assume_kv_reuse` in
/// `KvRouterConfig::compute_seq_hashes_for_tracking`; when `None`, the
/// router-level setting is used.
#[builder(default)]
pub assume_kv_reuse: Option<bool>,
}
/// KV Router configuration parameters.
///
/// Field-level `range` validations are complemented by the schema-level check
/// `validate_kv_router_config`, which rejects inconsistent flag combinations.
/// Default values are defined by the `Default` impl for this type.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, Validate)]
#[validate(schema(function = "validate_kv_router_config"))]
pub struct KvRouterConfig {
/// Overlap score weight (default: 1.0). Validated with `range(min = 0.0)`.
/// A value that is not greater than 0 makes `should_subscribe_to_kv_events`
/// return false.
#[validate(range(min = 0.0))]
pub overlap_score_weight: f64,
/// Router temperature (default: 0.0). Validated with `range(min = 0.0)`.
/// NOTE(review): presumably controls sampling randomness in worker selection;
/// the consuming code is not visible here — confirm.
#[validate(range(min = 0.0))]
pub router_temperature: f64,
/// Whether KV events are used (default: true). When false,
/// `should_subscribe_to_kv_events` returns false and `durable_kv_events`
/// must also be false.
pub use_kv_events: bool,
/// **Deprecated:** Enable durable KV events using NATS JetStream instead of the default event plane.
/// This option will be removed in a future release. The event-plane subscriber
/// (local_indexer mode) is now the recommended path.
/// Requires `use_kv_events=true` (default: false).
pub durable_kv_events: bool,
/// Router replica synchronization flag (default: false).
/// NOTE(review): exact semantics are not visible in this file — confirm.
pub router_replica_sync: bool,
/// Whether to track active blocks in the router (default: true)
pub router_track_active_blocks: bool,
/// Whether to track output blocks during generation (default: false)
/// When enabled, the router adds placeholder blocks as tokens are generated
/// and applies fractional decay based on progress toward agent_hints.osl.
/// Requires `router_track_active_blocks=true`.
pub router_track_output_blocks: bool,
/// Whether to assume KV cache reuse when tracking active blocks (default: true).
/// When true, computes actual block hashes for sequence tracking.
/// When false, generates random hashes (assuming no KV cache reuse).
pub router_assume_kv_reuse: bool,
/// Threshold for triggering snapshots. If None, no snapshots will be performed.
/// (default: `Some(1000000)`)
#[validate(range(min = 1))]
pub router_snapshot_threshold: Option<u32>,
/// Whether to reset the router state on startup (default: false)
pub router_reset_states: bool,
/// TTL for blocks in seconds (only used when use_kv_events is false, default: 120.0)
#[validate(range(min = 0.0))]
pub router_ttl_secs: f64,
/// Maximum tree size before pruning (only used when use_kv_events is false, default: 2^20 = 1048576)
#[validate(range(min = 1))]
pub router_max_tree_size: usize,
/// Target size ratio after pruning (only used when use_kv_events is false, default: 0.8)
#[validate(range(min = 0.0, max = 1.0))]
pub router_prune_target_ratio: f64,
/// Queue threshold fraction for prefill token capacity.
/// When set, requests are queued if all workers exceed this fraction of max_num_batched_tokens.
/// If None (default), queueing is disabled and all requests go directly to ready.
/// Must be > 0.
/// NOTE(review): the `range(min = 0.0)` validation below appears to accept
/// exactly 0.0 (assuming validator's `min` bound is inclusive — confirm),
/// which does not enforce the "> 0" requirement stated above.
#[validate(range(min = 0.0))]
pub router_queue_threshold: Option<f64>,
/// Number of event processing threads for the KV indexer.
/// When > 1, uses ConcurrentRadixTree with a thread pool instead of the
/// single-threaded RadixTree. Default: 4.
#[validate(range(min = 1))]
pub router_event_threads: u32,
/// Enable cache control (PIN with TTL) via the worker's cache_control service mesh endpoint.
/// When true, the router creates a cache_control client and honors nvext.cache_control on
/// requests, firing a pin_prefix call (with TTL) to the worker after generation completes.
/// When false (default), cache_control is ignored and no cache_control client is created.
pub router_enable_cache_control: bool,
}
impl Default for KvRouterConfig {
fn default() -> Self {
Self {
overlap_score_weight: 1.0,
router_temperature: 0.0,
use_kv_events: true,
durable_kv_events: false, // default to NATS Core (local indexer mode)
router_replica_sync: false,
router_track_active_blocks: true,
router_track_output_blocks: false,
router_assume_kv_reuse: true,
router_snapshot_threshold: Some(1000000),
router_reset_states: false,
router_ttl_secs: 120.0,
router_max_tree_size: 2usize.pow(20), // 2^20 = 1048576, matches PruneConfig::default()
router_prune_target_ratio: 0.8,
router_queue_threshold: None,
router_event_threads: 4,
router_enable_cache_control: false,
}
}
}
/// Schema-level validation for [`KvRouterConfig`].
///
/// Emits a deprecation warning whenever `durable_kv_events` is enabled, then
/// rejects two inconsistent combinations:
/// - `durable_kv_events` without `use_kv_events`
/// - `router_track_output_blocks` without `router_track_active_blocks`
///
/// # Errors
/// Returns a [`ValidationError`] naming the first violated requirement.
fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationError> {
    if config.durable_kv_events {
        // Warn first so the deprecation notice is logged even when validation fails below.
        tracing::warn!(
            "--durable-kv-events is deprecated and will be removed in a future release. \
             The event-plane subscriber (local_indexer mode) is now the recommended path."
        );

        if !config.use_kv_events {
            let err = ValidationError::new("durable_kv_events requires use_kv_events=true");
            return Err(err);
        }
    }

    let output_tracking_without_active =
        config.router_track_output_blocks && !config.router_track_active_blocks;
    if output_tracking_without_active {
        let err =
            ValidationError::new("router_track_output_blocks requires router_track_active_blocks=true");
        return Err(err);
    }

    Ok(())
}
impl KvRouterConfig {
    /// Produce the sequence hashes used for active block tracking.
    ///
    /// The result depends on the configuration (and the optional per-request override):
    /// - `None` when `router_track_active_blocks` is false
    /// - an empty vector when `tokens` does not fill a single block
    /// - one random `u64` per full block when KV reuse is not assumed
    /// - the real sequence hashes of the token blocks otherwise
    ///
    /// `config_override.assume_kv_reuse`, when present, wins over
    /// `router_assume_kv_reuse`.
    ///
    /// # Panics
    /// Panics on integer division by zero if `block_size` is 0 while
    /// `router_track_active_blocks` is true.
    pub fn compute_seq_hashes_for_tracking(
        &self,
        tokens: &[u32],
        block_size: u32,
        config_override: Option<&RouterConfigOverride>,
        lora_name: Option<&str>,
    ) -> Option<Vec<u64>> {
        if !self.router_track_active_blocks {
            return None;
        }

        // Only complete blocks count; a trailing partial block is ignored.
        let full_blocks = tokens.len() / block_size as usize;
        if full_blocks == 0 {
            return Some(Vec::new());
        }

        let reuse_kv = match config_override.and_then(|ovr| ovr.assume_kv_reuse) {
            Some(flag) => flag,
            None => self.router_assume_kv_reuse,
        };

        if !reuse_kv {
            // No reuse assumed: hand out one fresh random hash per full block.
            let mut rng = rand::rng();
            let random_hashes: Vec<u64> = std::iter::repeat_with(|| rng.random::<u64>())
                .take(full_blocks)
                .collect();
            return Some(random_hashes);
        }

        let block_hashes = compute_block_hash_for_seq(tokens, block_size, None, lora_name);
        let seq_hashes = compute_seq_hash_for_block(&block_hashes);
        Some(seq_hashes)
    }

    /// Decide whether the KV event subscription should be started.
    ///
    /// The answer is false when either:
    /// - KV events are disabled (`use_kv_events=false`)
    /// - Overlap scoring is disabled (`overlap_score_weight=0`)
    ///
    /// In that case the router skips starting the KV event subscription entirely,
    /// avoiding the need to query workers for their local indexer state.
    pub fn should_subscribe_to_kv_events(&self) -> bool {
        if !self.use_kv_events {
            return false;
        }
        self.overlap_score_weight > 0.0
    }
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment