Unverified commit bddaaa26, authored by Yan Ru Pei, committed by GitHub
Browse files

feat(kv-router): pluggable scheduling policy for router queue [DYN-2454] (#7260)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 12785247
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
pub use dynamo_kv_router::scheduling::policy::RouterSchedulingPolicy;
pub use dynamo_kv_router::scheduling::{ pub use dynamo_kv_router::scheduling::{
KvSchedulerError, PotentialLoad, SchedulingRequest, SchedulingResponse, KvSchedulerError, PotentialLoad, SchedulingRequest, SchedulingResponse,
}; };
...@@ -100,12 +101,20 @@ impl KvScheduler { ...@@ -100,12 +101,20 @@ impl KvScheduler {
let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024); let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
let scheduler_cancel_token = component.drt().primary_token(); let scheduler_cancel_token = component.drt().primary_token();
let policy =
RouterSchedulingPolicy::new(kv_router_config.router_queue_policy, block_size as usize);
tracing::info!(
"Router queue policy: {}",
kv_router_config.router_queue_policy
);
let queue = Arc::new(SchedulerQueue::new( let queue = Arc::new(SchedulerQueue::new(
slots.clone(), slots.clone(),
workers_with_configs.clone(), workers_with_configs.clone(),
kv_router_config.router_queue_threshold, kv_router_config.router_queue_threshold,
block_size, block_size,
selector, selector,
policy,
)); ));
let queue_clone = queue.clone(); let queue_clone = queue.clone();
......
...@@ -213,7 +213,7 @@ impl KvManager { ...@@ -213,7 +213,7 @@ impl KvManager {
// If at max capacity, evict the oldest entry from inactive blocks // If at max capacity, evict the oldest entry from inactive blocks
if self.cache.is_at_capacity() { if self.cache.is_at_capacity() {
let Some(evicted) = self.cache.evict_inactive() else { let Some(evicted) = self.cache.evict_inactive() else {
return allocated; break;
}; };
tracing::trace!( tracing::trace!(
"Evicting block from inactive pool: {evicted:?}, dp_rank={}", "Evicting block from inactive pool: {evicted:?}, dp_rank={}",
......
...@@ -331,6 +331,16 @@ pub mod model { ...@@ -331,6 +331,16 @@ pub mod model {
} }
} }
/// KV Router configuration environment variables
///
/// Each constant below holds the *name* of an environment variable (the string
/// value is identical to the constant's identifier), not the setting's value.
/// NOTE(review): the code that reads and parses these variables is not part of
/// this module — confirm parsing and default behavior at the call sites.
pub mod router {
/// Queue threshold fraction for prefill token capacity.
/// When set, requests are queued if all workers exceed this fraction of max_num_batched_tokens.
// NOTE(review): the accepted numeric range and the behavior when the variable
// is unset are not visible here — verify against the router configuration code.
pub const DYN_ROUTER_QUEUE_THRESHOLD: &str = "DYN_ROUTER_QUEUE_THRESHOLD";
/// Scheduling policy for the router queue ("fcfs" or "wspt").
// NOTE(review): presumably "fcfs" is first-come-first-served and "wspt" is
// weighted shortest processing time — confirm against the scheduling policy type.
pub const DYN_ROUTER_QUEUE_POLICY: &str = "DYN_ROUTER_QUEUE_POLICY";
}
/// Event Plane transport environment variables /// Event Plane transport environment variables
pub mod event_plane { pub mod event_plane {
/// Event transport selection: "zmq" or "nats". Default: "nats" /// Event transport selection: "zmq" or "nats". Default: "nats"
...@@ -481,6 +491,9 @@ mod tests { ...@@ -481,6 +491,9 @@ mod tests {
model::huggingface::HF_HUB_CACHE, model::huggingface::HF_HUB_CACHE,
model::huggingface::HF_HOME, model::huggingface::HF_HOME,
model::huggingface::HF_HUB_OFFLINE, model::huggingface::HF_HUB_OFFLINE,
// Router
router::DYN_ROUTER_QUEUE_THRESHOLD,
router::DYN_ROUTER_QUEUE_POLICY,
// Event Plane // Event Plane
event_plane::DYN_EVENT_PLANE, event_plane::DYN_EVENT_PLANE,
event_plane::DYN_EVENT_PLANE_CODEC, event_plane::DYN_EVENT_PLANE_CODEC,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment