Unverified commit 2af062ec, authored by MatejKosec, committed by GitHub
Browse files

fix(kv-router): correct decode-worker scoring for unregistered workers in selector (#7919)


Signed-off-by: Matej Kosec <mkosec@nvidia.com>
parent 189d2178
...@@ -256,7 +256,8 @@ class KvRouterArgGroup(ArgGroup): ...@@ -256,7 +256,8 @@ class KvRouterArgGroup(ArgGroup):
help=( help=(
"KV Router: Queue threshold fraction for prefill token capacity. " "KV Router: Queue threshold fraction for prefill token capacity. "
"Requests are queued if all workers exceed this fraction of " "Requests are queued if all workers exceed this fraction of "
"max_num_batched_tokens. Must be > 0." "max_num_batched_tokens. Must be >= 0. Use 0.0 for maximum "
"queueing sensitivity (queue as soon as any tokens are active)."
), ),
arg_type=float, arg_type=float,
) )
......
...@@ -187,7 +187,7 @@ pub struct KvRouterConfig { ...@@ -187,7 +187,7 @@ pub struct KvRouterConfig {
/// Queue threshold fraction for prefill token capacity. /// Queue threshold fraction for prefill token capacity.
/// When set, requests are queued if all workers exceed this fraction of max_num_batched_tokens. /// When set, requests are queued if all workers exceed this fraction of max_num_batched_tokens.
/// If None, queueing is disabled and all requests go directly to ready. /// If None, queueing is disabled and all requests go directly to ready.
/// Default: 2.0. Must be > 0. /// Default: 4.0. Must be >= 0. Use 0.0 for maximum queueing sensitivity.
#[validate(range(min = 0.0))] #[validate(range(min = 0.0))]
pub router_queue_threshold: Option<f64>, pub router_queue_threshold: Option<f64>,
......
...@@ -145,7 +145,12 @@ impl<C: WorkerConfigLike> WorkerSelector<C> for DefaultWorkerSelector { ...@@ -145,7 +145,12 @@ impl<C: WorkerConfigLike> WorkerSelector<C> for DefaultWorkerSelector {
let get_score = |worker: WorkerWithDpRank| -> f64 { let get_score = |worker: WorkerWithDpRank| -> f64 {
let overlap = *overlaps.get(&worker).unwrap_or(&0); let overlap = *overlaps.get(&worker).unwrap_or(&0);
let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl); // Use 0 for unregistered decode workers (track_prefill_tokens=false)
// to match registered idle workers; use isl otherwise.
let default_prefill_token = if request.track_prefill_tokens { isl } else { 0 };
let prefill_token = *prefill_tokens
.get(&worker)
.unwrap_or(&default_prefill_token);
let potential_prefill_block = (prefill_token as f64) / (block_size as f64); let potential_prefill_block = (prefill_token as f64) / (block_size as f64);
let decode_block = *decode_blocks let decode_block = *decode_blocks
.get(&worker) .get(&worker)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment