Unverified commit 2af062ec, authored by MatejKosec and committed by GitHub
Browse files

fix(kv-router): correct decode-worker scoring for unregistered workers in selector (#7919)


Signed-off-by: Matej Kosec <mkosec@nvidia.com>
parent 189d2178
......@@ -256,7 +256,8 @@ class KvRouterArgGroup(ArgGroup):
help=(
"KV Router: Queue threshold fraction for prefill token capacity. "
"Requests are queued if all workers exceed this fraction of "
"max_num_batched_tokens. Must be > 0."
"max_num_batched_tokens. Must be >= 0. Use 0.0 for maximum "
"queueing sensitivity (queue as soon as any tokens are active)."
),
arg_type=float,
)
......
......@@ -187,7 +187,7 @@ pub struct KvRouterConfig {
/// Queue threshold fraction for prefill token capacity.
/// When set, requests are queued if all workers exceed this fraction of max_num_batched_tokens.
/// If None, queueing is disabled and all requests go directly to ready.
/// Default: 2.0. Must be > 0.
/// Default: 4.0. Must be >= 0. Use 0.0 for maximum queueing sensitivity.
#[validate(range(min = 0.0))]
pub router_queue_threshold: Option<f64>,
......
......@@ -145,7 +145,12 @@ impl<C: WorkerConfigLike> WorkerSelector<C> for DefaultWorkerSelector {
let get_score = |worker: WorkerWithDpRank| -> f64 {
let overlap = *overlaps.get(&worker).unwrap_or(&0);
let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
// Use 0 for unregistered decode workers (track_prefill_tokens=false)
// to match registered idle workers; use isl otherwise.
let default_prefill_token = if request.track_prefill_tokens { isl } else { 0 };
let prefill_token = *prefill_tokens
.get(&worker)
.unwrap_or(&default_prefill_token);
let potential_prefill_block = (prefill_token as f64) / (block_size as f64);
let decode_block = *decode_blocks
.get(&worker)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment