// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

4
use crate::local_model::runtime_config::{DisaggregatedEndpoint, ModelRuntimeConfig};
5
use anyhow::Result;
6
use dynamo_runtime::component::Component;
Yan Ru Pei's avatar
Yan Ru Pei committed
7
use dynamo_runtime::traits::DistributedRuntimeProvider;
Neelay Shah's avatar
Neelay Shah committed
8
use dynamo_runtime::traits::events::EventPublisher;
9
use rand::Rng;
10
use serde::{Deserialize, Serialize};
11
use std::collections::{HashMap, HashSet};
12
13
use std::sync::Arc;
use std::time::Duration;
Yan Ru Pei's avatar
Yan Ru Pei committed
14
use tokio::sync::{RwLock, watch};
15

16
17
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
18
use super::RouterConfigOverride;
19
use super::WorkerSelector;
20
use super::indexer::OverlapScores;
Yan Ru Pei's avatar
Yan Ru Pei committed
21
use super::protocols::{DpRank, WorkerId, WorkerSelectionResult, WorkerWithDpRank};
22
use super::sequence::{ActiveSequencesMultiWorker, SequenceError};
23

24
use crate::tokens::SequenceHash;
25

26
27
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
28
29
30
    pub worker_id: WorkerId,
    #[serde(default)]
    pub dp_rank: DpRank,
31
    pub isl_blocks: usize,
32
    pub overlap_blocks: u32,
33
34
}

35
36
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
37
38
    pub worker_id: WorkerId,
    pub dp_rank: DpRank,
39
40
41
42
    pub potential_prefill_tokens: usize,
    pub potential_decode_blocks: usize,
}

43
44
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
45
    #[error("no endpoints available to route work")]
46
47
48
49
50
51
52
53
54
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
}

55
56
#[derive(Debug)]
pub struct SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
57
    pub best_worker: WorkerWithDpRank,
58
    pub overlap_blocks: u32,
59
60
}

61
pub struct SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
62
    pub maybe_request_id: Option<String>,
63
    pub token_seq: Option<Vec<SequenceHash>>,
64
    pub isl_tokens: usize,
65
    pub overlaps: OverlapScores,
Yan Ru Pei's avatar
Yan Ru Pei committed
66
67
    pub decode_blocks: HashMap<WorkerWithDpRank, usize>,
    pub prefill_tokens: HashMap<WorkerWithDpRank, usize>,
68
69
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
70
71
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
72
73
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
74
75
76
}

impl SchedulingRequest {
77
78
79
80
81
82
83
84
85
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
86
87
88
89
90
91
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
92
    slots: Arc<ActiveSequencesMultiWorker>,
93
94
    /// Worker runtime configs for looking up disaggregated endpoints
    workers_with_configs: Arc<RwLock<HashMap<WorkerId, Option<ModelRuntimeConfig>>>>,
95
96
97
98
}

impl KvScheduler {
    pub async fn start(
99
        component: Component,
100
        block_size: u32,
101
        instance_ids_rx: watch::Receiver<Vec<u64>>,
Yan Ru Pei's avatar
Yan Ru Pei committed
102
        runtime_configs_rx: watch::Receiver<HashMap<WorkerId, ModelRuntimeConfig>>,
103
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
104
        replica_sync: bool,
105
        router_uuid: String,
106
    ) -> Result<Self, KvSchedulerError> {
107
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
108
        let instance_ids: Vec<u64> = instance_ids_rx.borrow().clone();
Yan Ru Pei's avatar
Yan Ru Pei committed
109
110
        let runtime_configs: HashMap<WorkerId, ModelRuntimeConfig> =
            runtime_configs_rx.borrow().clone();
111

Yan Ru Pei's avatar
Yan Ru Pei committed
112
        // Create shared workers_with_configs wrapped in Arc<RwLock>
Yan Ru Pei's avatar
Yan Ru Pei committed
113
        let workers_with_configs: Arc<RwLock<HashMap<WorkerId, Option<ModelRuntimeConfig>>>> = {
Yan Ru Pei's avatar
Yan Ru Pei committed
114
            let mut initial_map = HashMap::new();
115
116
            for worker_id in &instance_ids {
                let config = runtime_configs.get(worker_id).cloned();
Yan Ru Pei's avatar
Yan Ru Pei committed
117
118
                if config.is_some() {
                    tracing::info!("Runtime config found for worker_id: {}", worker_id);
119
                }
120
                initial_map.insert(*worker_id, config);
121
            }
Yan Ru Pei's avatar
Yan Ru Pei committed
122
123
            Arc::new(RwLock::new(initial_map))
        };
124

125
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
Yan Ru Pei's avatar
Yan Ru Pei committed
126
            component.clone(),
127
            block_size as usize,
Yan Ru Pei's avatar
Yan Ru Pei committed
128
            workers_with_configs.read().await.clone(), // this includes dp_size info
129
            replica_sync,
130
            router_uuid,
131
        ));
132

Yan Ru Pei's avatar
Yan Ru Pei committed
133
134
135
        // Spawn background task to monitor and update workers_with_configs
        let workers_monitor = workers_with_configs.clone();
        let slots_monitor = slots.clone();
136
        let mut instance_ids_monitor_rx = instance_ids_rx.clone();
Yan Ru Pei's avatar
Yan Ru Pei committed
137
        let mut configs_monitor_rx = runtime_configs_rx.clone();
138
        let monitor_cancel_token = component.drt().child_token();
139
        tokio::spawn(async move {
Yan Ru Pei's avatar
Yan Ru Pei committed
140
            tracing::trace!("workers monitoring task started");
141
            loop {
Yan Ru Pei's avatar
Yan Ru Pei committed
142
143
144
145
                // Wait for either instances or configs to change
                tokio::select! {
                    _ = monitor_cancel_token.cancelled() => {
                        tracing::trace!("workers monitoring task shutting down");
146
147
                        break;
                    }
148
                    result = instance_ids_monitor_rx.changed() => {
Yan Ru Pei's avatar
Yan Ru Pei committed
149
                        if result.is_err() {
150
                            tracing::warn!("instance IDs watch sender shutdown in monitor");
Yan Ru Pei's avatar
Yan Ru Pei committed
151
152
                            break;
                        }
153
                    }
Yan Ru Pei's avatar
Yan Ru Pei committed
154
155
156
157
158
                    result = configs_monitor_rx.changed() => {
                        if result.is_err() {
                            tracing::warn!("runtime configs watch sender shutdown in monitor");
                            break;
                        }
159
160
161
                    }
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
162
                // Get the latest values from both channels
163
                let new_instance_ids = instance_ids_monitor_rx.borrow_and_update().clone();
Yan Ru Pei's avatar
Yan Ru Pei committed
164
165
                let new_configs = configs_monitor_rx.borrow_and_update().clone();

Yan Ru Pei's avatar
Yan Ru Pei committed
166
167
                // Build the new workers_with_configs map
                let mut new_workers_with_configs = HashMap::new();
168
169
                for worker_id in &new_instance_ids {
                    let config = new_configs.get(worker_id).cloned();
Yan Ru Pei's avatar
Yan Ru Pei committed
170
171
                    if config.is_some() {
                        tracing::info!("Runtime config found for worker_id: {}", worker_id);
172
                    }
173
                    new_workers_with_configs.insert(*worker_id, config);
Yan Ru Pei's avatar
Yan Ru Pei committed
174
                }
Yan Ru Pei's avatar
Yan Ru Pei committed
175
176
177
178
179
180
181

                // Update workers when instances change
                slots_monitor.update_workers(new_workers_with_configs.clone());

                // Update the shared workers_with_configs
                let mut workers_map = workers_monitor.write().await;
                *workers_map = new_workers_with_configs;
Yan Ru Pei's avatar
Yan Ru Pei committed
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
                tracing::trace!(
                    "Updated workers_with_configs with {} workers",
                    workers_map.len()
                );
            }
            tracing::trace!("workers monitoring task shutting down");
        });

        let slots_clone = slots.clone();
        let workers_scheduler = workers_with_configs.clone();
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
        let scheduler_cancel_token = component.drt().primary_token();
        let ns_clone = component.namespace().clone();

        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
            tracing::trace!("scheduler background task started");

            loop {
                // Check for cancellation at beginning of loop
                if scheduler_cancel_token.is_cancelled() {
                    tracing::trace!("scheduler background task shutting down");
                    break;
206
207
208
                }

                // Wait for a new request
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

Yan Ru Pei's avatar
Yan Ru Pei committed
225
226
227
228
                // Read the current workers configuration
                let workers = workers_scheduler.read().await.clone();

                match selector.select_worker(&workers, &request, block_size) {
229
                    Ok(selection) => {
Yan Ru Pei's avatar
Yan Ru Pei committed
230
                        let event = KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
231
232
                            worker_id: selection.worker.worker_id,
                            dp_rank: selection.worker.dp_rank,
233
234
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
235
236
237
                        };
                        if let Err(e) = ns_clone.publish(KV_HIT_RATE_SUBJECT, &event).await {
                            tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
238
                        }
239
240

                        let response = SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
241
                            best_worker: selection.worker,
242
243
244
245
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

246
247
248
                        // Skip state update if not requested
                        if !request.update_states {
                            continue;
249
                        }
250

Yan Ru Pei's avatar
Yan Ru Pei committed
251
252
253
254
255
256
257
                        let Some(request_id) = request.maybe_request_id else {
                            tracing::error!(
                                "No request_id provided to add_request to the slot tracker"
                            );
                            continue;
                        };

258
259
260
261
262
263
                        if let Err(e) = slots_clone
                            .add_request(
                                request_id.clone(),
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
264
                                selection.worker,
265
266
267
                            )
                            .await
                        {
268
                            tracing::warn!("Failed to add request {request_id}: {e}");
269
                        }
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
285
286
287
288
                    }
                }
            }

289
            tracing::trace!("background endpoint subscriber shutting down");
290
291
        });

292
293
294
295
296
        Ok(KvScheduler {
            request_tx,
            slots,
            workers_with_configs,
        })
297
298
299
300
    }

    pub async fn schedule(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
301
        maybe_request_id: Option<String>,
302
        isl_tokens: usize,
303
        token_seq: Option<Vec<SequenceHash>>,
304
        overlaps: OverlapScores,
305
        router_config_override: Option<&RouterConfigOverride>,
306
        update_states: bool,
Yan Ru Pei's avatar
Yan Ru Pei committed
307
    ) -> Result<WorkerWithDpRank, KvSchedulerError> {
308
309
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
310
            maybe_request_id,
311
            token_seq,
312
            isl_tokens,
313
            overlaps,
314
315
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
316
            router_config_override: router_config_override.cloned(),
317
            update_states,
318
            resp_tx: Some(resp_tx), // Wrap in Some()
319
        };
320

321
322
323
324
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
325
        let response = resp_rx
326
327
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
328

Yan Ru Pei's avatar
Yan Ru Pei committed
329
        Ok(response.best_worker)
330
331
    }

332
333
334
    pub async fn add_request(
        &self,
        request_id: String,
335
        token_sequence: Option<Vec<SequenceHash>>,
336
337
        isl: usize,
        overlap: u32,
Yan Ru Pei's avatar
Yan Ru Pei committed
338
        worker: WorkerWithDpRank,
339
340
    ) -> Result<(), SequenceError> {
        self.slots
Yan Ru Pei's avatar
Yan Ru Pei committed
341
            .add_request(request_id, token_sequence, isl, overlap, worker)
342
            .await
343
344
    }

345
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<(), SequenceError> {
346
        self.slots
347
            .mark_prefill_completed(&request_id.to_string())
348
            .await
349
350
    }

351
    pub async fn free(&self, request_id: &str) -> Result<(), SequenceError> {
352
        self.slots.free(&request_id.to_string()).await
353
    }
354

355
356
357
358
359
360
361
362
363
364
365
    pub async fn get_disaggregated_endpoint(
        &self,
        worker_id: WorkerId,
    ) -> Option<DisaggregatedEndpoint> {
        let workers = self.workers_with_configs.read().await;
        workers
            .get(&worker_id)
            .and_then(|config| config.as_ref())
            .and_then(|config| config.disaggregated_endpoint.clone())
    }

366
367
    pub async fn get_potential_loads(
        &self,
368
        token_seq: Option<Vec<SequenceHash>>,
369
370
371
372
373
374
375
376
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

Yan Ru Pei's avatar
Yan Ru Pei committed
377
378
379
380
        // Get all unique WorkerWithDpRank from both hashmaps
        let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
        workers.extend(decode_blocks.keys().copied());
        workers.extend(prefill_tokens.keys().copied());
381
382
383

        // Create PotentialLoad for each worker
        let mut loads = Vec::new();
Yan Ru Pei's avatar
Yan Ru Pei committed
384
        for worker in workers {
385
            loads.push(PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
386
387
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
388
                potential_prefill_tokens: prefill_tokens
Yan Ru Pei's avatar
Yan Ru Pei committed
389
                    .get(&worker)
390
391
                    .copied()
                    .unwrap_or(isl_tokens),
Yan Ru Pei's avatar
Yan Ru Pei committed
392
                potential_decode_blocks: decode_blocks.get(&worker).copied().unwrap_or(0),
393
394
395
396
397
            });
        }

        loads
    }
398
399
}

400
// Helper function for softmax sampling
401
402
403
404
405
// Returns a vec of workers: multiple if tied, single if sampled
fn softmax_sample(
    logits: &HashMap<WorkerWithDpRank, f64>,
    temperature: f64,
) -> Vec<WorkerWithDpRank> {
406
407
408
409
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

410
    // Guard: if temperature is 0, return all keys with the smallest logit value (ties)
411
412
413
414
415
416
417
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
418
            .filter(|&(_, &v)| v == min_logit)
419
420
421
            .map(|(k, _)| *k)
            .collect();

422
        return min_keys;
423
424
    }

425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
441
442
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
465
            return vec![keys[i]];
466
467
468
469
        }
    }

    // Fallback to last key (shouldn't normally reach here)
470
    vec![keys[keys.len() - 1]]
471
472
}

473
// Default implementation matching the Python _cost_function
474
475
476
477
478
479
480
481
482
483
484
485
/// Default [`WorkerSelector`]: scores each worker / dp-rank with a cost combining its
/// potential prefill blocks and decode blocks, then picks one via `softmax_sample`.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    /// Router configuration supplying the default overlap score weight and router
    /// temperature; both can be overridden per request.
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        Self {
            kv_router_config: kv_router_config.unwrap_or_default(),
        }
    }
}
486
487
488
489

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
490
        workers: &HashMap<WorkerId, Option<ModelRuntimeConfig>>,
491
        request: &SchedulingRequest,
492
        block_size: u32,
493
494
495
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

496
        if workers.is_empty() {
497
498
499
            return Err(KvSchedulerError::NoEndpoints);
        }

500
501
502
503
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

504
505
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
506

507
        let mut worker_logits = HashMap::new();
508
        let mut max_logit = f64::NEG_INFINITY;
509

Yan Ru Pei's avatar
Yan Ru Pei committed
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
        // Calculate logits for each worker with dp_rank
        // Outer loop: iterate over all workers from runtime config
        // Inner loop: iterate over all dp_ranks for each worker
        for (worker_id, config) in workers.iter() {
            // Get data_parallel_size from runtime config
            // data_parallel_size defaults to 1 in ModelRuntimeConfig
            let data_parallel_size = config.as_ref().map(|c| c.data_parallel_size).unwrap_or(1); // Fallback if config is None

            // Iterate over all dp_ranks for this worker
            for dp_rank in 0..data_parallel_size {
                let worker = WorkerWithDpRank::new(*worker_id, dp_rank);

                // Get overlap for this worker (defaults to 0 if not in overlaps)
                let overlap = *overlaps.get(&worker).unwrap_or(&0);

                // this is the number of prefill tokens the worker would have if the request were scheduled there
                let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
                let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

                // this is the number of decode blocks the worker would have if the request were scheduled there
                let decode_block = *decode_blocks
                    .get(&worker)
                    .unwrap_or(&(potential_prefill_block.floor() as usize))
                    as f64;

                // Use override if provided, otherwise use default config
                let overlap_weight = request
                    .router_config_override
                    .as_ref()
                    .and_then(|cfg| cfg.overlap_score_weight)
                    .unwrap_or(self.kv_router_config.overlap_score_weight);

                // Calculate logit (lower is better)
                let logit = overlap_weight * potential_prefill_block + decode_block;
                max_logit = max_logit.max(logit);

                worker_logits.insert(worker, logit);

                tracing::info!(
                    "Formula for worker_id={} dp_rank={:?} with {overlap} cached blocks: {logit:.3} \
                     = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                     = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}",
                    worker.worker_id,
                    worker.dp_rank
                );
            }
556
557
        }

558
        // Use softmax sampling to select worker(s)
559
560
561
562
563
564
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
        let candidates = softmax_sample(&worker_logits, temperature);

        // If multiple candidates (tied), use tree size as tie-breaker
        // If tree sizes are also equal, min_by_key uses HashMap iteration order (pseudo-random)
        let best_worker = if candidates.len() > 1 {
            tracing::info!("Multiple workers tied with same logit, using tree size as tie-breaker");
            *candidates
                .iter()
                .min_by_key(|worker| {
                    request
                        .overlaps
                        .tree_sizes
                        .get(worker)
                        .copied()
                        .unwrap_or(0)
                })
                .expect("candidates should not be empty")
        } else {
            candidates[0]
        };

Yan Ru Pei's avatar
Yan Ru Pei committed
586
587
588
        let best_logit = worker_logits[&best_worker];

        let best_overlap = *overlaps.get(&best_worker).unwrap_or(&0);
589

Yan Ru Pei's avatar
Yan Ru Pei committed
590
        // this is a runtime config set on a per worker basis, not per dp-rank
591
        let total_blocks_info = workers
Yan Ru Pei's avatar
Yan Ru Pei committed
592
            .get(&best_worker.worker_id)
593
594
595
596
597
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

598
599
600
601
602
603
604
        let tree_size = request
            .overlaps
            .tree_sizes
            .get(&best_worker)
            .copied()
            .unwrap_or(0);

605
        tracing::info!(
606
            "Selected worker: worker_id={} dp_rank={:?}, logit: {:.3}, cached blocks: {}, tree size: {}{}",
Yan Ru Pei's avatar
Yan Ru Pei committed
607
608
            best_worker.worker_id,
            best_worker.dp_rank,
609
610
            best_logit,
            best_overlap,
611
            tree_size,
612
            total_blocks_info
613
        );
614
615

        Ok(WorkerSelectionResult {
Yan Ru Pei's avatar
Yan Ru Pei committed
616
            worker: best_worker,
617
            required_blocks: request_blocks as u64,
Yan Ru Pei's avatar
Yan Ru Pei committed
618
            overlap_blocks: overlaps.get(&best_worker).copied().unwrap_or(0),
619
        })
620
621
    }
}
622
623
624
625
626

#[cfg(test)]
mod tests {
    use super::*;

    /// With a single entry, sampling must return that entry regardless of the
    /// temperature or the logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let mut logits = HashMap::new();
        let worker = WorkerWithDpRank::from_worker_id(42);
        logits.insert(worker, 0.5); // The value doesn't matter

        // Test with different temperatures
        for temperature in &[0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, *temperature);
            assert_eq!(result.len(), 1, "Should return exactly one worker");
            assert_eq!(result[0], worker, "Should return the only available worker");
        }

        // Test with different logit values
        logits.clear();
        logits.insert(worker, -100.0); // Very negative value
        let result = softmax_sample(&logits, 1.0);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], worker);

        logits.clear();
        logits.insert(worker, 100.0); // Very positive value
        let result = softmax_sample(&logits, 1.0);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], worker);

        logits.clear();
        logits.insert(worker, 0.0); // Zero value
        let result = softmax_sample(&logits, 1.0);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], worker);
    }

    /// With temperature 0, sampling is greedy: it returns every entry sharing the
    /// smallest logit (one entry without a tie, several with a tie).
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let mut logits = HashMap::new();
        let worker1 = WorkerWithDpRank::from_worker_id(1);
        let worker2 = WorkerWithDpRank::from_worker_id(2);
        let worker3 = WorkerWithDpRank::from_worker_id(3);
        let worker4 = WorkerWithDpRank::from_worker_id(4);
        logits.insert(worker1, 5.0);
        logits.insert(worker2, 3.0); // This has the smallest logit
        logits.insert(worker3, 7.0);
        logits.insert(worker4, 3.5);

        // With temperature 0, should always return only worker2 (smallest logit)
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            1,
            "Should return one worker when there's no tie"
        );
        assert_eq!(
            result[0], worker2,
            "Should return worker with smallest logit when temperature is 0"
        );

        // Test with tied minimum logits
        logits.clear();
        let worker5 = WorkerWithDpRank::from_worker_id(5);
        let worker6 = WorkerWithDpRank::from_worker_id(6);
        logits.insert(worker1, 5.0);
        logits.insert(worker2, 3.0); // Tied for smallest
        logits.insert(worker5, 3.0); // Tied for smallest
        logits.insert(worker6, 7.0);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            2,
            "Should return all workers with smallest logit when tied"
        );
        assert!(
            result.contains(&worker2) && result.contains(&worker5),
            "Should contain both tied workers"
        );

        // Test with negative values
        logits.clear();
        let worker10 = WorkerWithDpRank::from_worker_id(10);
        let worker20 = WorkerWithDpRank::from_worker_id(20);
        let worker30 = WorkerWithDpRank::from_worker_id(30);
        logits.insert(worker10, -1.0);
        logits.insert(worker20, -5.0); // This has the smallest logit
        logits.insert(worker30, 0.0);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0], worker20,
            "Should handle negative logits correctly"
        );
    }
}