scheduler.rs 25.6 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
// SPDX-License-Identifier: Apache-2.0
3

4
use crate::discovery::RuntimeConfigsWithNotify;
5
use crate::local_model::runtime_config::ModelRuntimeConfig;
6
use anyhow::Result;
7
use dynamo_runtime::component::Component;
Yan Ru Pei's avatar
Yan Ru Pei committed
8
use dynamo_runtime::traits::DistributedRuntimeProvider;
9
use dynamo_runtime::transports::event_plane::EventPublisher;
10
use rand::Rng;
11
use serde::{Deserialize, Serialize};
12
use std::collections::{HashMap, HashSet};
13
14
use std::sync::Arc;
use std::time::Duration;
15

16
17
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
18
use super::RouterConfigOverride;
19
use super::WorkerSelector;
20
use super::indexer::OverlapScores;
Yan Ru Pei's avatar
Yan Ru Pei committed
21
use super::protocols::{DpRank, WorkerId, WorkerSelectionResult, WorkerWithDpRank};
22
use super::sequence::{ActiveSequencesMultiWorker, SequenceError};
23

24
use dynamo_tokens::SequenceHash;
25

26
27
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
28
29
30
    pub worker_id: WorkerId,
    #[serde(default)]
    pub dp_rank: DpRank,
31
    pub isl_blocks: usize,
32
    pub overlap_blocks: u32,
33
34
}

35
36
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
37
38
    pub worker_id: WorkerId,
    pub dp_rank: DpRank,
39
40
41
42
    pub potential_prefill_tokens: usize,
    pub potential_decode_blocks: usize,
}

43
44
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
45
    #[error("no endpoints available to route work")]
46
47
48
49
50
51
52
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
53
54
55

    #[error("failed to initialize event publisher: {0}")]
    InitFailed(String),
56
57
}

58
59
#[derive(Debug)]
pub struct SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
60
    pub best_worker: WorkerWithDpRank,
61
    pub overlap_blocks: u32,
62
63
}

64
pub struct SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
65
    pub maybe_request_id: Option<String>,
66
    pub token_seq: Option<Vec<SequenceHash>>,
67
    pub isl_tokens: usize,
68
    pub overlaps: OverlapScores,
Yan Ru Pei's avatar
Yan Ru Pei committed
69
70
    pub decode_blocks: HashMap<WorkerWithDpRank, usize>,
    pub prefill_tokens: HashMap<WorkerWithDpRank, usize>,
71
72
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
73
74
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
75
76
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
77
78
79
}

impl SchedulingRequest {
80
81
82
83
84
85
86
87
88
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
89
90
91
92
93
94
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
95
    slots: Arc<ActiveSequencesMultiWorker>,
96
97
98
99
}

impl KvScheduler {
    pub async fn start(
100
        component: Component,
101
        block_size: u32,
102
        workers_with_configs: Arc<RuntimeConfigsWithNotify>,
103
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
104
        replica_sync: bool,
105
        router_id: u64,
106
    ) -> Result<Self, KvSchedulerError> {
107
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
108

109
110
        // Get initial workers from DashMap for slot initialization.
        // ModelManager guarantees at least one worker is present before KvRouter::new() is called.
111
        let initial_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_with_configs
112
            .configs
113
114
115
            .iter()
            .map(|r| (*r.key(), r.value().clone()))
            .collect();
116

117
118
119
120
121
122
123
124
125
126
127
        let slots = Arc::new(
            ActiveSequencesMultiWorker::new(
                component.clone(),
                block_size as usize,
                initial_workers,
                replica_sync,
                router_id,
            )
            .await
            .map_err(|e| KvSchedulerError::InitFailed(e.to_string()))?,
        );
128

129
130
        // Spawn background task to sync slots with DashMap when notified of changes.
        // ModelManager's watcher updates the DashMap and notifies; we wait on notify here.
Yan Ru Pei's avatar
Yan Ru Pei committed
131
        let slots_monitor = slots.clone();
132
        let workers_monitor = workers_with_configs.clone();
133
        let monitor_cancel_token = component.drt().child_token();
134
        tokio::spawn(async move {
135
136
137
            tracing::trace!("KvScheduler workers monitoring task started");
            let mut last_workers: HashSet<WorkerId> = HashSet::new();

138
            loop {
139
                // Wait for notification or cancellation
Yan Ru Pei's avatar
Yan Ru Pei committed
140
141
                tokio::select! {
                    _ = monitor_cancel_token.cancelled() => {
142
                        tracing::trace!("KvScheduler workers monitoring task shutting down");
143
144
                        break;
                    }
145
                    _ = workers_monitor.notify.notified() => {}
146
147
                }

148
149
150
                // Get current workers from DashMap
                let current_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> =
                    workers_monitor
151
                        .configs
152
153
154
155
156
157
158
159
160
161
                        .iter()
                        .map(|r| (*r.key(), r.value().clone()))
                        .collect();
                let current_worker_ids: HashSet<WorkerId> =
                    current_workers.keys().copied().collect();

                // Only update slots if workers have changed
                if current_worker_ids != last_workers {
                    slots_monitor.update_workers(current_workers);
                    last_workers = current_worker_ids;
Yan Ru Pei's avatar
Yan Ru Pei committed
162
163
164
165
166
167
168
169
                }
            }
        });

        let slots_clone = slots.clone();
        let workers_scheduler = workers_with_configs.clone();
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
        let scheduler_cancel_token = component.drt().primary_token();
170
171
172
173
        let hit_rate_publisher =
            EventPublisher::for_namespace(component.namespace(), KV_HIT_RATE_SUBJECT)
                .await
                .map_err(|e| KvSchedulerError::InitFailed(e.to_string()))?;
Yan Ru Pei's avatar
Yan Ru Pei committed
174
175
176
177
178
179
180
181
182
183
184

        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
            tracing::trace!("scheduler background task started");

            loop {
                // Check for cancellation at beginning of loop
                if scheduler_cancel_token.is_cancelled() {
                    tracing::trace!("scheduler background task shutting down");
                    break;
185
186
187
                }

                // Wait for a new request
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

204
205
                // Read the current workers configuration from DashMap
                let workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_scheduler
206
                    .configs
207
208
209
                    .iter()
                    .map(|r| (*r.key(), r.value().clone()))
                    .collect();
Yan Ru Pei's avatar
Yan Ru Pei committed
210
211

                match selector.select_worker(&workers, &request, block_size) {
212
                    Ok(selection) => {
Yan Ru Pei's avatar
Yan Ru Pei committed
213
                        let event = KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
214
215
                            worker_id: selection.worker.worker_id,
                            dp_rank: selection.worker.dp_rank,
216
217
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
218
                        };
219
                        if let Err(e) = hit_rate_publisher.publish(&event).await {
Yan Ru Pei's avatar
Yan Ru Pei committed
220
                            tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
221
                        }
222
223

                        let response = SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
224
                            best_worker: selection.worker,
225
226
227
228
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

229
230
231
                        // Skip state update if not requested
                        if !request.update_states {
                            continue;
232
                        }
233

Yan Ru Pei's avatar
Yan Ru Pei committed
234
235
236
237
238
239
240
                        let Some(request_id) = request.maybe_request_id else {
                            tracing::error!(
                                "No request_id provided to add_request to the slot tracker"
                            );
                            continue;
                        };

241
242
243
244
245
246
                        if let Err(e) = slots_clone
                            .add_request(
                                request_id.clone(),
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
247
                                None, // expected_output_tokens not available in scheduler loop
Yan Ru Pei's avatar
Yan Ru Pei committed
248
                                selection.worker,
249
250
251
                            )
                            .await
                        {
252
                            tracing::warn!("Failed to add request {request_id}: {e}");
253
                        }
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
269
270
271
272
                    }
                }
            }

273
            tracing::trace!("background endpoint subscriber shutting down");
274
275
        });

276
        Ok(KvScheduler { request_tx, slots })
277
278
279
280
    }

    pub async fn schedule(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
281
        maybe_request_id: Option<String>,
282
        isl_tokens: usize,
283
        token_seq: Option<Vec<SequenceHash>>,
284
        overlaps: OverlapScores,
285
        router_config_override: Option<&RouterConfigOverride>,
286
        update_states: bool,
Yan Ru Pei's avatar
Yan Ru Pei committed
287
    ) -> Result<WorkerWithDpRank, KvSchedulerError> {
288
289
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
290
            maybe_request_id,
291
            token_seq,
292
            isl_tokens,
293
            overlaps,
294
295
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
296
            router_config_override: router_config_override.cloned(),
297
            update_states,
298
            resp_tx: Some(resp_tx), // Wrap in Some()
299
        };
300

301
302
303
304
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
305
        let response = resp_rx
306
307
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
308

Yan Ru Pei's avatar
Yan Ru Pei committed
309
        Ok(response.best_worker)
310
311
    }

312
313
314
    pub async fn add_request(
        &self,
        request_id: String,
315
        token_sequence: Option<Vec<SequenceHash>>,
316
317
        isl: usize,
        overlap: u32,
318
        expected_output_tokens: Option<u32>,
Yan Ru Pei's avatar
Yan Ru Pei committed
319
        worker: WorkerWithDpRank,
320
321
    ) -> Result<(), SequenceError> {
        self.slots
322
323
324
325
326
327
328
329
            .add_request(
                request_id,
                token_sequence,
                isl,
                overlap,
                expected_output_tokens,
                worker,
            )
330
            .await
331
332
    }

333
    /// Forwards a prefill-completed notification for `request_id` to the
    /// active-sequence tracker and returns its result.
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<(), SequenceError> {
        // The tracker is keyed by an owned `String`.
        let request_key = String::from(request_id);
        self.slots.mark_prefill_completed(&request_key).await
    }

339
    /// Forwards a `free` call for `request_id` to the active-sequence tracker and
    /// returns its result.
    pub async fn free(&self, request_id: &str) -> Result<(), SequenceError> {
        // The tracker is keyed by an owned `String`.
        let request_key = String::from(request_id);
        self.slots.free(&request_key).await
    }
342

343
344
345
346
347
348
349
350
351
352
    /// Forwards an `add_output_block` call for `request_id` to the active-sequence tracker.
    ///
    /// `decay_fraction` is passed through unchanged.
    pub async fn add_output_block(
        &self,
        request_id: &str,
        decay_fraction: Option<f64>,
    ) -> Result<(), SequenceError> {
        // The tracker is keyed by an owned `String`.
        let request_key = String::from(request_id);
        self.slots
            .add_output_block(&request_key, decay_fraction)
            .await
    }

353
354
    pub async fn get_potential_loads(
        &self,
355
        token_seq: Option<Vec<SequenceHash>>,
356
357
358
359
360
361
362
363
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

Yan Ru Pei's avatar
Yan Ru Pei committed
364
365
366
367
        // Get all unique WorkerWithDpRank from both hashmaps
        let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
        workers.extend(decode_blocks.keys().copied());
        workers.extend(prefill_tokens.keys().copied());
368
369
370

        // Create PotentialLoad for each worker
        let mut loads = Vec::new();
Yan Ru Pei's avatar
Yan Ru Pei committed
371
        for worker in workers {
372
            loads.push(PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
373
374
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
375
                potential_prefill_tokens: prefill_tokens
Yan Ru Pei's avatar
Yan Ru Pei committed
376
                    .get(&worker)
377
378
                    .copied()
                    .unwrap_or(isl_tokens),
Yan Ru Pei's avatar
Yan Ru Pei committed
379
                potential_decode_blocks: decode_blocks.get(&worker).copied().unwrap_or(0),
380
381
382
383
384
            });
        }

        loads
    }
385
386
}

387
// Helper function for softmax sampling
388
389
390
391
392
// Returns a vec of workers: multiple if tied, single if sampled
fn softmax_sample(
    logits: &HashMap<WorkerWithDpRank, f64>,
    temperature: f64,
) -> Vec<WorkerWithDpRank> {
393
394
395
396
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

397
    // Guard: if temperature is 0, return all keys with the smallest logit value (ties)
398
399
400
401
402
403
404
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
405
            .filter(|&(_, &v)| v == min_logit)
406
407
408
            .map(|(k, _)| *k)
            .collect();

409
        return min_keys;
410
411
    }

412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
428
429
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
452
            return vec![keys[i]];
453
454
455
456
        }
    }

    // Fallback to last key (shouldn't normally reach here)
457
    vec![keys[keys.len() - 1]]
458
459
}

460
// Default implementation matching the Python _cost_function
461
462
463
464
465
466
467
468
469
470
471
472
/// Built-in [`WorkerSelector`], used by `KvScheduler::start` when no selector is supplied.
///
/// Per-request overrides, when present, take precedence over the values stored here.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    /// Router configuration supplying the default overlap score weight and router
    /// temperature.
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    /// Creates a selector from `kv_router_config`, falling back to
    /// `KvRouterConfig::default()` when `None` is given.
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        let kv_router_config = kv_router_config.unwrap_or_default();
        Self { kv_router_config }
    }
}
473
474
475
476

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
477
        workers: &HashMap<WorkerId, Option<ModelRuntimeConfig>>,
478
        request: &SchedulingRequest,
479
        block_size: u32,
480
481
482
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

483
        if workers.is_empty() {
484
485
486
            return Err(KvSchedulerError::NoEndpoints);
        }

487
488
489
490
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

491
492
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
493

494
        let mut worker_logits = HashMap::new();
495

496
497
498
499
500
501
502
        // Use override if provided, otherwise use default config
        let overlap_weight = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.overlap_score_weight)
            .unwrap_or(self.kv_router_config.overlap_score_weight);

Yan Ru Pei's avatar
Yan Ru Pei committed
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
        // Calculate logits for each worker with dp_rank
        // Outer loop: iterate over all workers from runtime config
        // Inner loop: iterate over all dp_ranks for each worker
        for (worker_id, config) in workers.iter() {
            // Get data_parallel_size from runtime config
            // data_parallel_size defaults to 1 in ModelRuntimeConfig
            let data_parallel_size = config.as_ref().map(|c| c.data_parallel_size).unwrap_or(1); // Fallback if config is None

            // Iterate over all dp_ranks for this worker
            for dp_rank in 0..data_parallel_size {
                let worker = WorkerWithDpRank::new(*worker_id, dp_rank);

                // Get overlap for this worker (defaults to 0 if not in overlaps)
                let overlap = *overlaps.get(&worker).unwrap_or(&0);

                // this is the number of prefill tokens the worker would have if the request were scheduled there
                let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
                let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

                // this is the number of decode blocks the worker would have if the request were scheduled there
                let decode_block = *decode_blocks
                    .get(&worker)
                    .unwrap_or(&(potential_prefill_block.floor() as usize))
                    as f64;

                // Calculate logit (lower is better)
                let logit = overlap_weight * potential_prefill_block + decode_block;

                worker_logits.insert(worker, logit);

                tracing::info!(
                    "Formula for worker_id={} dp_rank={:?} with {overlap} cached blocks: {logit:.3} \
                     = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                     = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}",
                    worker.worker_id,
                    worker.dp_rank
                );
            }
541
542
        }

543
        // Use softmax sampling to select worker(s)
544
545
546
547
548
549
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
        let candidates = softmax_sample(&worker_logits, temperature);

        // If multiple candidates (tied), use tree size as tie-breaker
        // If tree sizes are also equal, min_by_key uses HashMap iteration order (pseudo-random)
        let best_worker = if candidates.len() > 1 {
            tracing::info!("Multiple workers tied with same logit, using tree size as tie-breaker");
            *candidates
                .iter()
                .min_by_key(|worker| {
                    request
                        .overlaps
                        .tree_sizes
                        .get(worker)
                        .copied()
                        .unwrap_or(0)
                })
                .expect("candidates should not be empty")
        } else {
            candidates[0]
        };

Yan Ru Pei's avatar
Yan Ru Pei committed
571
572
573
        let best_logit = worker_logits[&best_worker];

        let best_overlap = *overlaps.get(&best_worker).unwrap_or(&0);
574

Yan Ru Pei's avatar
Yan Ru Pei committed
575
        // this is a runtime config set on a per worker basis, not per dp-rank
576
        let total_blocks_info = workers
Yan Ru Pei's avatar
Yan Ru Pei committed
577
            .get(&best_worker.worker_id)
578
579
580
581
582
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

583
584
585
586
587
588
589
        let tree_size = request
            .overlaps
            .tree_sizes
            .get(&best_worker)
            .copied()
            .unwrap_or(0);

590
        tracing::info!(
591
            "Selected worker: worker_id={} dp_rank={:?}, logit: {:.3}, cached blocks: {}, tree size: {}{}",
Yan Ru Pei's avatar
Yan Ru Pei committed
592
593
            best_worker.worker_id,
            best_worker.dp_rank,
594
595
            best_logit,
            best_overlap,
596
            tree_size,
597
            total_blocks_info
598
        );
599
600

        Ok(WorkerSelectionResult {
Yan Ru Pei's avatar
Yan Ru Pei committed
601
            worker: best_worker,
602
            required_blocks: request_blocks as u64,
Yan Ru Pei's avatar
Yan Ru Pei committed
603
            overlap_blocks: overlaps.get(&best_worker).copied().unwrap_or(0),
604
        })
605
606
    }
}
607
608
609
610
611

#[cfg(test)]
mod tests {
    use super::*;

    /// A map with a single entry must always yield that entry, whatever the temperature
    /// or the logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let worker = WorkerWithDpRank::from_worker_id(42);

        // Different temperatures; the logit value itself doesn't matter.
        let lone = HashMap::from([(worker, 0.5)]);
        for temperature in [0.1, 1.0, 10.0] {
            let result = softmax_sample(&lone, temperature);
            assert_eq!(result.len(), 1, "Should return exactly one worker");
            assert_eq!(result[0], worker, "Should return the only available worker");
        }

        // Different logit values: very negative, very positive and zero.
        for logit in [-100.0, 100.0, 0.0] {
            let lone = HashMap::from([(worker, logit)]);
            let result = softmax_sample(&lone, 1.0);
            assert_eq!(result.len(), 1);
            assert_eq!(result[0], worker);
        }
    }

    /// With temperature 0, `softmax_sample` returns all keys with the smallest logit.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let worker = WorkerWithDpRank::from_worker_id;

        // No tie: worker 2 has the smallest logit and must be the only result.
        let logits = HashMap::from([
            (worker(1), 5.0),
            (worker(2), 3.0),
            (worker(3), 7.0),
            (worker(4), 3.5),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            1,
            "Should return one worker when there's no tie"
        );
        assert_eq!(
            result[0],
            worker(2),
            "Should return worker with smallest logit when temperature is 0"
        );

        // Tied minimum logits: workers 2 and 5 share the smallest value.
        let logits = HashMap::from([
            (worker(1), 5.0),
            (worker(2), 3.0),
            (worker(5), 3.0),
            (worker(6), 7.0),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            2,
            "Should return all workers with smallest logit when tied"
        );
        assert!(
            result.contains(&worker(2)) && result.contains(&worker(5)),
            "Should contain both tied workers"
        );

        // Negative values: worker 20 has the smallest logit.
        let logits = HashMap::from([
            (worker(10), -1.0),
            (worker(20), -5.0),
            (worker(30), 0.0),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0],
            worker(20),
            "Should handle negative logits correctly"
        );
    }
}