scheduler.rs 25.1 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
// SPDX-License-Identifier: Apache-2.0
3

4
use crate::local_model::runtime_config::ModelRuntimeConfig;
5
use anyhow::Result;
6
use dashmap::DashMap;
7
use dynamo_runtime::component::Component;
Yan Ru Pei's avatar
Yan Ru Pei committed
8
use dynamo_runtime::traits::DistributedRuntimeProvider;
Neelay Shah's avatar
Neelay Shah committed
9
use dynamo_runtime::traits::events::EventPublisher;
10
use rand::Rng;
11
use serde::{Deserialize, Serialize};
12
use std::collections::{HashMap, HashSet};
13
14
use std::sync::Arc;
use std::time::Duration;
15
use tokio::sync::watch;
16

17
18
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
19
use super::RouterConfigOverride;
20
use super::WorkerSelector;
21
use super::indexer::OverlapScores;
Yan Ru Pei's avatar
Yan Ru Pei committed
22
use super::protocols::{DpRank, WorkerId, WorkerSelectionResult, WorkerWithDpRank};
23
use super::sequence::{ActiveSequencesMultiWorker, SequenceError};
24

25
use crate::tokens::SequenceHash;
26

27
28
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
29
30
31
    pub worker_id: WorkerId,
    #[serde(default)]
    pub dp_rank: DpRank,
32
    pub isl_blocks: usize,
33
    pub overlap_blocks: u32,
34
35
}

36
37
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
38
39
    pub worker_id: WorkerId,
    pub dp_rank: DpRank,
40
41
42
43
    pub potential_prefill_tokens: usize,
    pub potential_decode_blocks: usize,
}

44
45
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
46
    #[error("no endpoints available to route work")]
47
48
49
50
51
52
53
54
55
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
}

56
57
#[derive(Debug)]
pub struct SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
58
    pub best_worker: WorkerWithDpRank,
59
    pub overlap_blocks: u32,
60
61
}

62
pub struct SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
63
    pub maybe_request_id: Option<String>,
64
    pub token_seq: Option<Vec<SequenceHash>>,
65
    pub isl_tokens: usize,
66
    pub overlaps: OverlapScores,
Yan Ru Pei's avatar
Yan Ru Pei committed
67
68
    pub decode_blocks: HashMap<WorkerWithDpRank, usize>,
    pub prefill_tokens: HashMap<WorkerWithDpRank, usize>,
69
70
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
71
72
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
73
74
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
75
76
77
}

impl SchedulingRequest {
78
79
80
81
82
83
84
85
86
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
87
88
89
90
91
92
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
93
    slots: Arc<ActiveSequencesMultiWorker>,
94
95
96
97
}

impl KvScheduler {
    pub async fn start(
98
        component: Component,
99
        block_size: u32,
100
        instance_ids_rx: watch::Receiver<Vec<u64>>,
101
        workers_with_configs: Arc<DashMap<WorkerId, Option<ModelRuntimeConfig>>>,
102
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
103
        replica_sync: bool,
104
        router_uuid: String,
105
    ) -> Result<Self, KvSchedulerError> {
106
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
107
108
109
110
111
112

        // Get initial workers from DashMap for slot initialization
        let initial_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_with_configs
            .iter()
            .map(|r| (*r.key(), r.value().clone()))
            .collect();
113

114
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
Yan Ru Pei's avatar
Yan Ru Pei committed
115
            component.clone(),
116
            block_size as usize,
117
            initial_workers,
118
            replica_sync,
119
            router_uuid,
120
        ));
121

122
        // Spawn background task to monitor workers_with_configs changes and update slots
Yan Ru Pei's avatar
Yan Ru Pei committed
123
        let slots_monitor = slots.clone();
124
        let workers_monitor = workers_with_configs.clone();
125
        let mut instance_ids_monitor_rx = instance_ids_rx.clone();
126
        let monitor_cancel_token = component.drt().child_token();
127
        tokio::spawn(async move {
128
129
130
            tracing::trace!("KvScheduler workers monitoring task started");
            let mut last_workers: HashSet<WorkerId> = HashSet::new();

131
            loop {
132
                // Wait for instance changes (ModelManager handles config updates to the DashMap)
Yan Ru Pei's avatar
Yan Ru Pei committed
133
134
                tokio::select! {
                    _ = monitor_cancel_token.cancelled() => {
135
                        tracing::trace!("KvScheduler workers monitoring task shutting down");
136
137
                        break;
                    }
138
                    result = instance_ids_monitor_rx.changed() => {
Yan Ru Pei's avatar
Yan Ru Pei committed
139
                        if result.is_err() {
140
                            tracing::warn!("instance IDs watch sender shutdown in KvScheduler monitor");
Yan Ru Pei's avatar
Yan Ru Pei committed
141
142
                            break;
                        }
143
144
145
                    }
                }

146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
                // Get current workers from DashMap
                let current_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> =
                    workers_monitor
                        .iter()
                        .map(|r| (*r.key(), r.value().clone()))
                        .collect();
                let current_worker_ids: HashSet<WorkerId> =
                    current_workers.keys().copied().collect();

                // Only update slots if workers have changed
                if current_worker_ids != last_workers {
                    slots_monitor.update_workers(current_workers);
                    last_workers = current_worker_ids;
                    tracing::trace!(
                        "KvScheduler: Updated slots with {} workers",
                        last_workers.len()
                    );
Yan Ru Pei's avatar
Yan Ru Pei committed
163
164
                }
            }
165
            tracing::trace!("KvScheduler workers monitoring task shutting down");
Yan Ru Pei's avatar
Yan Ru Pei committed
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
        });

        let slots_clone = slots.clone();
        let workers_scheduler = workers_with_configs.clone();
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
        let scheduler_cancel_token = component.drt().primary_token();
        let ns_clone = component.namespace().clone();

        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
            tracing::trace!("scheduler background task started");

            loop {
                // Check for cancellation at beginning of loop
                if scheduler_cancel_token.is_cancelled() {
                    tracing::trace!("scheduler background task shutting down");
                    break;
184
185
186
                }

                // Wait for a new request
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

203
204
205
206
207
                // Read the current workers configuration from DashMap
                let workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_scheduler
                    .iter()
                    .map(|r| (*r.key(), r.value().clone()))
                    .collect();
Yan Ru Pei's avatar
Yan Ru Pei committed
208
209

                match selector.select_worker(&workers, &request, block_size) {
210
                    Ok(selection) => {
Yan Ru Pei's avatar
Yan Ru Pei committed
211
                        let event = KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
212
213
                            worker_id: selection.worker.worker_id,
                            dp_rank: selection.worker.dp_rank,
214
215
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
216
217
218
                        };
                        if let Err(e) = ns_clone.publish(KV_HIT_RATE_SUBJECT, &event).await {
                            tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
219
                        }
220
221

                        let response = SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
222
                            best_worker: selection.worker,
223
224
225
226
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

227
228
229
                        // Skip state update if not requested
                        if !request.update_states {
                            continue;
230
                        }
231

Yan Ru Pei's avatar
Yan Ru Pei committed
232
233
234
235
236
237
238
                        let Some(request_id) = request.maybe_request_id else {
                            tracing::error!(
                                "No request_id provided to add_request to the slot tracker"
                            );
                            continue;
                        };

239
240
241
242
243
244
                        if let Err(e) = slots_clone
                            .add_request(
                                request_id.clone(),
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
245
                                selection.worker,
246
247
248
                            )
                            .await
                        {
249
                            tracing::warn!("Failed to add request {request_id}: {e}");
250
                        }
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
266
267
268
269
                    }
                }
            }

270
            tracing::trace!("background endpoint subscriber shutting down");
271
272
        });

273
        Ok(KvScheduler { request_tx, slots })
274
275
276
277
    }

    pub async fn schedule(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
278
        maybe_request_id: Option<String>,
279
        isl_tokens: usize,
280
        token_seq: Option<Vec<SequenceHash>>,
281
        overlaps: OverlapScores,
282
        router_config_override: Option<&RouterConfigOverride>,
283
        update_states: bool,
Yan Ru Pei's avatar
Yan Ru Pei committed
284
    ) -> Result<WorkerWithDpRank, KvSchedulerError> {
285
286
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
287
            maybe_request_id,
288
            token_seq,
289
            isl_tokens,
290
            overlaps,
291
292
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
293
            router_config_override: router_config_override.cloned(),
294
            update_states,
295
            resp_tx: Some(resp_tx), // Wrap in Some()
296
        };
297

298
299
300
301
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
302
        let response = resp_rx
303
304
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
305

Yan Ru Pei's avatar
Yan Ru Pei committed
306
        Ok(response.best_worker)
307
308
    }

309
310
311
    pub async fn add_request(
        &self,
        request_id: String,
312
        token_sequence: Option<Vec<SequenceHash>>,
313
314
        isl: usize,
        overlap: u32,
Yan Ru Pei's avatar
Yan Ru Pei committed
315
        worker: WorkerWithDpRank,
316
317
    ) -> Result<(), SequenceError> {
        self.slots
Yan Ru Pei's avatar
Yan Ru Pei committed
318
            .add_request(request_id, token_sequence, isl, overlap, worker)
319
            .await
320
321
    }

322
    /// Tells the active-sequence tracker that prefill has finished for `request_id`.
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<(), SequenceError> {
        // The tracker is called with a reference to an owned `String`.
        let owned_id = request_id.to_string();
        self.slots.mark_prefill_completed(&owned_id).await
    }

328
    /// Releases the tracker state held for `request_id`.
    pub async fn free(&self, request_id: &str) -> Result<(), SequenceError> {
        // The tracker is called with a reference to an owned `String`.
        let owned_id = request_id.to_string();
        self.slots.free(&owned_id).await
    }
331
332
333

    pub async fn get_potential_loads(
        &self,
334
        token_seq: Option<Vec<SequenceHash>>,
335
336
337
338
339
340
341
342
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

Yan Ru Pei's avatar
Yan Ru Pei committed
343
344
345
346
        // Get all unique WorkerWithDpRank from both hashmaps
        let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
        workers.extend(decode_blocks.keys().copied());
        workers.extend(prefill_tokens.keys().copied());
347
348
349

        // Create PotentialLoad for each worker
        let mut loads = Vec::new();
Yan Ru Pei's avatar
Yan Ru Pei committed
350
        for worker in workers {
351
            loads.push(PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
352
353
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
354
                potential_prefill_tokens: prefill_tokens
Yan Ru Pei's avatar
Yan Ru Pei committed
355
                    .get(&worker)
356
357
                    .copied()
                    .unwrap_or(isl_tokens),
Yan Ru Pei's avatar
Yan Ru Pei committed
358
                potential_decode_blocks: decode_blocks.get(&worker).copied().unwrap_or(0),
359
360
361
362
363
            });
        }

        loads
    }
364
365
}

366
// Helper function for softmax sampling
367
368
369
370
371
// Returns a vec of workers: multiple if tied, single if sampled
fn softmax_sample(
    logits: &HashMap<WorkerWithDpRank, f64>,
    temperature: f64,
) -> Vec<WorkerWithDpRank> {
372
373
374
375
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

376
    // Guard: if temperature is 0, return all keys with the smallest logit value (ties)
377
378
379
380
381
382
383
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
384
            .filter(|&(_, &v)| v == min_logit)
385
386
387
            .map(|(k, _)| *k)
            .collect();

388
        return min_keys;
389
390
    }

391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
407
408
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
431
            return vec![keys[i]];
432
433
434
435
        }
    }

    // Fallback to last key (shouldn't normally reach here)
436
    vec![keys[keys.len() - 1]]
437
438
}

439
// Default implementation matching the Python _cost_function
440
441
442
443
444
445
446
447
448
449
450
451
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        Self {
            kv_router_config: kv_router_config.unwrap_or_default(),
        }
    }
}
452
453
454
455

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
456
        workers: &HashMap<WorkerId, Option<ModelRuntimeConfig>>,
457
        request: &SchedulingRequest,
458
        block_size: u32,
459
460
461
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

462
        if workers.is_empty() {
463
464
465
            return Err(KvSchedulerError::NoEndpoints);
        }

466
467
468
469
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

470
471
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
472

473
        let mut worker_logits = HashMap::new();
474

Yan Ru Pei's avatar
Yan Ru Pei committed
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
        // Calculate logits for each worker with dp_rank
        // Outer loop: iterate over all workers from runtime config
        // Inner loop: iterate over all dp_ranks for each worker
        for (worker_id, config) in workers.iter() {
            // Get data_parallel_size from runtime config
            // data_parallel_size defaults to 1 in ModelRuntimeConfig
            let data_parallel_size = config.as_ref().map(|c| c.data_parallel_size).unwrap_or(1); // Fallback if config is None

            // Iterate over all dp_ranks for this worker
            for dp_rank in 0..data_parallel_size {
                let worker = WorkerWithDpRank::new(*worker_id, dp_rank);

                // Get overlap for this worker (defaults to 0 if not in overlaps)
                let overlap = *overlaps.get(&worker).unwrap_or(&0);

                // this is the number of prefill tokens the worker would have if the request were scheduled there
                let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
                let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

                // this is the number of decode blocks the worker would have if the request were scheduled there
                let decode_block = *decode_blocks
                    .get(&worker)
                    .unwrap_or(&(potential_prefill_block.floor() as usize))
                    as f64;

                // Use override if provided, otherwise use default config
                let overlap_weight = request
                    .router_config_override
                    .as_ref()
                    .and_then(|cfg| cfg.overlap_score_weight)
                    .unwrap_or(self.kv_router_config.overlap_score_weight);

                // Calculate logit (lower is better)
                let logit = overlap_weight * potential_prefill_block + decode_block;

                worker_logits.insert(worker, logit);

                tracing::info!(
                    "Formula for worker_id={} dp_rank={:?} with {overlap} cached blocks: {logit:.3} \
                     = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                     = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}",
                    worker.worker_id,
                    worker.dp_rank
                );
            }
520
521
        }

522
        // Use softmax sampling to select worker(s)
523
524
525
526
527
528
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
        let candidates = softmax_sample(&worker_logits, temperature);

        // If multiple candidates (tied), use tree size as tie-breaker
        // If tree sizes are also equal, min_by_key uses HashMap iteration order (pseudo-random)
        let best_worker = if candidates.len() > 1 {
            tracing::info!("Multiple workers tied with same logit, using tree size as tie-breaker");
            *candidates
                .iter()
                .min_by_key(|worker| {
                    request
                        .overlaps
                        .tree_sizes
                        .get(worker)
                        .copied()
                        .unwrap_or(0)
                })
                .expect("candidates should not be empty")
        } else {
            candidates[0]
        };

Yan Ru Pei's avatar
Yan Ru Pei committed
550
551
552
        let best_logit = worker_logits[&best_worker];

        let best_overlap = *overlaps.get(&best_worker).unwrap_or(&0);
553

Yan Ru Pei's avatar
Yan Ru Pei committed
554
        // this is a runtime config set on a per worker basis, not per dp-rank
555
        let total_blocks_info = workers
Yan Ru Pei's avatar
Yan Ru Pei committed
556
            .get(&best_worker.worker_id)
557
558
559
560
561
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

562
563
564
565
566
567
568
        let tree_size = request
            .overlaps
            .tree_sizes
            .get(&best_worker)
            .copied()
            .unwrap_or(0);

569
        tracing::info!(
570
            "Selected worker: worker_id={} dp_rank={:?}, logit: {:.3}, cached blocks: {}, tree size: {}{}",
Yan Ru Pei's avatar
Yan Ru Pei committed
571
572
            best_worker.worker_id,
            best_worker.dp_rank,
573
574
            best_logit,
            best_overlap,
575
            tree_size,
576
            total_blocks_info
577
        );
578
579

        Ok(WorkerSelectionResult {
Yan Ru Pei's avatar
Yan Ru Pei committed
580
            worker: best_worker,
581
            required_blocks: request_blocks as u64,
Yan Ru Pei's avatar
Yan Ru Pei committed
582
            overlap_blocks: overlaps.get(&best_worker).copied().unwrap_or(0),
583
        })
584
585
    }
}
586
587
588
589
590

#[cfg(test)]
mod tests {
    use super::*;

    /// A map with a single worker must always yield that worker, whatever the temperature or
    /// the logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let worker = WorkerWithDpRank::from_worker_id(42);

        // Different temperatures; the logit value itself doesn't matter.
        let logits = HashMap::from([(worker, 0.5)]);
        for temperature in [0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, temperature);
            assert_eq!(result.len(), 1, "Should return exactly one worker");
            assert_eq!(result[0], worker, "Should return the only available worker");
        }

        // Different logit values: very negative, very positive, and zero.
        for logit in [-100.0, 100.0, 0.0] {
            let logits = HashMap::from([(worker, logit)]);
            let result = softmax_sample(&logits, 1.0);
            assert_eq!(result.len(), 1);
            assert_eq!(result[0], worker);
        }
    }

    /// With temperature 0, `softmax_sample` returns all keys with the smallest logit.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let worker1 = WorkerWithDpRank::from_worker_id(1);
        let worker2 = WorkerWithDpRank::from_worker_id(2);
        let worker3 = WorkerWithDpRank::from_worker_id(3);
        let worker4 = WorkerWithDpRank::from_worker_id(4);

        // worker2 holds the unique smallest logit.
        let logits = HashMap::from([
            (worker1, 5.0),
            (worker2, 3.0),
            (worker3, 7.0),
            (worker4, 3.5),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            1,
            "Should return one worker when there's no tie"
        );
        assert_eq!(
            result[0], worker2,
            "Should return worker with smallest logit when temperature is 0"
        );

        // worker2 and worker5 are tied for the smallest logit.
        let worker5 = WorkerWithDpRank::from_worker_id(5);
        let worker6 = WorkerWithDpRank::from_worker_id(6);
        let logits = HashMap::from([
            (worker1, 5.0),
            (worker2, 3.0),
            (worker5, 3.0),
            (worker6, 7.0),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            2,
            "Should return all workers with smallest logit when tied"
        );
        assert!(
            result.contains(&worker2) && result.contains(&worker5),
            "Should contain both tied workers"
        );

        // Negative values: worker20 holds the smallest logit.
        let worker10 = WorkerWithDpRank::from_worker_id(10);
        let worker20 = WorkerWithDpRank::from_worker_id(20);
        let worker30 = WorkerWithDpRank::from_worker_id(30);
        let logits = HashMap::from([(worker10, -1.0), (worker20, -5.0), (worker30, 0.0)]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0], worker20,
            "Should handle negative logits correctly"
        );
    }
}