scheduler.rs 25.3 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
// SPDX-License-Identifier: Apache-2.0
3

4
use crate::discovery::RuntimeConfigsWithNotify;
5
use crate::local_model::runtime_config::ModelRuntimeConfig;
6
use anyhow::Result;
7
use dynamo_runtime::component::Component;
Yan Ru Pei's avatar
Yan Ru Pei committed
8
use dynamo_runtime::traits::DistributedRuntimeProvider;
Neelay Shah's avatar
Neelay Shah committed
9
use dynamo_runtime::traits::events::EventPublisher;
10
use rand::Rng;
11
use serde::{Deserialize, Serialize};
12
use std::collections::{HashMap, HashSet};
13
14
use std::sync::Arc;
use std::time::Duration;
15

16
17
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
18
use super::RouterConfigOverride;
19
use super::WorkerSelector;
20
use super::indexer::OverlapScores;
Yan Ru Pei's avatar
Yan Ru Pei committed
21
use super::protocols::{DpRank, WorkerId, WorkerSelectionResult, WorkerWithDpRank};
22
use super::sequence::{ActiveSequencesMultiWorker, SequenceError};
23

24
use crate::tokens::SequenceHash;
25

26
27
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
28
29
30
    pub worker_id: WorkerId,
    #[serde(default)]
    pub dp_rank: DpRank,
31
    pub isl_blocks: usize,
32
    pub overlap_blocks: u32,
33
34
}

35
36
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
37
38
    pub worker_id: WorkerId,
    pub dp_rank: DpRank,
39
40
41
42
    pub potential_prefill_tokens: usize,
    pub potential_decode_blocks: usize,
}

43
44
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
45
    #[error("no endpoints available to route work")]
46
47
48
49
50
51
52
53
54
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
}

55
56
#[derive(Debug)]
pub struct SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
57
    pub best_worker: WorkerWithDpRank,
58
    pub overlap_blocks: u32,
59
60
}

61
pub struct SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
62
    pub maybe_request_id: Option<String>,
63
    pub token_seq: Option<Vec<SequenceHash>>,
64
    pub isl_tokens: usize,
65
    pub overlaps: OverlapScores,
Yan Ru Pei's avatar
Yan Ru Pei committed
66
67
    pub decode_blocks: HashMap<WorkerWithDpRank, usize>,
    pub prefill_tokens: HashMap<WorkerWithDpRank, usize>,
68
69
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
70
71
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
72
73
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
74
75
76
}

impl SchedulingRequest {
77
78
79
80
81
82
83
84
85
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
86
87
88
89
90
91
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
92
    slots: Arc<ActiveSequencesMultiWorker>,
93
94
95
96
}

impl KvScheduler {
    pub async fn start(
97
        component: Component,
98
        block_size: u32,
99
        workers_with_configs: Arc<RuntimeConfigsWithNotify>,
100
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
101
        replica_sync: bool,
102
        router_uuid: String,
103
    ) -> Result<Self, KvSchedulerError> {
104
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
105

106
107
        // Get initial workers from DashMap for slot initialization.
        // ModelManager guarantees at least one worker is present before KvRouter::new() is called.
108
        let initial_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_with_configs
109
            .configs
110
111
112
            .iter()
            .map(|r| (*r.key(), r.value().clone()))
            .collect();
113

114
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
Yan Ru Pei's avatar
Yan Ru Pei committed
115
            component.clone(),
116
            block_size as usize,
117
            initial_workers,
118
            replica_sync,
119
            router_uuid,
120
        ));
121

122
123
        // Spawn background task to sync slots with DashMap when notified of changes.
        // ModelManager's watcher updates the DashMap and notifies; we wait on notify here.
Yan Ru Pei's avatar
Yan Ru Pei committed
124
        let slots_monitor = slots.clone();
125
        let workers_monitor = workers_with_configs.clone();
126
        let monitor_cancel_token = component.drt().child_token();
127
        tokio::spawn(async move {
128
129
130
            tracing::trace!("KvScheduler workers monitoring task started");
            let mut last_workers: HashSet<WorkerId> = HashSet::new();

131
            loop {
132
                // Wait for notification or cancellation
Yan Ru Pei's avatar
Yan Ru Pei committed
133
134
                tokio::select! {
                    _ = monitor_cancel_token.cancelled() => {
135
                        tracing::trace!("KvScheduler workers monitoring task shutting down");
136
137
                        break;
                    }
138
                    _ = workers_monitor.notify.notified() => {}
139
140
                }

141
142
143
                // Get current workers from DashMap
                let current_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> =
                    workers_monitor
144
                        .configs
145
146
147
148
149
150
151
152
153
154
                        .iter()
                        .map(|r| (*r.key(), r.value().clone()))
                        .collect();
                let current_worker_ids: HashSet<WorkerId> =
                    current_workers.keys().copied().collect();

                // Only update slots if workers have changed
                if current_worker_ids != last_workers {
                    slots_monitor.update_workers(current_workers);
                    last_workers = current_worker_ids;
Yan Ru Pei's avatar
Yan Ru Pei committed
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
                }
            }
        });

        let slots_clone = slots.clone();
        let workers_scheduler = workers_with_configs.clone();
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
        let scheduler_cancel_token = component.drt().primary_token();
        let ns_clone = component.namespace().clone();

        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
            tracing::trace!("scheduler background task started");

            loop {
                // Check for cancellation at beginning of loop
                if scheduler_cancel_token.is_cancelled() {
                    tracing::trace!("scheduler background task shutting down");
                    break;
175
176
177
                }

                // Wait for a new request
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

194
195
                // Read the current workers configuration from DashMap
                let workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_scheduler
196
                    .configs
197
198
199
                    .iter()
                    .map(|r| (*r.key(), r.value().clone()))
                    .collect();
Yan Ru Pei's avatar
Yan Ru Pei committed
200
201

                match selector.select_worker(&workers, &request, block_size) {
202
                    Ok(selection) => {
Yan Ru Pei's avatar
Yan Ru Pei committed
203
                        let event = KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
204
205
                            worker_id: selection.worker.worker_id,
                            dp_rank: selection.worker.dp_rank,
206
207
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
208
209
210
                        };
                        if let Err(e) = ns_clone.publish(KV_HIT_RATE_SUBJECT, &event).await {
                            tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
211
                        }
212
213

                        let response = SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
214
                            best_worker: selection.worker,
215
216
217
218
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

219
220
221
                        // Skip state update if not requested
                        if !request.update_states {
                            continue;
222
                        }
223

Yan Ru Pei's avatar
Yan Ru Pei committed
224
225
226
227
228
229
230
                        let Some(request_id) = request.maybe_request_id else {
                            tracing::error!(
                                "No request_id provided to add_request to the slot tracker"
                            );
                            continue;
                        };

231
232
233
234
235
236
                        if let Err(e) = slots_clone
                            .add_request(
                                request_id.clone(),
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
237
                                None, // expected_output_tokens not available in scheduler loop
Yan Ru Pei's avatar
Yan Ru Pei committed
238
                                selection.worker,
239
240
241
                            )
                            .await
                        {
242
                            tracing::warn!("Failed to add request {request_id}: {e}");
243
                        }
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
259
260
261
262
                    }
                }
            }

263
            tracing::trace!("background endpoint subscriber shutting down");
264
265
        });

266
        Ok(KvScheduler { request_tx, slots })
267
268
269
270
    }

    pub async fn schedule(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
271
        maybe_request_id: Option<String>,
272
        isl_tokens: usize,
273
        token_seq: Option<Vec<SequenceHash>>,
274
        overlaps: OverlapScores,
275
        router_config_override: Option<&RouterConfigOverride>,
276
        update_states: bool,
Yan Ru Pei's avatar
Yan Ru Pei committed
277
    ) -> Result<WorkerWithDpRank, KvSchedulerError> {
278
279
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
280
            maybe_request_id,
281
            token_seq,
282
            isl_tokens,
283
            overlaps,
284
285
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
286
            router_config_override: router_config_override.cloned(),
287
            update_states,
288
            resp_tx: Some(resp_tx), // Wrap in Some()
289
        };
290

291
292
293
294
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
295
        let response = resp_rx
296
297
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
298

Yan Ru Pei's avatar
Yan Ru Pei committed
299
        Ok(response.best_worker)
300
301
    }

302
303
304
    pub async fn add_request(
        &self,
        request_id: String,
305
        token_sequence: Option<Vec<SequenceHash>>,
306
307
        isl: usize,
        overlap: u32,
308
        expected_output_tokens: Option<u32>,
Yan Ru Pei's avatar
Yan Ru Pei committed
309
        worker: WorkerWithDpRank,
310
311
    ) -> Result<(), SequenceError> {
        self.slots
312
313
314
315
316
317
318
319
            .add_request(
                request_id,
                token_sequence,
                isl,
                overlap,
                expected_output_tokens,
                worker,
            )
320
            .await
321
322
    }

323
    /// Tells the slot tracker that prefill has finished for `request_id`.
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<(), SequenceError> {
        // The tracker is called with a reference to an owned id.
        let owned_id = request_id.to_string();
        self.slots.mark_prefill_completed(&owned_id).await
    }

329
    /// Releases the slot tracker state held for `request_id`.
    pub async fn free(&self, request_id: &str) -> Result<(), SequenceError> {
        let owned_id = request_id.to_string();
        self.slots.free(&owned_id).await
    }
332

333
334
335
336
337
338
339
340
341
342
    /// Forwards an output-block notification for `request_id` to the slot
    /// tracker, passing `decay_fraction` through unchanged.
    pub async fn add_output_block(
        &self,
        request_id: &str,
        decay_fraction: Option<f64>,
    ) -> Result<(), SequenceError> {
        let owned_id = request_id.to_string();
        self.slots.add_output_block(&owned_id, decay_fraction).await
    }

343
344
    pub async fn get_potential_loads(
        &self,
345
        token_seq: Option<Vec<SequenceHash>>,
346
347
348
349
350
351
352
353
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

Yan Ru Pei's avatar
Yan Ru Pei committed
354
355
356
357
        // Get all unique WorkerWithDpRank from both hashmaps
        let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
        workers.extend(decode_blocks.keys().copied());
        workers.extend(prefill_tokens.keys().copied());
358
359
360

        // Create PotentialLoad for each worker
        let mut loads = Vec::new();
Yan Ru Pei's avatar
Yan Ru Pei committed
361
        for worker in workers {
362
            loads.push(PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
363
364
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
365
                potential_prefill_tokens: prefill_tokens
Yan Ru Pei's avatar
Yan Ru Pei committed
366
                    .get(&worker)
367
368
                    .copied()
                    .unwrap_or(isl_tokens),
Yan Ru Pei's avatar
Yan Ru Pei committed
369
                potential_decode_blocks: decode_blocks.get(&worker).copied().unwrap_or(0),
370
371
372
373
374
            });
        }

        loads
    }
375
376
}

377
// Helper function for softmax sampling
378
379
380
381
382
// Returns a vec of workers: multiple if tied, single if sampled
fn softmax_sample(
    logits: &HashMap<WorkerWithDpRank, f64>,
    temperature: f64,
) -> Vec<WorkerWithDpRank> {
383
384
385
386
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

387
    // Guard: if temperature is 0, return all keys with the smallest logit value (ties)
388
389
390
391
392
393
394
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
395
            .filter(|&(_, &v)| v == min_logit)
396
397
398
            .map(|(k, _)| *k)
            .collect();

399
        return min_keys;
400
401
    }

402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
418
419
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
442
            return vec![keys[i]];
443
444
445
446
        }
    }

    // Fallback to last key (shouldn't normally reach here)
447
    vec![keys[keys.len() - 1]]
448
449
}

450
// Default implementation matching the Python _cost_function
451
452
453
454
455
456
457
458
459
460
461
462
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    /// Creates a selector from `kv_router_config`, falling back to
    /// `KvRouterConfig::default()` when `None` is passed.
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        let kv_router_config = kv_router_config.unwrap_or_default();
        Self { kv_router_config }
    }
}
463
464
465
466

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
467
        workers: &HashMap<WorkerId, Option<ModelRuntimeConfig>>,
468
        request: &SchedulingRequest,
469
        block_size: u32,
470
471
472
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

473
        if workers.is_empty() {
474
475
476
            return Err(KvSchedulerError::NoEndpoints);
        }

477
478
479
480
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

481
482
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
483

484
        let mut worker_logits = HashMap::new();
485

Yan Ru Pei's avatar
Yan Ru Pei committed
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
        // Calculate logits for each worker with dp_rank
        // Outer loop: iterate over all workers from runtime config
        // Inner loop: iterate over all dp_ranks for each worker
        for (worker_id, config) in workers.iter() {
            // Get data_parallel_size from runtime config
            // data_parallel_size defaults to 1 in ModelRuntimeConfig
            let data_parallel_size = config.as_ref().map(|c| c.data_parallel_size).unwrap_or(1); // Fallback if config is None

            // Iterate over all dp_ranks for this worker
            for dp_rank in 0..data_parallel_size {
                let worker = WorkerWithDpRank::new(*worker_id, dp_rank);

                // Get overlap for this worker (defaults to 0 if not in overlaps)
                let overlap = *overlaps.get(&worker).unwrap_or(&0);

                // this is the number of prefill tokens the worker would have if the request were scheduled there
                let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
                let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

                // this is the number of decode blocks the worker would have if the request were scheduled there
                let decode_block = *decode_blocks
                    .get(&worker)
                    .unwrap_or(&(potential_prefill_block.floor() as usize))
                    as f64;

                // Use override if provided, otherwise use default config
                let overlap_weight = request
                    .router_config_override
                    .as_ref()
                    .and_then(|cfg| cfg.overlap_score_weight)
                    .unwrap_or(self.kv_router_config.overlap_score_weight);

                // Calculate logit (lower is better)
                let logit = overlap_weight * potential_prefill_block + decode_block;

                worker_logits.insert(worker, logit);

                tracing::info!(
                    "Formula for worker_id={} dp_rank={:?} with {overlap} cached blocks: {logit:.3} \
                     = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                     = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}",
                    worker.worker_id,
                    worker.dp_rank
                );
            }
531
532
        }

533
        // Use softmax sampling to select worker(s)
534
535
536
537
538
539
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
        let candidates = softmax_sample(&worker_logits, temperature);

        // If multiple candidates (tied), use tree size as tie-breaker
        // If tree sizes are also equal, min_by_key uses HashMap iteration order (pseudo-random)
        let best_worker = if candidates.len() > 1 {
            tracing::info!("Multiple workers tied with same logit, using tree size as tie-breaker");
            *candidates
                .iter()
                .min_by_key(|worker| {
                    request
                        .overlaps
                        .tree_sizes
                        .get(worker)
                        .copied()
                        .unwrap_or(0)
                })
                .expect("candidates should not be empty")
        } else {
            candidates[0]
        };

Yan Ru Pei's avatar
Yan Ru Pei committed
561
562
563
        let best_logit = worker_logits[&best_worker];

        let best_overlap = *overlaps.get(&best_worker).unwrap_or(&0);
564

Yan Ru Pei's avatar
Yan Ru Pei committed
565
        // this is a runtime config set on a per worker basis, not per dp-rank
566
        let total_blocks_info = workers
Yan Ru Pei's avatar
Yan Ru Pei committed
567
            .get(&best_worker.worker_id)
568
569
570
571
572
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

573
574
575
576
577
578
579
        let tree_size = request
            .overlaps
            .tree_sizes
            .get(&best_worker)
            .copied()
            .unwrap_or(0);

580
        tracing::info!(
581
            "Selected worker: worker_id={} dp_rank={:?}, logit: {:.3}, cached blocks: {}, tree size: {}{}",
Yan Ru Pei's avatar
Yan Ru Pei committed
582
583
            best_worker.worker_id,
            best_worker.dp_rank,
584
585
            best_logit,
            best_overlap,
586
            tree_size,
587
            total_blocks_info
588
        );
589
590

        Ok(WorkerSelectionResult {
Yan Ru Pei's avatar
Yan Ru Pei committed
591
            worker: best_worker,
592
            required_blocks: request_blocks as u64,
Yan Ru Pei's avatar
Yan Ru Pei committed
593
            overlap_blocks: overlaps.get(&best_worker).copied().unwrap_or(0),
594
        })
595
596
    }
}
597
598
599
600
601

#[cfg(test)]
mod tests {
    use super::*;

602
603
604
605
    /// With a single candidate, `softmax_sample` must return it whatever the
    /// temperature or the logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let worker = WorkerWithDpRank::from_worker_id(42);

        // Vary the temperature; the logit value itself is irrelevant here.
        let logits = HashMap::from([(worker, 0.5)]);
        for temperature in [0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, temperature);
            assert_eq!(result.len(), 1, "Should return exactly one worker");
            assert_eq!(result[0], worker, "Should return the only available worker");
        }

        // Vary the logit: very negative, very positive, and zero.
        for logit in [-100.0, 100.0, 0.0] {
            let logits = HashMap::from([(worker, logit)]);
            let result = softmax_sample(&logits, 1.0);
            assert_eq!(result.len(), 1);
            assert_eq!(result[0], worker);
        }
    }

636
637
    /// At temperature 0, `softmax_sample` returns every worker holding the
    /// smallest logit: one worker normally, several when tied.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let worker1 = WorkerWithDpRank::from_worker_id(1);
        let worker2 = WorkerWithDpRank::from_worker_id(2);
        let worker3 = WorkerWithDpRank::from_worker_id(3);
        let worker4 = WorkerWithDpRank::from_worker_id(4);

        // Unique minimum: worker2 holds the smallest logit.
        let logits = HashMap::from([
            (worker1, 5.0),
            (worker2, 3.0),
            (worker3, 7.0),
            (worker4, 3.5),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            1,
            "Should return one worker when there's no tie"
        );
        assert_eq!(
            result[0], worker2,
            "Should return worker with smallest logit when temperature is 0"
        );

        // Tied minimum: worker2 and worker5 share the smallest logit.
        let worker5 = WorkerWithDpRank::from_worker_id(5);
        let worker6 = WorkerWithDpRank::from_worker_id(6);
        let logits = HashMap::from([
            (worker1, 5.0),
            (worker2, 3.0),
            (worker5, 3.0),
            (worker6, 7.0),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            2,
            "Should return all workers with smallest logit when tied"
        );
        assert!(
            result.contains(&worker2) && result.contains(&worker5),
            "Should contain both tied workers"
        );

        // Negative values: worker20 holds the smallest logit.
        let worker10 = WorkerWithDpRank::from_worker_id(10);
        let worker20 = WorkerWithDpRank::from_worker_id(20);
        let worker30 = WorkerWithDpRank::from_worker_id(30);
        let logits = HashMap::from([(worker10, -1.0), (worker20, -5.0), (worker30, 0.0)]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0], worker20,
            "Should handle negative logits correctly"
        );
    }
}