// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

4
use crate::discovery::RuntimeConfigWatch;
5
use crate::local_model::runtime_config::ModelRuntimeConfig;
6
use anyhow::Result;
7
use dynamo_runtime::component::Component;
Yan Ru Pei's avatar
Yan Ru Pei committed
8
use dynamo_runtime::traits::DistributedRuntimeProvider;
9
use rand::Rng;
10
use serde::{Deserialize, Serialize};
11
use std::collections::{HashMap, HashSet};
12
13
use std::sync::Arc;
use std::time::Duration;
14
15
#[cfg(feature = "bench")]
use std::time::Instant;
16

17
use super::KvRouterConfig;
18
use super::RouterConfigOverride;
19
use super::WorkerSelector;
20
use super::protocols::{DpRank, OverlapScores, WorkerId, WorkerSelectionResult, WorkerWithDpRank};
21
use super::sequence::{ActiveSequencesMultiWorker, SequenceError};
22

23
use dynamo_tokens::SequenceHash;
24

25
26
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
27
28
    pub worker_id: WorkerId,
    pub dp_rank: DpRank,
29
30
31
32
    pub potential_prefill_tokens: usize,
    pub potential_decode_blocks: usize,
}

33
34
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
35
    #[error("no endpoints available to route work")]
36
37
38
39
    NoEndpoints,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
40
41
42

    #[error("failed to initialize event publisher: {0}")]
    InitFailed(String),
43
44
}

45
46
#[derive(Debug)]
pub struct SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
47
    pub best_worker: WorkerWithDpRank,
48
    pub overlap_blocks: u32,
49
50
}

51
pub struct SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
52
    pub maybe_request_id: Option<String>,
53
    pub token_seq: Option<Vec<SequenceHash>>,
54
    pub isl_tokens: usize,
55
    pub overlaps: OverlapScores,
Yan Ru Pei's avatar
Yan Ru Pei committed
56
57
    pub decode_blocks: HashMap<WorkerWithDpRank, usize>,
    pub prefill_tokens: HashMap<WorkerWithDpRank, usize>,
58
59
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
60
61
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
62
63
    // LORA adapter name extracted from request.model field
    pub lora_name: Option<String>,
64
65
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
66
67
68
}

impl SchedulingRequest {
69
70
71
72
73
74
75
76
77
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
78
79
80
81
82
83
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
84
    slots: Arc<ActiveSequencesMultiWorker>,
85
86
87
88
}

impl KvScheduler {
    pub async fn start(
89
        component: Component,
90
        block_size: u32,
91
        workers_with_configs: RuntimeConfigWatch,
92
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
93
        replica_sync: bool,
94
        router_id: u64,
95
        worker_type: &'static str,
96
    ) -> Result<Self, KvSchedulerError> {
97
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
98

99
100
101
102
        // Get initial workers from watch receiver.
        // Caller must ensure at least one worker is present (via wait_for).
        let initial_workers: HashMap<WorkerId, ModelRuntimeConfig> =
            workers_with_configs.borrow().clone();
103

104
105
106
107
108
109
110
        let slots = Arc::new(
            ActiveSequencesMultiWorker::new(
                component.clone(),
                block_size as usize,
                initial_workers,
                replica_sync,
                router_id,
111
                worker_type,
112
113
114
115
            )
            .await
            .map_err(|e| KvSchedulerError::InitFailed(e.to_string()))?,
        );
116

117
        // Spawn background task to sync slots when the watch value changes.
Yan Ru Pei's avatar
Yan Ru Pei committed
118
        let slots_monitor = slots.clone();
119
        let mut monitor_rx = workers_with_configs.clone();
120
        let monitor_cancel_token = component.drt().child_token();
121
        tokio::spawn(async move {
122
            tracing::trace!("KvScheduler workers monitoring task started");
123
            let mut last_workers: HashMap<WorkerId, ModelRuntimeConfig> = HashMap::new();
124

125
            loop {
Yan Ru Pei's avatar
Yan Ru Pei committed
126
127
                tokio::select! {
                    _ = monitor_cancel_token.cancelled() => {
128
                        tracing::trace!("KvScheduler workers monitoring task shutting down");
129
130
                        break;
                    }
131
                    result = monitor_rx.changed() => {
132
133
134
135
136
                        if result.is_err() {
                            tracing::warn!("KvScheduler: config watch sender dropped, shutting down");
                            break;
                        }
                    }
137
138
                }

139
140
141
142
143
                let current_workers = monitor_rx.borrow_and_update().clone();

                if current_workers != last_workers {
                    slots_monitor.update_workers(current_workers.clone());
                    last_workers = current_workers;
Yan Ru Pei's avatar
Yan Ru Pei committed
144
145
146
147
148
                }
            }
        });

        let slots_clone = slots.clone();
149
        let scheduler_rx = workers_with_configs.clone();
Yan Ru Pei's avatar
Yan Ru Pei committed
150
151
152
153
154
155
156
157
158
159
160
161
162
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
        let scheduler_cancel_token = component.drt().primary_token();

        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
            tracing::trace!("scheduler background task started");

            loop {
                // Check for cancellation at beginning of loop
                if scheduler_cancel_token.is_cancelled() {
                    tracing::trace!("scheduler background task shutting down");
                    break;
163
164
165
                }

                // Wait for a new request
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

182
183
                // Read the current workers configuration from watch receiver
                let workers: HashMap<WorkerId, ModelRuntimeConfig> = scheduler_rx.borrow().clone();
Yan Ru Pei's avatar
Yan Ru Pei committed
184
185

                match selector.select_worker(&workers, &request, block_size) {
186
187
                    Ok(selection) => {
                        let response = SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
188
                            best_worker: selection.worker,
189
190
191
192
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

193
194
195
                        // Skip state update if not requested
                        if !request.update_states {
                            continue;
196
                        }
197

Yan Ru Pei's avatar
Yan Ru Pei committed
198
199
200
201
202
203
204
                        let Some(request_id) = request.maybe_request_id else {
                            tracing::error!(
                                "No request_id provided to add_request to the slot tracker"
                            );
                            continue;
                        };

205
206
207
208
209
210
                        if let Err(e) = slots_clone
                            .add_request(
                                request_id.clone(),
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
211
                                None, // expected_output_tokens not available in scheduler loop
Yan Ru Pei's avatar
Yan Ru Pei committed
212
                                selection.worker,
213
                                request.lora_name.clone(),
214
215
216
                            )
                            .await
                        {
217
                            tracing::warn!("Failed to add request {request_id}: {e}");
218
                        }
219
220
221
222
223
224
225
226
227
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
228
229
230
231
                    }
                }
            }

232
            tracing::trace!("background endpoint subscriber shutting down");
233
234
        });

235
        Ok(KvScheduler { request_tx, slots })
236
237
    }

238
    #[allow(clippy::too_many_arguments)]
239
240
    pub async fn schedule(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
241
        maybe_request_id: Option<String>,
242
        isl_tokens: usize,
243
        token_seq: Option<Vec<SequenceHash>>,
244
        overlaps: OverlapScores,
245
        router_config_override: Option<&RouterConfigOverride>,
246
        update_states: bool,
247
        lora_name: Option<String>,
Yan Ru Pei's avatar
Yan Ru Pei committed
248
    ) -> Result<WorkerWithDpRank, KvSchedulerError> {
249
250
251
        #[cfg(feature = "bench")]
        let start = Instant::now();

252
253
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
254
            maybe_request_id,
255
            token_seq,
256
            isl_tokens,
257
            overlaps,
258
259
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
260
            router_config_override: router_config_override.cloned(),
261
            update_states,
262
            lora_name,
263
            resp_tx: Some(resp_tx), // Wrap in Some()
264
        };
265

266
267
268
269
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
270
271
272
273

        #[cfg(feature = "bench")]
        let send_elapsed = start.elapsed();

274
        let response = resp_rx
275
276
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
277

278
279
280
281
282
283
284
285
286
287
        #[cfg(feature = "bench")]
        let total_elapsed = start.elapsed();
        #[cfg(feature = "bench")]
        tracing::info!(
            isl_tokens,
            send_us = send_elapsed.as_micros() as u64,
            total_us = total_elapsed.as_micros() as u64,
            "scheduler.schedule completed"
        );

Yan Ru Pei's avatar
Yan Ru Pei committed
288
        Ok(response.best_worker)
289
290
    }

291
    #[allow(clippy::too_many_arguments)]
292
293
294
    pub async fn add_request(
        &self,
        request_id: String,
295
        token_sequence: Option<Vec<SequenceHash>>,
296
297
        isl: usize,
        overlap: u32,
298
        expected_output_tokens: Option<u32>,
Yan Ru Pei's avatar
Yan Ru Pei committed
299
        worker: WorkerWithDpRank,
300
        lora_name: Option<String>,
301
302
    ) -> Result<(), SequenceError> {
        self.slots
303
304
305
306
307
308
309
            .add_request(
                request_id,
                token_sequence,
                isl,
                overlap,
                expected_output_tokens,
                worker,
310
                lora_name,
311
            )
312
            .await
313
314
    }

315
    /// Forwards a prefill-completed notification for `request_id` to the slot
    /// tracker and returns its result.
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<(), SequenceError> {
        // The slot tracker is called with a reference to an owned `String`.
        let request_id = request_id.to_string();
        self.slots.mark_prefill_completed(&request_id).await
    }

321
    /// Asks the slot tracker to free `request_id` and returns its result.
    pub async fn free(&self, request_id: &str) -> Result<(), SequenceError> {
        // The slot tracker is called with a reference to an owned `String`.
        let request_id = request_id.to_string();
        self.slots.free(&request_id).await
    }
324

325
326
327
328
329
330
    /// Returns the worker type of this scheduler ("prefill" or "decode"), as
    /// reported by the slot tracker.
    ///
    /// Used for Prometheus metric labeling.
    pub fn worker_type(&self) -> &'static str {
        self.slots.worker_type()
    }

331
332
333
334
335
336
337
338
339
340
    /// Forwards an output-block notification for `request_id` to the slot
    /// tracker; `decay_fraction` is passed through unchanged.
    pub async fn add_output_block(
        &self,
        request_id: &str,
        decay_fraction: Option<f64>,
    ) -> Result<(), SequenceError> {
        let owned_id = request_id.to_string();
        self.slots.add_output_block(&owned_id, decay_fraction).await
    }

341
342
    pub async fn get_potential_loads(
        &self,
343
        token_seq: Option<Vec<SequenceHash>>,
344
345
346
347
348
349
350
351
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

Yan Ru Pei's avatar
Yan Ru Pei committed
352
353
354
355
        // Get all unique WorkerWithDpRank from both hashmaps
        let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
        workers.extend(decode_blocks.keys().copied());
        workers.extend(prefill_tokens.keys().copied());
356
357
358

        // Create PotentialLoad for each worker
        let mut loads = Vec::new();
Yan Ru Pei's avatar
Yan Ru Pei committed
359
        for worker in workers {
360
            loads.push(PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
361
362
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
363
                potential_prefill_tokens: prefill_tokens
Yan Ru Pei's avatar
Yan Ru Pei committed
364
                    .get(&worker)
365
366
                    .copied()
                    .unwrap_or(isl_tokens),
Yan Ru Pei's avatar
Yan Ru Pei committed
367
                potential_decode_blocks: decode_blocks.get(&worker).copied().unwrap_or(0),
368
369
370
371
372
            });
        }

        loads
    }
373
374
375
376
377

    /// Returns the active request counts keyed by LORA name, as reported by
    /// the slot tracker.
    pub fn get_active_lora_counts(&self) -> HashMap<String, usize> {
        self.slots.get_active_lora_counts()
    }
378
379
}

380
// Helper function for softmax sampling
381
382
383
384
385
// Returns a vec of workers: multiple if tied, single if sampled
fn softmax_sample(
    logits: &HashMap<WorkerWithDpRank, f64>,
    temperature: f64,
) -> Vec<WorkerWithDpRank> {
386
387
388
389
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

390
    // Guard: if temperature is 0, return all keys with the smallest logit value (ties)
391
392
393
394
395
396
397
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
398
            .filter(|&(_, &v)| v == min_logit)
399
400
401
            .map(|(k, _)| *k)
            .collect();

402
        return min_keys;
403
404
    }

405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
421
422
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
445
            return vec![keys[i]];
446
447
448
449
        }
    }

    // Fallback to last key (shouldn't normally reach here)
450
    vec![keys[keys.len() - 1]]
451
452
}

453
// Default implementation matching the Python _cost_function
454
455
456
457
458
459
460
461
462
463
464
465
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    /// Builds a selector from `kv_router_config`, falling back to
    /// `KvRouterConfig::default()` when `None` is given.
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        let kv_router_config = kv_router_config.unwrap_or_default();
        Self { kv_router_config }
    }
}
466
467
468
469

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
470
        workers: &HashMap<WorkerId, ModelRuntimeConfig>,
471
        request: &SchedulingRequest,
472
        block_size: u32,
473
474
475
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

476
        if workers.is_empty() {
477
478
479
            return Err(KvSchedulerError::NoEndpoints);
        }

480
481
482
483
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

484
485
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
486

487
        let mut worker_logits = HashMap::new();
488

489
490
491
492
493
494
495
        // Use override if provided, otherwise use default config
        let overlap_weight = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.overlap_score_weight)
            .unwrap_or(self.kv_router_config.overlap_score_weight);

Yan Ru Pei's avatar
Yan Ru Pei committed
496
497
498
499
        // Calculate logits for each worker with dp_rank
        // Outer loop: iterate over all workers from runtime config
        // Inner loop: iterate over all dp_ranks for each worker
        for (worker_id, config) in workers.iter() {
500
            let data_parallel_size = config.data_parallel_size;
Yan Ru Pei's avatar
Yan Ru Pei committed
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530

            for dp_rank in 0..data_parallel_size {
                let worker = WorkerWithDpRank::new(*worker_id, dp_rank);

                // Get overlap for this worker (defaults to 0 if not in overlaps)
                let overlap = *overlaps.get(&worker).unwrap_or(&0);

                // this is the number of prefill tokens the worker would have if the request were scheduled there
                let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
                let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

                // this is the number of decode blocks the worker would have if the request were scheduled there
                let decode_block = *decode_blocks
                    .get(&worker)
                    .unwrap_or(&(potential_prefill_block.floor() as usize))
                    as f64;

                // Calculate logit (lower is better)
                let logit = overlap_weight * potential_prefill_block + decode_block;

                worker_logits.insert(worker, logit);

                tracing::info!(
                    "Formula for worker_id={} dp_rank={:?} with {overlap} cached blocks: {logit:.3} \
                     = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                     = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}",
                    worker.worker_id,
                    worker.dp_rank
                );
            }
531
532
        }

533
        // Use softmax sampling to select worker(s)
534
535
536
537
538
539
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
540
541
542
        let candidates = softmax_sample(&worker_logits, temperature);

        // If multiple candidates (tied), use tree size as tie-breaker
543
        // If tree sizes are also equal, use random selection to avoid bias
544
545
        let best_worker = if candidates.len() > 1 {
            tracing::info!("Multiple workers tied with same logit, using tree size as tie-breaker");
546
            let tree_sizes: Vec<(usize, &WorkerWithDpRank)> = candidates
547
                .iter()
548
549
550
551
552
553
554
555
556
                .map(|w| (request.overlaps.tree_sizes.get(w).copied().unwrap_or(0), w))
                .collect();

            if tree_sizes.iter().all(|(s, _)| *s == tree_sizes[0].0) {
                let idx = rand::rng().random_range(0..candidates.len());
                candidates[idx]
            } else {
                *tree_sizes.iter().min_by_key(|(s, _)| *s).unwrap().1
            }
557
558
559
560
        } else {
            candidates[0]
        };

Yan Ru Pei's avatar
Yan Ru Pei committed
561
562
563
        let best_logit = worker_logits[&best_worker];

        let best_overlap = *overlaps.get(&best_worker).unwrap_or(&0);
564

Yan Ru Pei's avatar
Yan Ru Pei committed
565
        // this is a runtime config set on a per worker basis, not per dp-rank
566
        let total_blocks_info = workers
Yan Ru Pei's avatar
Yan Ru Pei committed
567
            .get(&best_worker.worker_id)
568
569
570
571
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

572
573
574
575
576
577
578
        let tree_size = request
            .overlaps
            .tree_sizes
            .get(&best_worker)
            .copied()
            .unwrap_or(0);

579
        tracing::info!(
580
            "Selected worker: worker_id={} dp_rank={:?}, logit: {:.3}, cached blocks: {}, tree size: {}{}",
Yan Ru Pei's avatar
Yan Ru Pei committed
581
582
            best_worker.worker_id,
            best_worker.dp_rank,
583
584
            best_logit,
            best_overlap,
585
            tree_size,
586
            total_blocks_info
587
        );
588
589

        Ok(WorkerSelectionResult {
Yan Ru Pei's avatar
Yan Ru Pei committed
590
            worker: best_worker,
591
            required_blocks: request_blocks as u64,
Yan Ru Pei's avatar
Yan Ru Pei committed
592
            overlap_blocks: overlaps.get(&best_worker).copied().unwrap_or(0),
593
        })
594
595
    }
}
596
597
598
599
600

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_softmax_sample_single_key() {
        // Test that with a single key, softmax_sample always returns that key
        let mut logits = HashMap::new();
        let worker = WorkerWithDpRank::from_worker_id(42);
        logits.insert(worker, 0.5); // The value doesn't matter

        // Test with different temperatures
        for temperature in &[0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, *temperature);
            assert_eq!(result.len(), 1, "Should return exactly one worker");
            assert_eq!(result[0], worker, "Should return the only available worker");
        }

        // Test with different logit values
        logits.clear();
        logits.insert(worker, -100.0); // Very negative value
        let result = softmax_sample(&logits, 1.0);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], worker);

        logits.clear();
        logits.insert(worker, 100.0); // Very positive value
        let result = softmax_sample(&logits, 1.0);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], worker);

        logits.clear();
        logits.insert(worker, 0.0); // Zero value
        let result = softmax_sample(&logits, 1.0);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0], worker);
    }

    #[test]
    fn test_softmax_sample_zero_temperature() {
        // Test that with temperature 0, softmax_sample returns all keys with smallest logit
        let mut logits = HashMap::new();
        let worker1 = WorkerWithDpRank::from_worker_id(1);
        let worker2 = WorkerWithDpRank::from_worker_id(2);
        let worker3 = WorkerWithDpRank::from_worker_id(3);
        let worker4 = WorkerWithDpRank::from_worker_id(4);
        logits.insert(worker1, 5.0);
        logits.insert(worker2, 3.0); // This has the smallest logit
        logits.insert(worker3, 7.0);
        logits.insert(worker4, 3.5);

        // With temperature 0, should always return only worker2 (smallest logit)
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            1,
            "Should return one worker when there's no tie"
        );
        assert_eq!(
            result[0], worker2,
            "Should return worker with smallest logit when temperature is 0"
        );

        // Test with tied minimum logits
        logits.clear();
        let worker5 = WorkerWithDpRank::from_worker_id(5);
        let worker6 = WorkerWithDpRank::from_worker_id(6);
        logits.insert(worker1, 5.0);
        logits.insert(worker2, 3.0); // Tied for smallest
        logits.insert(worker5, 3.0); // Tied for smallest
        logits.insert(worker6, 7.0);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            2,
            "Should return all workers with smallest logit when tied"
        );
        assert!(
            result.contains(&worker2) && result.contains(&worker5),
            "Should contain both tied workers"
        );

        // Test with negative values
        logits.clear();
        let worker10 = WorkerWithDpRank::from_worker_id(10);
        let worker20 = WorkerWithDpRank::from_worker_id(20);
        let worker30 = WorkerWithDpRank::from_worker_id(30);
        logits.insert(worker10, -1.0);
        logits.insert(worker20, -5.0); // This has the smallest logit
        logits.insert(worker30, 0.0);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0], worker20,
            "Should handle negative logits correctly"
        );
    }
}