scheduler.rs 25.4 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
// SPDX-License-Identifier: Apache-2.0
3

4
use crate::discovery::RuntimeConfigs;
5
use crate::local_model::runtime_config::ModelRuntimeConfig;
6
use anyhow::Result;
7
use dynamo_runtime::component::Component;
Yan Ru Pei's avatar
Yan Ru Pei committed
8
use dynamo_runtime::traits::DistributedRuntimeProvider;
9
use dynamo_runtime::transports::event_plane::EventPublisher;
10
use rand::Rng;
11
use serde::{Deserialize, Serialize};
12
use std::collections::{HashMap, HashSet};
13
14
use std::sync::Arc;
use std::time::Duration;
15

16
17
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
18
use super::RouterConfigOverride;
19
use super::WorkerSelector;
20
use super::protocols::{DpRank, OverlapScores, WorkerId, WorkerSelectionResult, WorkerWithDpRank};
21
use super::sequence::{ActiveSequencesMultiWorker, SequenceError};
22

23
use dynamo_tokens::SequenceHash;
24

25
26
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
27
28
29
    pub worker_id: WorkerId,
    #[serde(default)]
    pub dp_rank: DpRank,
30
    pub isl_blocks: usize,
31
    pub overlap_blocks: u32,
32
33
}

34
35
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
36
37
    pub worker_id: WorkerId,
    pub dp_rank: DpRank,
38
39
40
41
    pub potential_prefill_tokens: usize,
    pub potential_decode_blocks: usize,
}

42
43
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
44
    #[error("no endpoints available to route work")]
45
46
47
48
    NoEndpoints,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
49
50
51

    #[error("failed to initialize event publisher: {0}")]
    InitFailed(String),
52
53
}

54
55
#[derive(Debug)]
pub struct SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
56
    pub best_worker: WorkerWithDpRank,
57
    pub overlap_blocks: u32,
58
59
}

60
pub struct SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
61
    pub maybe_request_id: Option<String>,
62
    pub token_seq: Option<Vec<SequenceHash>>,
63
    pub isl_tokens: usize,
64
    pub overlaps: OverlapScores,
Yan Ru Pei's avatar
Yan Ru Pei committed
65
66
    pub decode_blocks: HashMap<WorkerWithDpRank, usize>,
    pub prefill_tokens: HashMap<WorkerWithDpRank, usize>,
67
68
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
69
70
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
71
72
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
73
74
75
}

impl SchedulingRequest {
76
77
78
79
80
81
82
83
84
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
85
86
87
88
89
90
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
91
    slots: Arc<ActiveSequencesMultiWorker>,
92
93
94
95
}

impl KvScheduler {
    pub async fn start(
96
        component: Component,
97
        block_size: u32,
98
        workers_with_configs: Arc<RuntimeConfigs>,
99
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
100
        replica_sync: bool,
101
        router_id: u64,
102
    ) -> Result<Self, KvSchedulerError> {
103
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
104

105
        // Get initial workers from DashMap for slot initialization.
106
        // Caller must ensure at least one worker is present (via wait_for_some).
107
        let initial_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_with_configs
108
            .configs
109
110
111
            .iter()
            .map(|r| (*r.key(), r.value().clone()))
            .collect();
112

113
114
115
116
117
118
119
120
121
122
123
        let slots = Arc::new(
            ActiveSequencesMultiWorker::new(
                component.clone(),
                block_size as usize,
                initial_workers,
                replica_sync,
                router_id,
            )
            .await
            .map_err(|e| KvSchedulerError::InitFailed(e.to_string()))?,
        );
124

125
        // Spawn background task to sync slots with DashMap when notified of changes.
126
        // ModelManager's watcher updates the DashMap and notifies; we wait on watch receiver here.
Yan Ru Pei's avatar
Yan Ru Pei committed
127
        let slots_monitor = slots.clone();
128
129
130
        let subscriber = workers_with_configs.subscribe();
        let configs_monitor = subscriber.configs;
        let mut change_rx = subscriber.change_rx;
131
        let monitor_cancel_token = component.drt().child_token();
132
        tokio::spawn(async move {
133
134
135
            tracing::trace!("KvScheduler workers monitoring task started");
            let mut last_workers: HashSet<WorkerId> = HashSet::new();

136
            loop {
137
                // Wait for notification or cancellation
Yan Ru Pei's avatar
Yan Ru Pei committed
138
139
                tokio::select! {
                    _ = monitor_cancel_token.cancelled() => {
140
                        tracing::trace!("KvScheduler workers monitoring task shutting down");
141
142
                        break;
                    }
143
144
145
146
147
148
                    result = change_rx.changed() => {
                        if result.is_err() {
                            tracing::warn!("KvScheduler: config watch sender dropped, shutting down");
                            break;
                        }
                    }
149
150
                }

151
152
                // Get current workers from DashMap
                let current_workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> =
153
                    configs_monitor
154
155
156
157
158
159
160
161
162
163
                        .iter()
                        .map(|r| (*r.key(), r.value().clone()))
                        .collect();
                let current_worker_ids: HashSet<WorkerId> =
                    current_workers.keys().copied().collect();

                // Only update slots if workers have changed
                if current_worker_ids != last_workers {
                    slots_monitor.update_workers(current_workers);
                    last_workers = current_worker_ids;
Yan Ru Pei's avatar
Yan Ru Pei committed
164
165
166
167
168
169
170
171
                }
            }
        });

        let slots_clone = slots.clone();
        let workers_scheduler = workers_with_configs.clone();
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
        let scheduler_cancel_token = component.drt().primary_token();
172
173
174
175
        let hit_rate_publisher =
            EventPublisher::for_namespace(component.namespace(), KV_HIT_RATE_SUBJECT)
                .await
                .map_err(|e| KvSchedulerError::InitFailed(e.to_string()))?;
Yan Ru Pei's avatar
Yan Ru Pei committed
176
177
178
179
180
181
182
183
184
185
186

        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
            tracing::trace!("scheduler background task started");

            loop {
                // Check for cancellation at beginning of loop
                if scheduler_cancel_token.is_cancelled() {
                    tracing::trace!("scheduler background task shutting down");
                    break;
187
188
189
                }

                // Wait for a new request
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

206
207
                // Read the current workers configuration from DashMap
                let workers: HashMap<WorkerId, Option<ModelRuntimeConfig>> = workers_scheduler
208
                    .configs
209
210
211
                    .iter()
                    .map(|r| (*r.key(), r.value().clone()))
                    .collect();
Yan Ru Pei's avatar
Yan Ru Pei committed
212
213

                match selector.select_worker(&workers, &request, block_size) {
214
                    Ok(selection) => {
Yan Ru Pei's avatar
Yan Ru Pei committed
215
                        let event = KVHitRateEvent {
Yan Ru Pei's avatar
Yan Ru Pei committed
216
217
                            worker_id: selection.worker.worker_id,
                            dp_rank: selection.worker.dp_rank,
218
219
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
220
                        };
221
                        if let Err(e) = hit_rate_publisher.publish(&event).await {
Yan Ru Pei's avatar
Yan Ru Pei committed
222
                            tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
223
                        }
224
225

                        let response = SchedulingResponse {
Yan Ru Pei's avatar
Yan Ru Pei committed
226
                            best_worker: selection.worker,
227
228
229
230
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

231
232
233
                        // Skip state update if not requested
                        if !request.update_states {
                            continue;
234
                        }
235

Yan Ru Pei's avatar
Yan Ru Pei committed
236
237
238
239
240
241
242
                        let Some(request_id) = request.maybe_request_id else {
                            tracing::error!(
                                "No request_id provided to add_request to the slot tracker"
                            );
                            continue;
                        };

243
244
245
246
247
248
                        if let Err(e) = slots_clone
                            .add_request(
                                request_id.clone(),
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
249
                                None, // expected_output_tokens not available in scheduler loop
Yan Ru Pei's avatar
Yan Ru Pei committed
250
                                selection.worker,
251
252
253
                            )
                            .await
                        {
254
                            tracing::warn!("Failed to add request {request_id}: {e}");
255
                        }
256
257
258
259
260
261
262
263
264
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
265
266
267
268
                    }
                }
            }

269
            tracing::trace!("background endpoint subscriber shutting down");
270
271
        });

272
        Ok(KvScheduler { request_tx, slots })
273
274
275
276
    }

    pub async fn schedule(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
277
        maybe_request_id: Option<String>,
278
        isl_tokens: usize,
279
        token_seq: Option<Vec<SequenceHash>>,
280
        overlaps: OverlapScores,
281
        router_config_override: Option<&RouterConfigOverride>,
282
        update_states: bool,
Yan Ru Pei's avatar
Yan Ru Pei committed
283
    ) -> Result<WorkerWithDpRank, KvSchedulerError> {
284
285
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
286
            maybe_request_id,
287
            token_seq,
288
            isl_tokens,
289
            overlaps,
290
291
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
292
            router_config_override: router_config_override.cloned(),
293
            update_states,
294
            resp_tx: Some(resp_tx), // Wrap in Some()
295
        };
296

297
298
299
300
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
301
        let response = resp_rx
302
303
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
304

Yan Ru Pei's avatar
Yan Ru Pei committed
305
        Ok(response.best_worker)
306
307
    }

308
309
310
    pub async fn add_request(
        &self,
        request_id: String,
311
        token_sequence: Option<Vec<SequenceHash>>,
312
313
        isl: usize,
        overlap: u32,
314
        expected_output_tokens: Option<u32>,
Yan Ru Pei's avatar
Yan Ru Pei committed
315
        worker: WorkerWithDpRank,
316
317
    ) -> Result<(), SequenceError> {
        self.slots
318
319
320
321
322
323
324
325
            .add_request(
                request_id,
                token_sequence,
                isl,
                overlap,
                expected_output_tokens,
                worker,
            )
326
            .await
327
328
    }

329
    /// Forwards a prefill-completed notification for `request_id` to the slot
    /// tracker and returns its result.
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<(), SequenceError> {
        // The tracker takes an owned-string reference, so copy the id first.
        let owned_id = request_id.to_string();
        self.slots.mark_prefill_completed(&owned_id).await
    }

335
    /// Forwards a `free` call for `request_id` to the slot tracker and returns
    /// its result.
    pub async fn free(&self, request_id: &str) -> Result<(), SequenceError> {
        // The tracker takes an owned-string reference, so copy the id first.
        let owned_id = request_id.to_string();
        self.slots.free(&owned_id).await
    }
338

339
340
341
342
343
344
345
346
347
348
    /// Forwards an output-block notification for `request_id`, along with the
    /// optional `decay_fraction`, to the slot tracker and returns its result.
    pub async fn add_output_block(
        &self,
        request_id: &str,
        decay_fraction: Option<f64>,
    ) -> Result<(), SequenceError> {
        // The tracker takes an owned-string reference, so copy the id first.
        let owned_id = request_id.to_string();
        self.slots.add_output_block(&owned_id, decay_fraction).await
    }

349
350
    pub async fn get_potential_loads(
        &self,
351
        token_seq: Option<Vec<SequenceHash>>,
352
353
354
355
356
357
358
359
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

Yan Ru Pei's avatar
Yan Ru Pei committed
360
361
362
363
        // Get all unique WorkerWithDpRank from both hashmaps
        let mut workers: HashSet<WorkerWithDpRank> = HashSet::new();
        workers.extend(decode_blocks.keys().copied());
        workers.extend(prefill_tokens.keys().copied());
364
365
366

        // Create PotentialLoad for each worker
        let mut loads = Vec::new();
Yan Ru Pei's avatar
Yan Ru Pei committed
367
        for worker in workers {
368
            loads.push(PotentialLoad {
Yan Ru Pei's avatar
Yan Ru Pei committed
369
370
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
371
                potential_prefill_tokens: prefill_tokens
Yan Ru Pei's avatar
Yan Ru Pei committed
372
                    .get(&worker)
373
374
                    .copied()
                    .unwrap_or(isl_tokens),
Yan Ru Pei's avatar
Yan Ru Pei committed
375
                potential_decode_blocks: decode_blocks.get(&worker).copied().unwrap_or(0),
376
377
378
379
380
            });
        }

        loads
    }
381
382
}

383
// Helper function for softmax sampling
384
385
386
387
388
// Returns a vec of workers: multiple if tied, single if sampled
fn softmax_sample(
    logits: &HashMap<WorkerWithDpRank, f64>,
    temperature: f64,
) -> Vec<WorkerWithDpRank> {
389
390
391
392
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

393
    // Guard: if temperature is 0, return all keys with the smallest logit value (ties)
394
395
396
397
398
399
400
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
401
            .filter(|&(_, &v)| v == min_logit)
402
403
404
            .map(|(k, _)| *k)
            .collect();

405
        return min_keys;
406
407
    }

408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
424
425
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
448
            return vec![keys[i]];
449
450
451
452
        }
    }

    // Fallback to last key (shouldn't normally reach here)
453
    vec![keys[keys.len() - 1]]
454
455
}

456
// Default implementation matching the Python _cost_function
457
458
459
460
461
462
463
464
465
466
467
468
/// Built-in [`WorkerSelector`], used by `KvScheduler::start` when the caller
/// supplies none.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    /// Router settings consulted for the overlap weight and the sampling
    /// temperature when a request carries no override.
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    /// Creates a selector from `kv_router_config`, using
    /// `KvRouterConfig::default()` when `None` is passed.
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        let kv_router_config = kv_router_config.unwrap_or_default();
        Self { kv_router_config }
    }
}
469
470
471
472

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
473
        workers: &HashMap<WorkerId, Option<ModelRuntimeConfig>>,
474
        request: &SchedulingRequest,
475
        block_size: u32,
476
477
478
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

479
        if workers.is_empty() {
480
481
482
            return Err(KvSchedulerError::NoEndpoints);
        }

483
484
485
486
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

487
488
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
489

490
        let mut worker_logits = HashMap::new();
491

492
493
494
495
496
497
498
        // Use override if provided, otherwise use default config
        let overlap_weight = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.overlap_score_weight)
            .unwrap_or(self.kv_router_config.overlap_score_weight);

Yan Ru Pei's avatar
Yan Ru Pei committed
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
        // Calculate logits for each worker with dp_rank
        // Outer loop: iterate over all workers from runtime config
        // Inner loop: iterate over all dp_ranks for each worker
        for (worker_id, config) in workers.iter() {
            // Get data_parallel_size from runtime config
            // data_parallel_size defaults to 1 in ModelRuntimeConfig
            let data_parallel_size = config.as_ref().map(|c| c.data_parallel_size).unwrap_or(1); // Fallback if config is None

            // Iterate over all dp_ranks for this worker
            for dp_rank in 0..data_parallel_size {
                let worker = WorkerWithDpRank::new(*worker_id, dp_rank);

                // Get overlap for this worker (defaults to 0 if not in overlaps)
                let overlap = *overlaps.get(&worker).unwrap_or(&0);

                // this is the number of prefill tokens the worker would have if the request were scheduled there
                let prefill_token = *prefill_tokens.get(&worker).unwrap_or(&isl);
                let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

                // this is the number of decode blocks the worker would have if the request were scheduled there
                let decode_block = *decode_blocks
                    .get(&worker)
                    .unwrap_or(&(potential_prefill_block.floor() as usize))
                    as f64;

                // Calculate logit (lower is better)
                let logit = overlap_weight * potential_prefill_block + decode_block;

                worker_logits.insert(worker, logit);

                tracing::info!(
                    "Formula for worker_id={} dp_rank={:?} with {overlap} cached blocks: {logit:.3} \
                     = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                     = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}",
                    worker.worker_id,
                    worker.dp_rank
                );
            }
537
538
        }

539
        // Use softmax sampling to select worker(s)
540
541
542
543
544
545
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
        let candidates = softmax_sample(&worker_logits, temperature);

        // If multiple candidates (tied), use tree size as tie-breaker
        // If tree sizes are also equal, min_by_key uses HashMap iteration order (pseudo-random)
        let best_worker = if candidates.len() > 1 {
            tracing::info!("Multiple workers tied with same logit, using tree size as tie-breaker");
            *candidates
                .iter()
                .min_by_key(|worker| {
                    request
                        .overlaps
                        .tree_sizes
                        .get(worker)
                        .copied()
                        .unwrap_or(0)
                })
                .expect("candidates should not be empty")
        } else {
            candidates[0]
        };

Yan Ru Pei's avatar
Yan Ru Pei committed
567
568
569
        let best_logit = worker_logits[&best_worker];

        let best_overlap = *overlaps.get(&best_worker).unwrap_or(&0);
570

Yan Ru Pei's avatar
Yan Ru Pei committed
571
        // this is a runtime config set on a per worker basis, not per dp-rank
572
        let total_blocks_info = workers
Yan Ru Pei's avatar
Yan Ru Pei committed
573
            .get(&best_worker.worker_id)
574
575
576
577
578
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

579
580
581
582
583
584
585
        let tree_size = request
            .overlaps
            .tree_sizes
            .get(&best_worker)
            .copied()
            .unwrap_or(0);

586
        tracing::info!(
587
            "Selected worker: worker_id={} dp_rank={:?}, logit: {:.3}, cached blocks: {}, tree size: {}{}",
Yan Ru Pei's avatar
Yan Ru Pei committed
588
589
            best_worker.worker_id,
            best_worker.dp_rank,
590
591
            best_logit,
            best_overlap,
592
            tree_size,
593
            total_blocks_info
594
        );
595
596

        Ok(WorkerSelectionResult {
Yan Ru Pei's avatar
Yan Ru Pei committed
597
            worker: best_worker,
598
            required_blocks: request_blocks as u64,
Yan Ru Pei's avatar
Yan Ru Pei committed
599
            overlap_blocks: overlaps.get(&best_worker).copied().unwrap_or(0),
600
        })
601
602
    }
}
603
604
605
606
607

#[cfg(test)]
mod tests {
    use super::*;

    /// A map with a single entry must always yield that entry, whatever the
    /// temperature or the logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let worker = WorkerWithDpRank::from_worker_id(42);

        // Different temperatures; the logit value doesn't matter.
        let logits = HashMap::from([(worker, 0.5)]);
        for temperature in [0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, temperature);
            assert_eq!(result.len(), 1, "Should return exactly one worker");
            assert_eq!(result[0], worker, "Should return the only available worker");
        }

        // Different logit values: very negative, very positive, zero.
        for logit in [-100.0, 100.0, 0.0] {
            let logits = HashMap::from([(worker, logit)]);
            let result = softmax_sample(&logits, 1.0);
            assert_eq!(result.len(), 1);
            assert_eq!(result[0], worker);
        }
    }

    /// With temperature 0 the function returns every key holding the smallest
    /// logit: one key when the minimum is unique, all tied keys otherwise.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let worker1 = WorkerWithDpRank::from_worker_id(1);
        let worker2 = WorkerWithDpRank::from_worker_id(2);
        let worker3 = WorkerWithDpRank::from_worker_id(3);
        let worker4 = WorkerWithDpRank::from_worker_id(4);

        // Unique minimum: worker2 has the smallest logit.
        let logits = HashMap::from([
            (worker1, 5.0),
            (worker2, 3.0),
            (worker3, 7.0),
            (worker4, 3.5),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            1,
            "Should return one worker when there's no tie"
        );
        assert_eq!(
            result[0], worker2,
            "Should return worker with smallest logit when temperature is 0"
        );

        // Tied minimum: worker2 and worker5 share the smallest logit.
        let worker5 = WorkerWithDpRank::from_worker_id(5);
        let worker6 = WorkerWithDpRank::from_worker_id(6);
        let logits = HashMap::from([
            (worker1, 5.0),
            (worker2, 3.0),
            (worker5, 3.0),
            (worker6, 7.0),
        ]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(
            result.len(),
            2,
            "Should return all workers with smallest logit when tied"
        );
        assert!(
            result.contains(&worker2) && result.contains(&worker5),
            "Should contain both tied workers"
        );

        // Negative values: worker20 has the smallest logit.
        let worker10 = WorkerWithDpRank::from_worker_id(10);
        let worker20 = WorkerWithDpRank::from_worker_id(20);
        let worker30 = WorkerWithDpRank::from_worker_id(30);
        let logits = HashMap::from([(worker10, -1.0), (worker20, -5.0), (worker30, 0.0)]);
        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result[0], worker20,
            "Should handle negative logits correctly"
        );
    }
}