// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

4
use crate::local_model::runtime_config::ModelRuntimeConfig;
5
use anyhow::Result;
6
use dynamo_runtime::component::{Component, Instance};
Yan Ru Pei's avatar
Yan Ru Pei committed
7
use dynamo_runtime::traits::DistributedRuntimeProvider;
Neelay Shah's avatar
Neelay Shah committed
8
use dynamo_runtime::traits::events::EventPublisher;
9
use rand::Rng;
10
use serde::{Deserialize, Serialize};
11
use std::collections::{HashMap, HashSet};
12
13
use std::sync::Arc;
use std::time::Duration;
Yan Ru Pei's avatar
Yan Ru Pei committed
14
use tokio::sync::{RwLock, watch};
15

16
17
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
18
use super::RouterConfigOverride;
19
use super::WorkerSelector;
20
use super::indexer::OverlapScores;
21
use super::protocols::WorkerSelectionResult;
22
23
use super::sequence::ActiveSequencesMultiWorker;

24
use crate::tokens::SequenceHash;
25

26
27
28
29
/// Event describing a single routing decision.
///
/// The scheduler task publishes one of these on `KV_HIT_RATE_SUBJECT` after a
/// worker has been selected for a request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
    /// Id of the worker the request was routed to.
    pub worker_id: i64,
    /// Number of blocks the request requires (the selection's `required_blocks`).
    pub isl_blocks: usize,
    /// Overlap block count reported by the selection for the chosen worker.
    pub overlap_blocks: u32,
}

33
34
35
36
37
38
39
/// Projected load of one worker if a given request were routed to it.
///
/// Produced by `KvScheduler::get_potential_loads`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
    /// Id of the worker this projection refers to.
    pub worker_id: i64,
    /// Prefill tokens the worker would have; falls back to the request's
    /// `isl_tokens` when the slot tracker reports nothing for the worker.
    pub potential_prefill_tokens: usize,
    /// Decode blocks the worker would have; falls back to 0 when the slot
    /// tracker reports nothing for the worker.
    pub potential_decode_blocks: usize,
}

40
41
42
43
44
45
46
47
48
49
50
51
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
    #[error("no endpoints aviailable to route work")]
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
}

52
53
54
/// Result of a scheduling decision, sent back to the caller of
/// `KvScheduler::schedule` through the request's one-shot channel.
#[derive(Debug)]
pub struct SchedulingResponse {
    /// Id of the worker chosen for the request.
    pub best_worker_id: i64,
    /// Overlap block count reported by the selection for the chosen worker.
    pub overlap_blocks: u32,
}

58
pub struct SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
59
    pub maybe_request_id: Option<String>,
60
    pub token_seq: Option<Vec<SequenceHash>>,
61
    pub isl_tokens: usize,
62
    pub overlaps: OverlapScores,
63
64
    pub decode_blocks: HashMap<i64, usize>,
    pub prefill_tokens: HashMap<i64, usize>,
65
66
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
67
68
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
69
70
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
71
72
73
}

impl SchedulingRequest {
74
75
76
77
78
79
80
81
82
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
83
84
85
86
87
88
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
89
    slots: Arc<ActiveSequencesMultiWorker>,
90
91
92
93
}

impl KvScheduler {
    pub async fn start(
94
        component: Component,
95
        block_size: u32,
Yan Ru Pei's avatar
Yan Ru Pei committed
96
97
        instances_rx: watch::Receiver<Vec<Instance>>,
        runtime_configs_rx: watch::Receiver<HashMap<i64, ModelRuntimeConfig>>,
98
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
99
        replica_sync: bool,
100
        router_uuid: String,
101
    ) -> Result<Self, KvSchedulerError> {
102
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
Yan Ru Pei's avatar
Yan Ru Pei committed
103
104
        let instances: Vec<Instance> = instances_rx.borrow().clone();
        let runtime_configs: HashMap<i64, ModelRuntimeConfig> = runtime_configs_rx.borrow().clone();
105

Yan Ru Pei's avatar
Yan Ru Pei committed
106
107
108
109
110
111
112
113
        // Create shared workers_with_configs wrapped in Arc<RwLock>
        let workers_with_configs: Arc<RwLock<HashMap<i64, Option<ModelRuntimeConfig>>>> = {
            let mut initial_map = HashMap::new();
            for instance in &instances {
                let worker_id = instance.instance_id;
                let config = runtime_configs.get(&worker_id).cloned();
                if config.is_some() {
                    tracing::info!("Runtime config found for worker_id: {}", worker_id);
114
                }
Yan Ru Pei's avatar
Yan Ru Pei committed
115
                initial_map.insert(worker_id, config);
116
            }
Yan Ru Pei's avatar
Yan Ru Pei committed
117
118
            Arc::new(RwLock::new(initial_map))
        };
119

120
121
122
123
124
        let worker_ids: Vec<i64> = instances
            .iter()
            .map(|instance| instance.instance_id)
            .collect();
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
Yan Ru Pei's avatar
Yan Ru Pei committed
125
            component.clone(),
126
            block_size as usize,
127
            worker_ids,
128
            replica_sync,
129
            router_uuid,
130
        ));
131

Yan Ru Pei's avatar
Yan Ru Pei committed
132
133
134
135
136
137
        // Spawn background task to monitor and update workers_with_configs
        let workers_monitor = workers_with_configs.clone();
        let slots_monitor = slots.clone();
        let mut instances_monitor_rx = instances_rx.clone();
        let mut configs_monitor_rx = runtime_configs_rx.clone();
        let monitor_cancel_token = component.drt().primary_token();
138
        tokio::spawn(async move {
Yan Ru Pei's avatar
Yan Ru Pei committed
139
            tracing::trace!("workers monitoring task started");
140
            loop {
Yan Ru Pei's avatar
Yan Ru Pei committed
141
142
143
144
                // Wait for either instances or configs to change
                tokio::select! {
                    _ = monitor_cancel_token.cancelled() => {
                        tracing::trace!("workers monitoring task shutting down");
145
146
                        break;
                    }
Yan Ru Pei's avatar
Yan Ru Pei committed
147
148
149
150
151
                    result = instances_monitor_rx.changed() => {
                        if result.is_err() {
                            tracing::warn!("endpoint watch sender shutdown in monitor");
                            break;
                        }
152
                    }
Yan Ru Pei's avatar
Yan Ru Pei committed
153
154
155
156
157
                    result = configs_monitor_rx.changed() => {
                        if result.is_err() {
                            tracing::warn!("runtime configs watch sender shutdown in monitor");
                            break;
                        }
158
159
160
                    }
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
                // Get the latest values from both channels
                let new_instances = instances_monitor_rx.borrow_and_update().clone();
                let new_configs = configs_monitor_rx.borrow_and_update().clone();

                // Update workers when instances change
                let worker_ids: Vec<i64> = new_instances
                    .iter()
                    .map(|instance| instance.instance_id)
                    .collect();
                slots_monitor.update_workers(worker_ids);

                // Update the shared workers_with_configs
                let mut workers_map = workers_monitor.write().await;
                workers_map.clear();
                for instance in &new_instances {
                    let worker_id = instance.instance_id;
                    let config = new_configs.get(&worker_id).cloned();
                    if config.is_some() {
                        tracing::info!("Runtime config found for worker_id: {}", worker_id);
180
                    }
Yan Ru Pei's avatar
Yan Ru Pei committed
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
                    workers_map.insert(worker_id, config);
                }
                tracing::trace!(
                    "Updated workers_with_configs with {} workers",
                    workers_map.len()
                );
            }
            tracing::trace!("workers monitoring task shutting down");
        });

        let slots_clone = slots.clone();
        let workers_scheduler = workers_with_configs.clone();
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
        let scheduler_cancel_token = component.drt().primary_token();
        let ns_clone = component.namespace().clone();

        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
            tracing::trace!("scheduler background task started");

            loop {
                // Check for cancellation at beginning of loop
                if scheduler_cancel_token.is_cancelled() {
                    tracing::trace!("scheduler background task shutting down");
                    break;
207
208
209
                }

                // Wait for a new request
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

Yan Ru Pei's avatar
Yan Ru Pei committed
226
227
228
229
                // Read the current workers configuration
                let workers = workers_scheduler.read().await.clone();

                match selector.select_worker(&workers, &request, block_size) {
230
                    Ok(selection) => {
Yan Ru Pei's avatar
Yan Ru Pei committed
231
                        let event = KVHitRateEvent {
232
233
234
                            worker_id: selection.worker_id,
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
Yan Ru Pei's avatar
Yan Ru Pei committed
235
236
237
                        };
                        if let Err(e) = ns_clone.publish(KV_HIT_RATE_SUBJECT, &event).await {
                            tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
238
                        }
239
240
241
242
243
244
245

                        let response = SchedulingResponse {
                            best_worker_id: selection.worker_id,
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

246
247
248
                        // Skip state update if not requested
                        if !request.update_states {
                            continue;
249
                        }
250

Yan Ru Pei's avatar
Yan Ru Pei committed
251
252
253
254
255
256
257
                        let Some(request_id) = request.maybe_request_id else {
                            tracing::error!(
                                "No request_id provided to add_request to the slot tracker"
                            );
                            continue;
                        };

258
259
260
261
262
263
264
265
266
267
268
269
270
271
                        if let Err(e) = slots_clone
                            .add_request(
                                request_id.clone(),
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
                                selection.worker_id,
                            )
                            .await
                        {
                            tracing::warn!(
                                "Failed to add request {request_id} to local slot tracker: {e:?}"
                            );
                        }
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
287
288
289
290
                    }
                }
            }

291
            tracing::trace!("background endpoint subscriber shutting down");
292
293
        });

294
        Ok(KvScheduler { request_tx, slots })
295
296
297
298
    }

    pub async fn schedule(
        &self,
Yan Ru Pei's avatar
Yan Ru Pei committed
299
        maybe_request_id: Option<String>,
300
        isl_tokens: usize,
301
        token_seq: Option<Vec<SequenceHash>>,
302
        overlaps: OverlapScores,
303
        router_config_override: Option<&RouterConfigOverride>,
304
        update_states: bool,
GuanLuo's avatar
GuanLuo committed
305
    ) -> Result<i64, KvSchedulerError> {
306
307
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
Yan Ru Pei's avatar
Yan Ru Pei committed
308
            maybe_request_id,
309
            token_seq,
310
            isl_tokens,
311
            overlaps,
312
313
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
314
            router_config_override: router_config_override.cloned(),
315
            update_states,
316
            resp_tx: Some(resp_tx), // Wrap in Some()
317
        };
318

319
320
321
322
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
323
        let response = resp_rx
324
325
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
326

327
328
        let best_worker_id = response.best_worker_id;
        Ok(best_worker_id)
329
330
    }

331
332
333
    pub async fn add_request(
        &self,
        request_id: String,
334
        token_sequence: Option<Vec<SequenceHash>>,
335
336
337
338
339
340
341
342
343
344
        isl: usize,
        overlap: u32,
        worker_id: i64,
    ) {
        let _ = self
            .slots
            .add_request(request_id, token_sequence, isl, overlap, worker_id)
            .await;
    }

345
346
    /// Forwards a prefill-completed notification for `request_id` to the slot
    /// tracker and returns its result.
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<()> {
        let request_id = request_id.to_string();
        self.slots.mark_prefill_completed(&request_id).await
    }

351
352
    /// Asks the slot tracker to free `request_id` and returns its result.
    pub async fn free(&self, request_id: &str) -> Result<()> {
        let request_id = request_id.to_string();
        self.slots.free(&request_id).await
    }
354
355
356

    pub async fn get_potential_loads(
        &self,
357
        token_seq: Option<Vec<SequenceHash>>,
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

        // Get all unique worker IDs from both hashmaps
        let mut worker_ids: HashSet<i64> = HashSet::new();
        worker_ids.extend(decode_blocks.keys().copied());
        worker_ids.extend(prefill_tokens.keys().copied());

        // Create PotentialLoad for each worker
        let mut loads = Vec::new();
        for worker_id in worker_ids {
            loads.push(PotentialLoad {
                worker_id,
                potential_prefill_tokens: prefill_tokens
                    .get(&worker_id)
                    .copied()
                    .unwrap_or(isl_tokens),
                potential_decode_blocks: decode_blocks.get(&worker_id).copied().unwrap_or(0),
            });
        }

        loads
    }
386
387
}

388
389
390
391
392
393
// Helper function for softmax sampling
fn softmax_sample(logits: &HashMap<i64, f64>, temperature: f64) -> i64 {
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

394
395
396
397
398
399
400
401
    // Guard: if temperature is 0, return the key with the smallest logit value
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
402
            .filter(|&(_, &v)| v == min_logit)
403
404
405
406
407
408
409
410
411
            .map(|(k, _)| *k)
            .collect();

        // Randomly select from the minimum keys (handles single key case naturally)
        let mut rng = rand::rng();
        let index = rng.random_range(0..min_keys.len());
        return min_keys[index];
    }

412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
428
429
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
            return keys[i];
        }
    }

    // Fallback to last key (shouldn't normally reach here)
    keys[keys.len() - 1]
}

460
// Default implementation matching the Python _cost_function
461
462
463
464
465
466
467
468
469
470
471
472
/// Worker selector that scores each worker from its potential prefill and
/// decode load and samples one with `softmax_sample`.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    /// Router settings; supplies the overlap score weight and the sampling
    /// temperature unless a request carries an override.
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    /// Builds a selector from `kv_router_config`, falling back to the default
    /// router configuration when `None` is given.
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        let kv_router_config = kv_router_config.unwrap_or_default();
        Self { kv_router_config }
    }
}
473
474
475
476

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
477
        workers: &HashMap<i64, Option<ModelRuntimeConfig>>,
478
        request: &SchedulingRequest,
479
        block_size: u32,
480
481
482
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

483
        if workers.is_empty() {
484
485
486
            return Err(KvSchedulerError::NoEndpoints);
        }

487
488
489
490
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

491
492
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
493

494
        let mut worker_logits = HashMap::new();
495
        let mut max_logit = f64::NEG_INFINITY;
496

497
        // Calculate logits for each worker
498
499
        for worker_id in workers.keys() {
            let overlap = *overlaps.get(worker_id).unwrap_or(&0);
500
501

            // this is the number of prefill tokens the worker would have if the request were scheduled there
502
            let prefill_token = *prefill_tokens.get(worker_id).unwrap_or(&isl);
503
504
505
506
            let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

            // this is the number of decode blocks the worker would have if the request were scheduled there
            let decode_block = *decode_blocks
507
                .get(worker_id)
508
509
                .unwrap_or(&(potential_prefill_block.floor() as usize))
                as f64;
510

511
512
513
514
515
516
517
            // Use override if provided, otherwise use default config
            let overlap_weight = request
                .router_config_override
                .as_ref()
                .and_then(|cfg| cfg.overlap_score_weight)
                .unwrap_or(self.kv_router_config.overlap_score_weight);

518
            // Calculate logit (lower is better)
519
            let logit = overlap_weight * potential_prefill_block + decode_block;
520
            max_logit = max_logit.max(logit);
521

522
            worker_logits.insert(*worker_id, logit);
523
524

            tracing::info!(
525
526
527
                "Formula for {worker_id} with {overlap} cached blocks: {logit:.3} \
                 = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                 = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}"
528
529
530
            );
        }

531
        // Use softmax sampling to select worker
532
533
534
535
536
537
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
538
539
540
        let best_worker_id = softmax_sample(&worker_logits, temperature);
        let best_logit = worker_logits[&best_worker_id];

541
542
543
544
545
546
547
548
        let best_overlap = *overlaps.get(&best_worker_id).unwrap_or(&0);
        let total_blocks_info = workers
            .get(&best_worker_id)
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

549
        tracing::info!(
550
            "Selected worker: {}, logit: {:.3}, cached blocks: {}{}",
551
            best_worker_id,
552
553
554
            best_logit,
            best_overlap,
            total_blocks_info
555
        );
556
557

        Ok(WorkerSelectionResult {
558
559
            worker_id: best_worker_id,
            required_blocks: request_blocks as u64,
560
            overlap_blocks: overlaps.get(&best_worker_id).copied().unwrap_or(0),
561
        })
562
563
    }
}
564
565
566
567
568

#[cfg(test)]
mod tests {
    use super::*;

    /// With a single candidate, sampling must return it regardless of the
    /// temperature or the logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let worker_id = 42;
        let mut logits = HashMap::new();
        logits.insert(worker_id, 0.5); // The value doesn't matter

        // Test with different temperatures
        for temperature in [0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, temperature);
            assert_eq!(result, worker_id, "Should return the only available worker");
        }

        // Test with different logit values: very negative, very positive, zero
        for logit in [-100.0, 100.0, 0.0] {
            logits.clear();
            logits.insert(worker_id, logit);
            assert_eq!(softmax_sample(&logits, 1.0), worker_id);
        }
    }

    /// With temperature 0, sampling is greedy and returns the key with the
    /// smallest logit.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let mut logits = HashMap::new();
        logits.insert(1, 5.0);
        logits.insert(2, 3.0); // This has the smallest logit
        logits.insert(3, 7.0);
        logits.insert(4, 3.5);

        // With temperature 0, should always return worker 2 (smallest logit)
        for _ in 0..10 {
            let result = softmax_sample(&logits, 0.0);
            assert_eq!(
                result, 2,
                "Should return worker with smallest logit when temperature is 0"
            );
        }

        // Test with negative values
        logits.clear();
        logits.insert(10, -1.0);
        logits.insert(20, -5.0); // This has the smallest logit
        logits.insert(30, 0.0);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result, 20, "Should handle negative logits correctly");
    }
}