scheduler.rs 20.7 KB
Newer Older
1
2
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
3

4
use crate::local_model::runtime_config::ModelRuntimeConfig;
5
use dynamo_runtime::component::{Component, Instance};
Neelay Shah's avatar
Neelay Shah committed
6
use dynamo_runtime::traits::events::EventPublisher;
7
use rand::Rng;
8
use serde::{Deserialize, Serialize};
9
use std::collections::{HashMap, HashSet};
10
11
use std::sync::Arc;
use std::time::Duration;
12
use tokio::sync::watch;
13

14
15
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
16
use super::RouterConfigOverride;
17
use super::WorkerSelector;
18
use super::indexer::OverlapScores;
19
use super::protocols::WorkerSelectionResult;
20
21
use super::sequence::ActiveSequencesMultiWorker;

22
use crate::tokens::SequenceHash;
23

24
25
26
27
/// Record of how much of a routed request overlapped with the chosen worker.
///
/// The scheduler task builds one of these after every successful worker
/// selection and a background task publishes it under `KV_HIT_RATE_SUBJECT`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
    /// Worker the request was routed to.
    pub worker_id: i64,
    /// Number of blocks the request requires (taken from the selection's `required_blocks`).
    pub isl_blocks: usize,
    /// Number of overlapping blocks reported for the chosen worker.
    pub overlap_blocks: u32,
}

31
32
33
34
35
36
37
/// Projected load on a single worker if a given request were routed to it.
///
/// Produced by `KvScheduler::get_potential_loads`, one entry per worker id
/// reported by the active-sequence tracker.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad {
    /// Worker this projection applies to.
    pub worker_id: i64,
    /// Projected prefill tokens; `get_potential_loads` falls back to the
    /// request's full token count when the tracker has no entry for the worker.
    pub potential_prefill_tokens: usize,
    /// Projected decode blocks; `get_potential_loads` falls back to 0 when the
    /// tracker has no entry for the worker.
    pub potential_decode_blocks: usize,
}

38
39
40
41
42
43
44
45
46
47
48
49
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
    #[error("no endpoints aviailable to route work")]
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
}

50
51
52
/// Answer sent back to the caller of `KvScheduler::schedule` through the
/// request's oneshot channel.
#[derive(Debug)]
pub struct SchedulingResponse {
    /// Worker chosen by the selector for this request.
    pub best_worker_id: i64,
    /// Number of overlapping blocks reported for the chosen worker.
    pub overlap_blocks: u32,
}

56
pub struct SchedulingRequest {
57
58
    pub request_id: String,
    pub token_seq: Vec<SequenceHash>,
59
    pub isl_tokens: usize,
60
    pub overlaps: OverlapScores,
61
62
    pub decode_blocks: HashMap<i64, usize>,
    pub prefill_tokens: HashMap<i64, usize>,
63
64
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
65
66
    // Whether to update scheduler states (false for query_instance_id requests)
    pub update_states: bool,
67
68
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
69
70
71
}

impl SchedulingRequest {
72
73
74
75
76
77
78
79
80
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
81
82
83
84
85
86
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
87
    slots: Arc<ActiveSequencesMultiWorker>,
88
89
90
91
}

impl KvScheduler {
    pub async fn start(
92
        component: Component,
93
        block_size: u32,
94
95
        mut instances_rx: watch::Receiver<Vec<Instance>>,
        mut runtime_configs_rx: watch::Receiver<HashMap<i64, ModelRuntimeConfig>>,
96
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
97
        replica_sync: bool,
98
    ) -> Result<Self, KvSchedulerError> {
99
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
100
        let mut instances: Vec<Instance> = instances_rx.borrow_and_update().clone();
101
102
        let mut runtime_configs: HashMap<i64, ModelRuntimeConfig> =
            runtime_configs_rx.borrow_and_update().clone();
103

104
        let (event_tx, event_rx) = tokio::sync::mpsc::unbounded_channel::<KVHitRateEvent>();
105
        let ns_clone = component.namespace().clone();
106
107
108
        tokio::spawn(async move {
            let mut event_rx = event_rx;
            while let Some(event) = event_rx.recv().await {
109
                if let Err(e) = ns_clone.publish(KV_HIT_RATE_SUBJECT, &event).await {
110
111
112
113
114
                    tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
                }
            }
        });

115
116
117
118
119
120
        let worker_ids: Vec<i64> = instances
            .iter()
            .map(|instance| instance.instance_id)
            .collect();
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
            component,
121
            block_size as usize,
122
            worker_ids,
123
124
            replica_sync,
        ));
125

126
        let slots_clone = slots.clone();
127
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
128
129
130
        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
131
            tracing::trace!("scheduler background task started");
132
133
            let mut workers_with_configs: HashMap<i64, Option<ModelRuntimeConfig>> = HashMap::new();
            let mut needs_rebuild = true;
134

135
            loop {
136
137
138
139
140
                // Check for instance updates (non-blocking)
                let instances_changed = instances_rx.has_changed();
                let configs_changed = runtime_configs_rx.has_changed();

                match instances_changed {
141
                    Ok(true) => {
142
                        instances = instances_rx.borrow_and_update().clone();
143
144
145
146
147
                        let worker_ids: Vec<i64> = instances
                            .iter()
                            .map(|instance| instance.instance_id)
                            .collect();
                        slots_clone.update_workers(worker_ids);
148
                        needs_rebuild = true;
149
                    }
150
                    Ok(false) => {}
151
152
153
154
155
                    Err(_) => {
                        tracing::warn!("endpoint watch sender shutdown");
                        break;
                    }
                }
156

157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
                // Check for runtime config updates
                match configs_changed {
                    Ok(true) => {
                        runtime_configs = runtime_configs_rx.borrow_and_update().clone();
                        needs_rebuild = true;
                    }
                    Ok(false) => {}
                    Err(_) => {
                        tracing::warn!("runtime configs watch sender shutdown");
                    }
                }

                // Rebuild workers hashmap only when needed
                if needs_rebuild {
                    workers_with_configs.clear();
                    for instance in &instances {
                        let worker_id = instance.instance_id;
                        let config = runtime_configs.get(&worker_id).cloned();
                        if config.is_none() {
                            tracing::warn!("Runtime config not found for worker_id: {}", worker_id);
                        }
                        workers_with_configs.insert(worker_id, config);
                    }
                    needs_rebuild = false;
                }

                // Wait for a new request
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

200
                match selector.select_worker(&workers_with_configs, &request, block_size) {
201
202
203
204
205
206
207
                    Ok(selection) => {
                        if let Err(e) = event_tx.send(KVHitRateEvent {
                            worker_id: selection.worker_id,
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
                        }) {
                            tracing::warn!("Failed to send KV hit rate event: {:?}", e);
208
                        }
209
210
211
212
213
214
215

                        let response = SchedulingResponse {
                            best_worker_id: selection.worker_id,
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

216
217
218
219
220
221
222
223
224
225
226
227
                        // Only update the state if update_states is true
                        if request.update_states {
                            let _ = slots_clone
                                .add_request(
                                    request.request_id,
                                    request.token_seq,
                                    request.isl_tokens,
                                    selection.overlap_blocks,
                                    selection.worker_id,
                                )
                                .await;
                        }
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244

                        continue;
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
245
246
247
248
                    }
                }
            }

249
            tracing::trace!("background endpoint subscriber shutting down");
250
251
        });

252
        Ok(KvScheduler { request_tx, slots })
253
254
255
256
    }

    pub async fn schedule(
        &self,
257
        request_id: String,
258
        isl_tokens: usize,
259
        token_seq: Vec<SequenceHash>,
260
        overlaps: OverlapScores,
261
        router_config_override: Option<&RouterConfigOverride>,
262
        update_states: bool,
GuanLuo's avatar
GuanLuo committed
263
    ) -> Result<i64, KvSchedulerError> {
264
265
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
266
267
            request_id,
            token_seq,
268
            isl_tokens,
269
            overlaps,
270
271
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
272
            router_config_override: router_config_override.cloned(),
273
            update_states,
274
            resp_tx: Some(resp_tx), // Wrap in Some()
275
        };
276

277
278
279
280
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
281
        let response = resp_rx
282
283
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
284

285
286
        let best_worker_id = response.best_worker_id;
        Ok(best_worker_id)
287
288
    }

289
290
291
292
293
294
295
296
297
298
299
300
301
302
    /// Registers a request with the active-sequence tracker for `worker_id`.
    ///
    /// All arguments are forwarded unchanged; whatever the tracker returns is
    /// discarded.
    pub async fn add_request(
        &self,
        request_id: String,
        token_sequence: Vec<SequenceHash>,
        isl: usize,
        overlap: u32,
        worker_id: i64,
    ) {
        let registration = self
            .slots
            .add_request(request_id, token_sequence, isl, overlap, worker_id);
        // The outcome is deliberately ignored.
        let _ = registration.await;
    }

303
304
305
306
307
    pub async fn mark_prefill_completed(&self, request_id: &str) {
        let _ = self
            .slots
            .mark_prefill_completed(&request_id.to_string())
            .await;
308
309
    }

310
311
    pub async fn free(&self, request_id: &str) {
        let _ = self.slots.free(&request_id.to_string()).await;
312
    }
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344

    /// Reports, for every worker the tracker knows about, the load it would
    /// carry if this request were routed to it.
    ///
    /// A worker appears in the result when it has an entry in either of the
    /// tracker's projections. A missing prefill entry defaults to `isl_tokens`
    /// and a missing decode entry to 0. The order of the returned vector is
    /// unspecified.
    pub async fn get_potential_loads(
        &self,
        token_seq: Vec<SequenceHash>,
        isl_tokens: usize,
        overlaps: OverlapScores,
    ) -> Vec<PotentialLoad> {
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens(token_seq, isl_tokens, overlaps)
            .await;

        // Union of the worker ids present in either projection.
        let known_workers: HashSet<i64> = decode_blocks
            .keys()
            .chain(prefill_tokens.keys())
            .copied()
            .collect();

        known_workers
            .into_iter()
            .map(|worker_id| PotentialLoad {
                worker_id,
                potential_prefill_tokens: prefill_tokens
                    .get(&worker_id)
                    .copied()
                    .unwrap_or(isl_tokens),
                potential_decode_blocks: decode_blocks.get(&worker_id).copied().unwrap_or(0),
            })
            .collect()
    }
345
346
}

347
348
349
350
351
352
// Helper function for softmax sampling
fn softmax_sample(logits: &HashMap<i64, f64>, temperature: f64) -> i64 {
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

353
354
355
356
357
358
359
360
    // Guard: if temperature is 0, return the key with the smallest logit value
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
361
            .filter(|&(_, &v)| v == min_logit)
362
363
364
365
366
367
368
369
370
            .map(|(k, _)| *k)
            .collect();

        // Randomly select from the minimum keys (handles single key case naturally)
        let mut rng = rand::rng();
        let index = rng.random_range(0..min_keys.len());
        return min_keys[index];
    }

371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
387
388
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
            return keys[i];
        }
    }

    // Fallback to last key (shouldn't normally reach here)
    keys[keys.len() - 1]
}

419
// Default implementation matching the Python _cost_function
420
421
422
423
424
425
426
427
428
429
430
431
/// Worker selector that scores each worker from its projected prefill and
/// decode load and then picks one with `softmax_sample`.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    /// Router settings. `select_worker` reads `overlap_score_weight` and
    /// `router_temperature` from here when the request carries no override.
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    /// Creates a selector from `kv_router_config`, using
    /// `KvRouterConfig::default()` when `None` is given.
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        let kv_router_config = kv_router_config.unwrap_or_default();
        Self { kv_router_config }
    }
}
432
433
434
435

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
436
        workers: &HashMap<i64, Option<ModelRuntimeConfig>>,
437
        request: &SchedulingRequest,
438
        block_size: u32,
439
440
441
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

442
        if workers.is_empty() {
443
444
445
            return Err(KvSchedulerError::NoEndpoints);
        }

446
447
448
449
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

450
451
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
452

453
        let mut worker_logits = HashMap::new();
454
        let mut max_logit = f64::NEG_INFINITY;
455

456
        // Calculate logits for each worker
457
458
        for worker_id in workers.keys() {
            let overlap = *overlaps.get(worker_id).unwrap_or(&0);
459
460

            // this is the number of prefill tokens the worker would have if the request were scheduled there
461
            let prefill_token = *prefill_tokens.get(worker_id).unwrap_or(&isl);
462
463
464
465
            let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

            // this is the number of decode blocks the worker would have if the request were scheduled there
            let decode_block = *decode_blocks
466
                .get(worker_id)
467
468
                .unwrap_or(&(potential_prefill_block.floor() as usize))
                as f64;
469

470
471
472
473
474
475
476
            // Use override if provided, otherwise use default config
            let overlap_weight = request
                .router_config_override
                .as_ref()
                .and_then(|cfg| cfg.overlap_score_weight)
                .unwrap_or(self.kv_router_config.overlap_score_weight);

477
            // Calculate logit (lower is better)
478
            let logit = overlap_weight * potential_prefill_block + decode_block;
479
            max_logit = max_logit.max(logit);
480

481
            worker_logits.insert(*worker_id, logit);
482
483

            tracing::info!(
484
485
486
                "Formula for {worker_id} with {overlap} cached blocks: {logit:.3} \
                 = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                 = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}"
487
488
489
            );
        }

490
        // Use softmax sampling to select worker
491
492
493
494
495
496
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
497
498
499
        let best_worker_id = softmax_sample(&worker_logits, temperature);
        let best_logit = worker_logits[&best_worker_id];

500
501
502
503
504
505
506
507
        let best_overlap = *overlaps.get(&best_worker_id).unwrap_or(&0);
        let total_blocks_info = workers
            .get(&best_worker_id)
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

508
        tracing::info!(
509
            "Selected worker: {}, logit: {:.3}, cached blocks: {}{}",
510
            best_worker_id,
511
512
513
            best_logit,
            best_overlap,
            total_blocks_info
514
        );
515
516

        Ok(WorkerSelectionResult {
517
518
            worker_id: best_worker_id,
            required_blocks: request_blocks as u64,
519
            overlap_blocks: overlaps.get(&best_worker_id).copied().unwrap_or(0),
520
        })
521
522
    }
}
523
524
525
526
527

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a logits map holding exactly one worker.
    fn single_worker_logits(worker_id: i64, logit: f64) -> HashMap<i64, f64> {
        HashMap::from([(worker_id, logit)])
    }

    #[test]
    fn test_softmax_sample_single_key() {
        // With only one key in the map, that key must come back no matter what.
        let worker_id = 42;

        // The temperature must not matter (the logit value is arbitrary).
        let logits = single_worker_logits(worker_id, 0.5);
        for temperature in [0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, temperature);
            assert_eq!(result, worker_id, "Should return the only available worker");
        }

        // The logit value must not matter either: very negative, very positive, zero.
        for logit in [-100.0, 100.0, 0.0] {
            let logits = single_worker_logits(worker_id, logit);
            assert_eq!(softmax_sample(&logits, 1.0), worker_id);
        }
    }

    #[test]
    fn test_softmax_sample_zero_temperature() {
        // At temperature 0 the key with the smallest logit wins; here that is worker 2.
        let logits: HashMap<i64, f64> = HashMap::from([(1, 5.0), (2, 3.0), (3, 7.0), (4, 3.5)]);

        // Repeat to make sure no randomness leaks in when there is a unique minimum.
        for _ in 0..10 {
            let result = softmax_sample(&logits, 0.0);
            assert_eq!(
                result, 2,
                "Should return worker with smallest logit when temperature is 0"
            );
        }

        // Negative logits: worker 20 holds the smallest value.
        let logits: HashMap<i64, f64> = HashMap::from([(10, -1.0), (20, -5.0), (30, 0.0)]);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result, 20, "Should handle negative logits correctly");
    }
}