scheduler.rs 18.8 KB
Newer Older
1
2
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
3

4
use crate::local_model::runtime_config::ModelRuntimeConfig;
5
use dynamo_runtime::component::{Component, Instance};
Neelay Shah's avatar
Neelay Shah committed
6
use dynamo_runtime::traits::events::EventPublisher;
7
use rand::Rng;
8
use serde::{Deserialize, Serialize};
9
use std::collections::HashMap;
10
11
use std::sync::Arc;
use std::time::Duration;
12
use tokio::sync::watch;
13

14
15
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig;
16
use super::RouterConfigOverride;
17
use super::WorkerSelector;
18
use super::indexer::OverlapScores;
19
use super::protocols::WorkerSelectionResult;
20
21
use super::sequence::ActiveSequencesMultiWorker;

22
use crate::tokens::SequenceHash;
23

24
25
26
27
/// Event describing one routing decision: which worker was chosen and how
/// many of the request's blocks that worker already had cached.
///
/// The scheduler emits one of these per successfully scheduled request and
/// publishes it on `KV_HIT_RATE_SUBJECT`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
    /// Id of the worker the request was routed to.
    pub worker_id: i64,
    /// Number of blocks the request needs (the selection's `required_blocks`).
    pub isl_blocks: usize,
    /// Number of blocks reported as already present on the selected worker.
    pub overlap_blocks: u32,
}

31
32
33
34
35
36
37
38
39
40
41
42
#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
    #[error("no endpoints aviailable to route work")]
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
}

43
44
45
/// Outcome of scheduling a single request, sent back to the caller that
/// submitted it.
#[derive(Debug)]
pub struct SchedulingResponse {
    /// Id of the worker selected for the request.
    pub best_worker_id: i64,
    /// Number of blocks reported as already cached on the selected worker.
    pub overlap_blocks: u32,
}

49
pub struct SchedulingRequest {
50
51
    pub request_id: String,
    pub token_seq: Vec<SequenceHash>,
52
    pub isl_tokens: usize,
53
    pub overlaps: OverlapScores,
54
55
    pub decode_blocks: HashMap<i64, usize>,
    pub prefill_tokens: HashMap<i64, usize>,
56
57
    // Router config overrides for this specific request
    pub router_config_override: Option<RouterConfigOverride>,
58
59
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
60
61
62
}

impl SchedulingRequest {
63
64
65
66
67
68
69
70
71
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
72
73
74
75
76
77
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
78
    slots: Arc<ActiveSequencesMultiWorker>,
79
80
81
82
}

impl KvScheduler {
    pub async fn start(
83
        component: Component,
84
        block_size: u32,
85
86
        mut instances_rx: watch::Receiver<Vec<Instance>>,
        mut runtime_configs_rx: watch::Receiver<HashMap<i64, ModelRuntimeConfig>>,
87
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
88
        replica_sync: bool,
89
    ) -> Result<Self, KvSchedulerError> {
90
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
91
        let mut instances: Vec<Instance> = instances_rx.borrow_and_update().clone();
92
93
        let mut runtime_configs: HashMap<i64, ModelRuntimeConfig> =
            runtime_configs_rx.borrow_and_update().clone();
94

95
        let (event_tx, event_rx) = tokio::sync::mpsc::unbounded_channel::<KVHitRateEvent>();
96
        let ns_clone = component.namespace().clone();
97
98
99
        tokio::spawn(async move {
            let mut event_rx = event_rx;
            while let Some(event) = event_rx.recv().await {
100
                if let Err(e) = ns_clone.publish(KV_HIT_RATE_SUBJECT, &event).await {
101
102
103
104
105
                    tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
                }
            }
        });

106
107
108
109
110
111
        let worker_ids: Vec<i64> = instances
            .iter()
            .map(|instance| instance.instance_id)
            .collect();
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
            component,
112
            block_size as usize,
113
            worker_ids,
114
115
            replica_sync,
        ));
116

117
        let slots_clone = slots.clone();
118
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
119
120
121
        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
122
            tracing::trace!("scheduler background task started");
123
124
            let mut workers_with_configs: HashMap<i64, Option<ModelRuntimeConfig>> = HashMap::new();
            let mut needs_rebuild = true;
125

126
            loop {
127
128
129
130
131
                // Check for instance updates (non-blocking)
                let instances_changed = instances_rx.has_changed();
                let configs_changed = runtime_configs_rx.has_changed();

                match instances_changed {
132
                    Ok(true) => {
133
                        instances = instances_rx.borrow_and_update().clone();
134
135
136
137
138
                        let worker_ids: Vec<i64> = instances
                            .iter()
                            .map(|instance| instance.instance_id)
                            .collect();
                        slots_clone.update_workers(worker_ids);
139
                        needs_rebuild = true;
140
                    }
141
                    Ok(false) => {}
142
143
144
145
146
                    Err(_) => {
                        tracing::warn!("endpoint watch sender shutdown");
                        break;
                    }
                }
147

148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
                // Check for runtime config updates
                match configs_changed {
                    Ok(true) => {
                        runtime_configs = runtime_configs_rx.borrow_and_update().clone();
                        needs_rebuild = true;
                    }
                    Ok(false) => {}
                    Err(_) => {
                        tracing::warn!("runtime configs watch sender shutdown");
                    }
                }

                // Rebuild workers hashmap only when needed
                if needs_rebuild {
                    workers_with_configs.clear();
                    for instance in &instances {
                        let worker_id = instance.instance_id;
                        let config = runtime_configs.get(&worker_id).cloned();
                        if config.is_none() {
                            tracing::warn!("Runtime config not found for worker_id: {}", worker_id);
                        }
                        workers_with_configs.insert(worker_id, config);
                    }
                    needs_rebuild = false;
                }

                // Wait for a new request
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

191
                match selector.select_worker(&workers_with_configs, &request, block_size) {
192
193
194
195
196
197
198
                    Ok(selection) => {
                        if let Err(e) = event_tx.send(KVHitRateEvent {
                            worker_id: selection.worker_id,
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
                        }) {
                            tracing::warn!("Failed to send KV hit rate event: {:?}", e);
199
                        }
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232

                        let response = SchedulingResponse {
                            best_worker_id: selection.worker_id,
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

                        let _ = slots_clone
                            .add_request(
                                request.request_id,
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
                                selection.worker_id,
                            )
                            .await;

                        continue;
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
233
234
235
236
                    }
                }
            }

237
            tracing::trace!("background endpoint subscriber shutting down");
238
239
        });

240
        Ok(KvScheduler { request_tx, slots })
241
242
243
244
    }

    pub async fn schedule(
        &self,
245
        request_id: String,
246
        isl_tokens: usize,
247
        token_seq: Vec<SequenceHash>,
248
        overlaps: OverlapScores,
249
        router_config_override: Option<&RouterConfigOverride>,
GuanLuo's avatar
GuanLuo committed
250
    ) -> Result<i64, KvSchedulerError> {
251
252
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
253
254
            request_id,
            token_seq,
255
            isl_tokens,
256
            overlaps,
257
258
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
259
            router_config_override: router_config_override.cloned(),
260
            resp_tx: Some(resp_tx), // Wrap in Some()
261
        };
262

263
264
265
266
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
267
        let response = resp_rx
268
269
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
270

271
272
        let best_worker_id = response.best_worker_id;
        Ok(best_worker_id)
273
274
    }

275
276
277
278
279
    pub async fn mark_prefill_completed(&self, request_id: &str) {
        let _ = self
            .slots
            .mark_prefill_completed(&request_id.to_string())
            .await;
280
281
    }

282
283
    pub async fn free(&self, request_id: &str) {
        let _ = self.slots.free(&request_id.to_string()).await;
284
    }
285
286
}

287
288
289
290
291
292
// Helper function for softmax sampling
fn softmax_sample(logits: &HashMap<i64, f64>, temperature: f64) -> i64 {
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

293
294
295
296
297
298
299
300
    // Guard: if temperature is 0, return the key with the smallest logit value
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
301
            .filter(|&(_, &v)| v == min_logit)
302
303
304
305
306
307
308
309
310
            .map(|(k, _)| *k)
            .collect();

        // Randomly select from the minimum keys (handles single key case naturally)
        let mut rng = rand::rng();
        let index = rng.random_range(0..min_keys.len());
        return min_keys[index];
    }

311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
327
328
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
            return keys[i];
        }
    }

    // Fallback to last key (shouldn't normally reach here)
    keys[keys.len() - 1]
}

359
// Default implementation matching the Python _cost_function
360
361
362
363
364
365
366
367
368
369
370
371
/// Built-in worker selection strategy, used by the scheduler when the caller
/// does not supply its own selector.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    /// Router settings read during selection (overlap score weight and
    /// router temperature), unless a request carries its own override.
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        Self {
            kv_router_config: kv_router_config.unwrap_or_default(),
        }
    }
}
372
373
374
375

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
376
        workers: &HashMap<i64, Option<ModelRuntimeConfig>>,
377
        request: &SchedulingRequest,
378
        block_size: u32,
379
380
381
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

382
        if workers.is_empty() {
383
384
385
            return Err(KvSchedulerError::NoEndpoints);
        }

386
387
388
389
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

390
391
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
392

393
        let mut worker_logits = HashMap::new();
394
        let mut max_logit = f64::NEG_INFINITY;
395

396
        // Calculate logits for each worker
397
398
        for worker_id in workers.keys() {
            let overlap = *overlaps.get(worker_id).unwrap_or(&0);
399
400

            // this is the number of prefill tokens the worker would have if the request were scheduled there
401
            let prefill_token = *prefill_tokens.get(worker_id).unwrap_or(&isl);
402
403
404
405
            let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

            // this is the number of decode blocks the worker would have if the request were scheduled there
            let decode_block = *decode_blocks
406
                .get(worker_id)
407
408
                .unwrap_or(&(potential_prefill_block.floor() as usize))
                as f64;
409

410
411
412
413
414
415
416
            // Use override if provided, otherwise use default config
            let overlap_weight = request
                .router_config_override
                .as_ref()
                .and_then(|cfg| cfg.overlap_score_weight)
                .unwrap_or(self.kv_router_config.overlap_score_weight);

417
            // Calculate logit (lower is better)
418
            let logit = overlap_weight * potential_prefill_block + decode_block;
419
            max_logit = max_logit.max(logit);
420

421
            worker_logits.insert(*worker_id, logit);
422
423

            tracing::info!(
424
425
426
                "Formula for {worker_id} with {overlap} cached blocks: {logit:.3} \
                 = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                 = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}"
427
428
429
            );
        }

430
        // Use softmax sampling to select worker
431
432
433
434
435
436
        // Use override if provided, otherwise use default config
        let temperature = request
            .router_config_override
            .as_ref()
            .and_then(|cfg| cfg.router_temperature)
            .unwrap_or(self.kv_router_config.router_temperature);
437
438
439
        let best_worker_id = softmax_sample(&worker_logits, temperature);
        let best_logit = worker_logits[&best_worker_id];

440
441
442
443
444
445
446
447
        let best_overlap = *overlaps.get(&best_worker_id).unwrap_or(&0);
        let total_blocks_info = workers
            .get(&best_worker_id)
            .and_then(|cfg| cfg.as_ref())
            .and_then(|cfg| cfg.total_kv_blocks)
            .map(|blocks| format!(", total blocks: {}", blocks))
            .unwrap_or_default();

448
        tracing::info!(
449
            "Selected worker: {}, logit: {:.3}, cached blocks: {}{}",
450
            best_worker_id,
451
452
453
            best_logit,
            best_overlap,
            total_blocks_info
454
        );
455
456

        Ok(WorkerSelectionResult {
457
458
            worker_id: best_worker_id,
            required_blocks: request_blocks as u64,
459
            overlap_blocks: overlaps.get(&best_worker_id).copied().unwrap_or(0),
460
        })
461
462
    }
}
463
464
465
466
467

#[cfg(test)]
mod tests {
    use super::*;

    /// With a single candidate, sampling must return it regardless of the
    /// temperature or the logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let mut logits = HashMap::new();
        let worker_id = 42;
        logits.insert(worker_id, 0.5); // The value doesn't matter

        // Test with different temperatures
        for temperature in &[0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, *temperature);
            assert_eq!(result, worker_id, "Should return the only available worker");
        }

        // Test with different logit values
        logits.clear();
        logits.insert(worker_id, -100.0); // Very negative value
        assert_eq!(softmax_sample(&logits, 1.0), worker_id);

        logits.clear();
        logits.insert(worker_id, 100.0); // Very positive value
        assert_eq!(softmax_sample(&logits, 1.0), worker_id);

        logits.clear();
        logits.insert(worker_id, 0.0); // Zero value
        assert_eq!(softmax_sample(&logits, 1.0), worker_id);
    }

    /// With temperature 0 the selection is greedy: the key with the smallest
    /// logit is always returned.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let mut logits = HashMap::new();
        logits.insert(1, 5.0);
        logits.insert(2, 3.0); // This has the smallest logit
        logits.insert(3, 7.0);
        logits.insert(4, 3.5);

        // With temperature 0, should always return worker 2 (smallest logit)
        for _ in 0..10 {
            let result = softmax_sample(&logits, 0.0);
            assert_eq!(
                result, 2,
                "Should return worker with smallest logit when temperature is 0"
            );
        }

        // Test with negative values
        logits.clear();
        logits.insert(10, -1.0);
        logits.insert(20, -5.0); // This has the smallest logit
        logits.insert(30, 0.0);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result, 20, "Should handle negative logits correctly");
    }

    /// With temperature 0 and several keys sharing the smallest logit, the
    /// result must be one of the tied keys and never a worse one.
    #[test]
    fn test_softmax_sample_zero_temperature_ties() {
        let mut logits = HashMap::new();
        logits.insert(1, 2.0);
        logits.insert(2, 2.0);
        logits.insert(3, 5.0);

        for _ in 0..20 {
            let result = softmax_sample(&logits, 0.0);
            assert!(
                result == 1 || result == 2,
                "Should only return a worker tied for the smallest logit"
            );
        }
    }
}