// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use dynamo_runtime::component::{Component, Instance};
use dynamo_runtime::traits::events::EventPublisher;
use rand::Rng;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;

use super::indexer::OverlapScores;
use super::protocols::WorkerSelectionResult;
use super::sequence::ActiveSequencesMultiWorker;
use super::KvRouterConfig;
use super::WorkerSelector;
use super::KV_HIT_RATE_SUBJECT;

use crate::tokens::SequenceHash;

/// Record describing how much of a request was already cached on the worker it was
/// routed to.
///
/// The scheduler emits one event per successfully scheduled request and publishes it on
/// `KV_HIT_RATE_SUBJECT` through the component's namespace.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
    /// Id of the worker the request was routed to.
    pub worker_id: i64,
    /// Number of blocks needed to hold the request's input sequence
    /// (input tokens divided by the block size, rounded up).
    pub isl_blocks: usize,
    /// Overlap score (cached blocks) recorded for the selected worker.
    pub overlap_blocks: u32,
}

#[derive(Debug, thiserror::Error)]
pub enum KvSchedulerError {
    #[error("no endpoints aviailable to route work")]
    NoEndpoints,

    #[error("all workers busy")]
    AllWorkersBusy,

    #[error("endpoint subscriber shutdown")]
    SubscriberShutdown,
}

/// Outcome of one scheduling decision, sent back to the task that submitted the request.
#[derive(Debug)]
pub struct SchedulingResponse {
    /// Id of the worker chosen for the request.
    pub best_worker_id: i64,
    /// Overlap score (cached blocks) recorded for the chosen worker.
    pub overlap_blocks: u32,
}

pub struct SchedulingRequest {
47
48
    pub request_id: String,
    pub token_seq: Vec<SequenceHash>,
49
    pub isl_tokens: usize,
50
    pub overlaps: OverlapScores,
51
52
53
54
    pub decode_blocks: HashMap<i64, usize>,
    pub prefill_tokens: HashMap<i64, usize>,
    // Option to take it out to send the response without moving the struct
    resp_tx: Option<tokio::sync::oneshot::Sender<SchedulingResponse>>,
55
56
57
}

impl SchedulingRequest {
58
59
60
61
62
63
64
65
66
    pub fn respond(&mut self, response: SchedulingResponse) {
        // Changed to &mut self
        if let Some(tx) = self.resp_tx.take() {
            // Use take() to extract the sender
            if tx.send(response).is_err() {
                tracing::error!("failed to send response to requestor");
            }
        } else {
            tracing::error!("respond called multiple times on same request");
67
68
69
70
71
72
        }
    }
}

pub struct KvScheduler {
    request_tx: tokio::sync::mpsc::Sender<SchedulingRequest>,
73
    slots: Arc<ActiveSequencesMultiWorker>,
74
75
76
77
}

impl KvScheduler {
    pub async fn start(
78
        component: Component,
79
        block_size: u32,
80
        mut instances_rx: tokio::sync::watch::Receiver<Vec<Instance>>, // Changed from ProcessedEndpoints
81
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
82
        replica_sync: bool,
83
    ) -> Result<Self, KvSchedulerError> {
84
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
85
86
        let mut instances: Vec<Instance> = instances_rx.borrow_and_update().clone();

87
        let (event_tx, event_rx) = tokio::sync::mpsc::unbounded_channel::<KVHitRateEvent>();
88
        let ns_clone = component.namespace().clone();
89
90
91
        tokio::spawn(async move {
            let mut event_rx = event_rx;
            while let Some(event) = event_rx.recv().await {
92
                if let Err(e) = ns_clone.publish(KV_HIT_RATE_SUBJECT, &event).await {
93
94
95
96
97
                    tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
                }
            }
        });

98
99
100
101
102
103
        let worker_ids: Vec<i64> = instances
            .iter()
            .map(|instance| instance.instance_id)
            .collect();
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
            component,
104
            block_size as usize,
105
            worker_ids,
106
107
            replica_sync,
        ));
108

109
        let slots_clone = slots.clone();
110
        let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
111
112
113
        // Background task to handle scheduling requests
        tokio::spawn(async move {
            let mut request_rx = request_rx;
114
            tracing::trace!("scheduler background task started");
115

116
117
118
119
            loop {
                // First, check for instance updates (non-blocking)
                match instances_rx.has_changed() {
                    Ok(true) => {
120
                        instances = instances_rx.borrow_and_update().clone();
121
122
123
124
125
                        let worker_ids: Vec<i64> = instances
                            .iter()
                            .map(|instance| instance.instance_id)
                            .collect();
                        slots_clone.update_workers(worker_ids);
126
                    }
127
128
                    Ok(false) => {
                        // No changes, continue. This is the happy path.
129
                    }
130
131
132
133
134
                    Err(_) => {
                        tracing::warn!("endpoint watch sender shutdown");
                        break;
                    }
                }
135

136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
                // Then, wait for a new request
                let Some(mut request) = request_rx.recv().await else {
                    tracing::warn!("scheduler shutdown");
                    break;
                };
                tracing::trace!("received request to be scheduled");

                let (decode_blocks, prefill_tokens) = slots_clone
                    .potential_blocks_and_tokens(
                        request.token_seq.clone(),
                        request.isl_tokens,
                        request.overlaps.clone(),
                    )
                    .await;
                request.decode_blocks = decode_blocks;
                request.prefill_tokens = prefill_tokens;

                match selector.select_worker(&instances, &request, block_size) {
                    Ok(selection) => {
                        if let Err(e) = event_tx.send(KVHitRateEvent {
                            worker_id: selection.worker_id,
                            isl_blocks: selection.required_blocks as usize,
                            overlap_blocks: selection.overlap_blocks,
                        }) {
                            tracing::warn!("Failed to send KV hit rate event: {:?}", e);
161
                        }
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194

                        let response = SchedulingResponse {
                            best_worker_id: selection.worker_id,
                            overlap_blocks: selection.overlap_blocks,
                        };
                        request.respond(response);

                        let _ = slots_clone
                            .add_request(
                                request.request_id,
                                request.token_seq,
                                request.isl_tokens,
                                selection.overlap_blocks,
                                selection.worker_id,
                            )
                            .await;

                        continue;
                    }
                    Err(KvSchedulerError::NoEndpoints) => {
                        tracing::trace!("no endpoints available; waiting for endpoints update");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    // TODO: this is not actually hooked up
                    Err(KvSchedulerError::AllWorkersBusy) => {
                        tracing::trace!("all workers busy; waiting for more capacity");
                        tokio::time::sleep(Duration::from_millis(5)).await;
                        continue;
                    }
                    Err(e) => {
                        tracing::error!("error scheduling request: {:?}", e);
                        break;
195
196
197
198
                    }
                }
            }

199
            tracing::trace!("background endpoint subscriber shutting down");
200
201
        });

202
        Ok(KvScheduler { request_tx, slots })
203
204
205
206
    }

    pub async fn schedule(
        &self,
207
        request_id: String,
208
        isl_tokens: usize,
209
        token_seq: Vec<SequenceHash>,
210
        overlaps: OverlapScores,
GuanLuo's avatar
GuanLuo committed
211
    ) -> Result<i64, KvSchedulerError> {
212
213
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
        let request = SchedulingRequest {
214
215
            request_id,
            token_seq,
216
            isl_tokens,
217
            overlaps,
218
219
220
            decode_blocks: HashMap::new(),
            prefill_tokens: HashMap::new(),
            resp_tx: Some(resp_tx), // Wrap in Some()
221
        };
222

223
224
225
226
        self.request_tx
            .send(request)
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
227
        let response = resp_rx
228
229
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?;
230

231
232
        let best_worker_id = response.best_worker_id;
        Ok(best_worker_id)
233
234
    }

235
236
237
238
239
    pub async fn mark_prefill_completed(&self, request_id: &str) {
        let _ = self
            .slots
            .mark_prefill_completed(&request_id.to_string())
            .await;
240
241
    }

242
243
    pub async fn free(&self, request_id: &str) {
        let _ = self.slots.free(&request_id.to_string()).await;
244
    }
245
246
}

// Helper function for softmax sampling
fn softmax_sample(logits: &HashMap<i64, f64>, temperature: f64) -> i64 {
    if logits.is_empty() {
        panic!("Empty logits for softmax sampling");
    }

253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
    // Guard: if temperature is 0, return the key with the smallest logit value
    if temperature == 0.0 {
        // Find the minimum logit value
        let min_logit = logits.values().fold(f64::INFINITY, |a, &b| a.min(b));

        // Collect all keys with the minimum logit value (to handle ties)
        let min_keys: Vec<_> = logits
            .iter()
            .filter(|(_, &v)| v == min_logit)
            .map(|(k, _)| *k)
            .collect();

        // Randomly select from the minimum keys (handles single key case naturally)
        let mut rng = rand::rng();
        let index = rng.random_range(0..min_keys.len());
        return min_keys[index];
    }

271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
    let keys: Vec<_> = logits.keys().copied().collect();
    let values: Vec<_> = logits.values().copied().collect();

    // Find min and max for normalization
    let min_val = values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
    let max_val = values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

    let probabilities = if min_val == max_val {
        // All values are the same, uniform probability
        vec![1.0 / keys.len() as f64; keys.len()]
    } else {
        // Normalize values
        let normalized: Vec<_> = values
            .iter()
            .map(|&v| {
                // Lower is better, so negate
287
288
                // Note we don't need to do actual min-max norm here, just off by an offset
                let norm = v / (max_val - min_val);
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
                -norm
            })
            .collect();

        // Apply temperature and softmax
        let scaled: Vec<_> = normalized.iter().map(|&v| v / temperature).collect();

        let max_scaled = scaled.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let exp_values: Vec<_> = scaled.iter().map(|&v| (v - max_scaled).exp()).collect();

        let sum_exp: f64 = exp_values.iter().sum();
        exp_values.iter().map(|&v| v / sum_exp).collect()
    };

    // Sample from the probability distribution
    let mut rng = rand::rng();
    let sample: f64 = rng.random();

    let mut cumsum = 0.0;
    for (i, &prob) in probabilities.iter().enumerate() {
        cumsum += prob;
        if sample <= cumsum {
            return keys[i];
        }
    }

    // Fallback to last key (shouldn't normally reach here)
    keys[keys.len() - 1]
}

// Default implementation matching the Python _cost_function
/// Worker selector that scores each worker from its projected prefill and decode load
/// and then samples the winner with a temperature-controlled softmax.
#[derive(Debug, Clone, Default)]
pub struct DefaultWorkerSelector {
    /// Router settings read during selection: the overlap score weight and the router
    /// temperature.
    pub kv_router_config: KvRouterConfig,
}

impl DefaultWorkerSelector {
    /// Creates a selector from `kv_router_config`, falling back to
    /// `KvRouterConfig::default()` when `None` is given.
    pub fn new(kv_router_config: Option<KvRouterConfig>) -> Self {
        let kv_router_config = kv_router_config.unwrap_or_default();
        Self { kv_router_config }
    }
}

impl WorkerSelector for DefaultWorkerSelector {
    fn select_worker(
        &self,
336
        workers: &[Instance],
337
        request: &SchedulingRequest,
338
        block_size: u32,
339
340
341
    ) -> Result<WorkerSelectionResult, KvSchedulerError> {
        assert!(request.isl_tokens > 0);

342
        if workers.is_empty() {
343
344
345
            return Err(KvSchedulerError::NoEndpoints);
        }

346
347
348
349
        let isl = request.isl_tokens;
        let request_blocks = isl.div_ceil(block_size as usize);
        let overlaps = &request.overlaps.scores;

350
351
        let decode_blocks = &request.decode_blocks;
        let prefill_tokens = &request.prefill_tokens;
352

353
        let mut worker_logits = HashMap::new();
354
        let mut max_logit = f64::NEG_INFINITY;
355

356
        // Calculate logits for each worker
357
358
        for instance in workers.iter() {
            let worker_id = instance.instance_id;
359
360
361
362
363
364
365
366
367
368
369
            let overlap = *overlaps.get(&worker_id).unwrap_or(&0);

            // this is the number of prefill tokens the worker would have if the request were scheduled there
            let prefill_token = *prefill_tokens.get(&worker_id).unwrap_or(&isl);
            let potential_prefill_block = (prefill_token as f64) / (block_size as f64);

            // this is the number of decode blocks the worker would have if the request were scheduled there
            let decode_block = *decode_blocks
                .get(&worker_id)
                .unwrap_or(&(potential_prefill_block.floor() as usize))
                as f64;
370

371
            // Calculate logit (lower is better)
372
373
            let logit =
                self.kv_router_config.overlap_score_weight * potential_prefill_block + decode_block;
374
            max_logit = max_logit.max(logit);
375

376
            worker_logits.insert(worker_id, logit);
377

378
            let overlap_weight = self.kv_router_config.overlap_score_weight;
379
            tracing::info!(
380
381
382
                "Formula for {worker_id} with {overlap} cached blocks: {logit:.3} \
                 = {overlap_weight:.1} * prefill_blocks + decode_blocks \
                 = {overlap_weight:.1} * {potential_prefill_block:.3} + {decode_block:.3}"
383
384
385
            );
        }

386
        // Use softmax sampling to select worker
387
        let temperature = self.kv_router_config.router_temperature;
388
389
390
391
        let best_worker_id = softmax_sample(&worker_logits, temperature);
        let best_logit = worker_logits[&best_worker_id];

        tracing::info!(
392
            "Selected worker: {}, logit: {:.3}",
393
394
395
            best_worker_id,
            best_logit
        );
396
397

        Ok(WorkerSelectionResult {
398
399
            worker_id: best_worker_id,
            required_blocks: request_blocks as u64,
400
            overlap_blocks: overlaps.get(&best_worker_id).copied().unwrap_or(0),
401
        })
402
403
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// With a single candidate, sampling must return it regardless of temperature or
    /// logit value.
    #[test]
    fn test_softmax_sample_single_key() {
        let mut logits = HashMap::new();
        let worker_id = 42;
        logits.insert(worker_id, 0.5); // The value doesn't matter

        // Test with different temperatures
        for temperature in &[0.1, 1.0, 10.0] {
            let result = softmax_sample(&logits, *temperature);
            assert_eq!(result, worker_id, "Should return the only available worker");
        }

        // Test with different logit values
        logits.clear();
        logits.insert(worker_id, -100.0); // Very negative value
        assert_eq!(softmax_sample(&logits, 1.0), worker_id);

        logits.clear();
        logits.insert(worker_id, 100.0); // Very positive value
        assert_eq!(softmax_sample(&logits, 1.0), worker_id);

        logits.clear();
        logits.insert(worker_id, 0.0); // Zero value
        assert_eq!(softmax_sample(&logits, 1.0), worker_id);
    }

    /// With temperature 0 the key with the smallest logit must always win.
    #[test]
    fn test_softmax_sample_zero_temperature() {
        let mut logits = HashMap::new();
        logits.insert(1, 5.0);
        logits.insert(2, 3.0); // This has the smallest logit
        logits.insert(3, 7.0);
        logits.insert(4, 3.5);

        // With temperature 0, should always return worker 2 (smallest logit)
        for _ in 0..10 {
            let result = softmax_sample(&logits, 0.0);
            assert_eq!(
                result, 2,
                "Should return worker with smallest logit when temperature is 0"
            );
        }

        // Test with negative values
        logits.clear();
        logits.insert(10, -1.0);
        logits.insert(20, -5.0); // This has the smallest logit
        logits.insert(30, 0.0);

        let result = softmax_sample(&logits, 0.0);
        assert_eq!(result, 20, "Should handle negative logits correctly");
    }

    /// With temperature 0 and several keys sharing the minimum, the result must be one
    /// of the tied keys and never a key with a larger logit.
    #[test]
    fn test_softmax_sample_zero_temperature_tie() {
        let mut logits = HashMap::new();
        logits.insert(1, 2.0);
        logits.insert(2, 2.0);
        logits.insert(3, 9.0);

        for _ in 0..20 {
            let result = softmax_sample(&logits, 0.0);
            assert!(
                result == 1 || result == 2,
                "Should pick one of the tied minimum-logit workers"
            );
        }
    }
}