"examples/offline_inference/vision_language_pooling.py" did not exist on "e7523c2e031bc96740723ab63833d1cf94229ab4"
local.rs 34.5 KB
Newer Older
1
2
3
4
5
6
7
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::time::Duration;

8
9
use rustc_hash::{FxHashMap, FxHashSet};
use tokio::sync::watch;
10
use tokio::time::Instant;
11
12
13
use tokio_util::sync::CancellationToken;

use super::policy::{RouterSchedulingPolicy, SchedulingPolicy};
14
use super::prefill_load::PrefillLoadEstimator;
15
16
use super::queue::SchedulerQueue;
use super::selector::{DefaultWorkerSelector, WorkerSelector};
17
18
19
20
use super::types::{
    KvSchedulerError, PotentialLoad, SchedulingRequest, SchedulingResponse, TierOverlapBlocks,
};
use crate::protocols::{WorkerConfigLike, WorkerId, WorkerWithDpRank};
21
22
23
24
25
26
27
28
29
30
31
32
33
34
use crate::sequences::{
    ActiveSequencesMultiWorker, SequenceError, SequencePublisher, SequenceRequest,
};
use dynamo_tokens::SequenceHash;

pub struct LocalScheduler<P, C, S = RouterSchedulingPolicy, Sel = DefaultWorkerSelector>
where
    P: SequencePublisher,
    C: WorkerConfigLike,
    S: SchedulingPolicy,
    Sel: WorkerSelector<C>,
{
    slots: Arc<ActiveSequencesMultiWorker<P>>,
    queue: Arc<SchedulerQueue<P, C, S, Sel>>,
35
    queue_updates: watch::Sender<()>,
36
    track_prefill_tokens_default: bool,
37
38
39
40
41
42
43
44
45
46
    worker_type: &'static str,
}

impl<P, C, S, Sel> LocalScheduler<P, C, S, Sel>
where
    P: SequencePublisher + 'static,
    C: WorkerConfigLike + Clone + PartialEq + Send + Sync + 'static,
    S: SchedulingPolicy + 'static,
    Sel: WorkerSelector<C> + Send + Sync + 'static,
{
47
48
49
50
51
52
53
54
55
56
57
58
    /// Builds the per-worker data-parallel layout from the worker configs.
    ///
    /// Every worker id maps to `(data_parallel_start_rank, data_parallel_size)` as reported
    /// by its config.
    fn worker_dp_range(workers: &HashMap<WorkerId, C>) -> HashMap<WorkerId, (u32, u32)> {
        let mut dp_range = HashMap::with_capacity(workers.len());
        for (&worker_id, config) in workers {
            let start_rank = config.data_parallel_start_rank();
            let size = config.data_parallel_size();
            dp_range.insert(worker_id, (start_rank, size));
        }
        dp_range
    }

59
60
61
62
63
64
65
66
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        slots: Arc<ActiveSequencesMultiWorker<P>>,
        workers_with_configs: watch::Receiver<HashMap<WorkerId, C>>,
        threshold_frac: Option<f64>,
        block_size: u32,
        selector: Sel,
        policy: S,
67
68
        prefill_load_estimator: Option<Arc<dyn PrefillLoadEstimator>>,
        recheck_interval: Duration,
69
        track_prefill_tokens_default: bool,
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
        cancellation_token: CancellationToken,
        worker_type: &'static str,
        monitor_worker_configs: bool,
    ) -> Self {
        if monitor_worker_configs {
            let slots_monitor = Arc::clone(&slots);
            let mut monitor_rx = workers_with_configs.clone();
            let mut last_workers = monitor_rx.borrow().clone();
            let monitor_cancel_token = cancellation_token.clone();
            tokio::spawn(async move {
                tracing::trace!("LocalScheduler workers monitoring task started");

                loop {
                    tokio::select! {
                        _ = monitor_cancel_token.cancelled() => {
                            tracing::trace!("LocalScheduler workers monitoring task shutting down");
                            break;
                        }
                        result = monitor_rx.changed() => {
                            if result.is_err() {
                                tracing::warn!("LocalScheduler worker config watch dropped, shutting down");
                                break;
                            }
                        }
                    }

                    let current_workers = monitor_rx.borrow_and_update().clone();
                    if current_workers == last_workers {
                        continue;
                    }

101
                    let dp_range = Self::worker_dp_range(&current_workers);
102
103
104
105
106
107
108
109
110
111
112
113
114
                    slots_monitor.update_workers(&dp_range);
                    last_workers = current_workers;
                }
            });
        }

        let queue = Arc::new(SchedulerQueue::new(
            Arc::clone(&slots),
            workers_with_configs,
            threshold_frac,
            block_size,
            selector,
            policy,
115
            prefill_load_estimator,
116
        ));
117
118
        let (queue_updates, _) = watch::channel(());
        let queue_remote_updates = Arc::clone(&queue);
119
        let queue_periodic_updates = Arc::clone(&queue);
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
        let mut remote_state_updates = slots.subscribe_remote_state_changes();
        let remote_update_cancel_token = cancellation_token.clone();
        let queue_updates_remote = queue_updates.clone();

        tokio::spawn(async move {
            tracing::trace!("LocalScheduler remote state listener started");

            loop {
                tokio::select! {
                    _ = remote_update_cancel_token.cancelled() => {
                        tracing::trace!("LocalScheduler remote state listener shutting down");
                        break;
                    }
                    result = remote_state_updates.changed() => {
                        if result.is_err() {
                            tracing::trace!("LocalScheduler remote state listener shutting down");
                            break;
                        }
                        queue_remote_updates.update().await;
                        let _ = queue_updates_remote.send(());
                    }
                }
            }
        });
144
145

        tokio::spawn(async move {
146
            let mut recheck_interval = tokio::time::interval(recheck_interval);
147
            tracing::trace!("LocalScheduler periodic queue update task started");
148
149
150
151

            loop {
                tokio::select! {
                    _ = cancellation_token.cancelled() => {
152
                        tracing::trace!("LocalScheduler periodic queue update task shutting down");
153
154
155
                        break;
                    }
                    _ = recheck_interval.tick() => {
156
                        queue_periodic_updates.update().await;
157
158
159
160
161
162
163
164
                    }
                }
            }
        });

        Self {
            slots,
            queue,
165
            queue_updates,
166
            track_prefill_tokens_default,
167
168
169
170
171
172
173
174
175
176
            worker_type,
        }
    }

    #[expect(clippy::too_many_arguments)]
    pub async fn schedule(
        &self,
        maybe_request_id: Option<String>,
        isl_tokens: usize,
        token_seq: Option<Vec<SequenceHash>>,
177
178
179
180
        tier_overlap_blocks: TierOverlapBlocks,
        effective_overlap_blocks: HashMap<WorkerWithDpRank, f64>,
        effective_cached_tokens: HashMap<WorkerWithDpRank, usize>,
        tree_sizes: HashMap<WorkerWithDpRank, usize>,
181
182
183
184
185
        router_config_override: Option<&super::config::RouterConfigOverride>,
        update_states: bool,
        lora_name: Option<String>,
        priority_jump: f64,
        expected_output_tokens: Option<u32>,
186
        pinned_worker: Option<WorkerWithDpRank>,
187
        allowed_worker_ids: Option<HashSet<WorkerId>>,
188
        shared_cache_hits: Option<crate::SharedCacheHits>,
189
190
    ) -> Result<SchedulingResponse, KvSchedulerError> {
        let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
191
192
193
        let track_prefill_tokens = router_config_override
            .and_then(|cfg| cfg.track_prefill_tokens)
            .unwrap_or(self.track_prefill_tokens_default);
194
195
196
197
        let request = SchedulingRequest {
            maybe_request_id,
            token_seq,
            isl_tokens,
198
199
200
201
            tier_overlap_blocks,
            effective_overlap_blocks,
            effective_cached_tokens,
            tree_sizes,
202
203
            decode_blocks: FxHashMap::default(),
            prefill_tokens: FxHashMap::default(),
204
            track_prefill_tokens,
205
206
207
208
209
            router_config_override: router_config_override.cloned(),
            update_states,
            lora_name,
            priority_jump,
            expected_output_tokens,
210
            pinned_worker,
211
            allowed_worker_ids,
212
            shared_cache_hits,
213
214
215
            resp_tx: Some(resp_tx),
        };

216
        self.queue.enqueue(request).await;
217
218
219
220
221
222
223
224
225
226
227

        resp_rx
            .await
            .map_err(|_| KvSchedulerError::SubscriberShutdown)?
    }

    /// Forwards `worker_ids` to the pending queue's `register_workers`.
    pub fn register_workers(&self, worker_ids: &HashSet<WorkerId>) {
        self.queue.register_workers(worker_ids);
    }

    pub async fn add_request(&self, req: SequenceRequest) -> Result<(), SequenceError> {
228
        self.slots.add_request(req, Instant::now())
229
230
231
    }

    /// Marks the prefill phase of `request_id` as completed, then updates the pending queue.
    ///
    /// # Errors
    ///
    /// Returns the tracker's [`SequenceError`] without touching the queue if the tracker
    /// rejects the update.
    pub async fn mark_prefill_completed(&self, request_id: &str) -> Result<(), SequenceError> {
        let owned_id = request_id.to_string();
        let now = Instant::now();
        self.slots.mark_prefill_completed(&owned_id, now)?;

        self.queue.update().await;
        Ok(())
    }

    /// Releases `request_id` from the active-sequence tracker, then updates the pending queue.
    ///
    /// # Errors
    ///
    /// Returns the tracker's [`SequenceError`] without touching the queue if the tracker
    /// rejects the release.
    pub async fn free(&self, request_id: &str) -> Result<(), SequenceError> {
        let owned_id = request_id.to_string();
        let now = Instant::now();
        self.slots.free(&owned_id, now)?;

        self.queue.update().await;
        Ok(())
    }

    /// Returns the pending-request count reported by the queue.
    pub fn pending_count(&self) -> usize {
        self.queue.pending_count()
    }

248
249
250
251
    /// Returns the queue's `pending_isl_tokens()` value
    /// (presumably the summed input-sequence length of pending requests; confirm in `SchedulerQueue`).
    pub fn pending_isl_tokens(&self) -> usize {
        self.queue.pending_isl_tokens()
    }

252
253
254
255
    /// Returns the static worker-type label supplied at construction.
    pub fn worker_type(&self) -> &'static str {
        self.worker_type
    }

256
257
258
259
    /// Returns a new receiver on the queue-update watch channel.
    pub fn subscribe_queue_updates(&self) -> watch::Receiver<()> {
        self.queue_updates.subscribe()
    }

260
261
262
263
264
265
266
267
268
269
270
271
272
    /// Forwards an output-block addition for `request_id` to the active-sequence tracker.
    ///
    /// `decay_fraction` is passed through unchanged; its meaning is defined by the tracker.
    ///
    /// # Errors
    ///
    /// Propagates any [`SequenceError`] returned by the tracker.
    pub fn add_output_block(
        &self,
        request_id: &str,
        decay_fraction: Option<f64>,
    ) -> Result<(), SequenceError> {
        self.slots
            .add_output_block(&request_id.to_string(), decay_fraction)
    }

    pub fn get_potential_loads(
        &self,
        token_seq: Option<Vec<SequenceHash>>,
        isl_tokens: usize,
273
        effective_cached_tokens: HashMap<WorkerWithDpRank, usize>,
274
        track_prefill_tokens: bool,
275
    ) -> Vec<PotentialLoad> {
276
        let decay_now = Instant::now();
277
278
279
280
281
        let (decode_blocks, prefill_tokens) = self
            .slots
            .potential_blocks_and_tokens_with_prefill_tracking(
                token_seq.as_deref(),
                isl_tokens,
282
                effective_cached_tokens,
283
                track_prefill_tokens,
284
                decay_now,
285
            );
286

287
        let mut workers: FxHashSet<WorkerWithDpRank> = FxHashSet::default();
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
        workers.extend(decode_blocks.keys().copied());
        workers.extend(prefill_tokens.keys().copied());

        let mut loads = Vec::with_capacity(workers.len());
        for worker in workers {
            loads.push(PotentialLoad {
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
                potential_prefill_tokens: prefill_tokens
                    .get(&worker)
                    .copied()
                    .unwrap_or(isl_tokens),
                potential_decode_blocks: decode_blocks.get(&worker).copied().unwrap_or(0),
            });
        }

        loads
    }

    pub fn get_active_lora_counts(&self) -> HashMap<String, usize> {
        self.slots.get_active_lora_counts()
    }
}

#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;
    use std::time::Duration;

318
    use tokio::sync::{mpsc, watch};
319
320

    use super::*;
321
    use crate::protocols::{ActiveSequenceEvent, ActiveSequenceEventData};
322
    use crate::scheduling::PrefillLoadEstimator;
323
324
    use crate::scheduling::policy::FcfsPolicy;
    use crate::scheduling::selector::DefaultWorkerSelector;
325
    use crate::sequences::SequenceSubscriber;
326
327
    use crate::test_utils::{NoopSequencePublisher, SimpleWorkerConfig};

328
329
330
331
332
333
334
335
336
337
    /// Test subscriber that yields events pushed through an unbounded channel.
    struct TestSequenceSubscriber {
        // Receiving half; the test keeps the sender to inject events.
        rx: mpsc::UnboundedReceiver<ActiveSequenceEvent>,
    }

    impl SequenceSubscriber for TestSequenceSubscriber {
        /// Waits for the next injected event; yields `None` once the channel is closed.
        /// Events are always wrapped in `Ok`.
        async fn next_event(&mut self) -> Option<anyhow::Result<ActiveSequenceEvent>> {
            self.rx.recv().await.map(Ok)
        }
    }

338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
    /// Test estimator that always predicts the same prefill duration.
    struct FixedPrefillLoadEstimator {
        // Duration returned for every prediction.
        duration: Duration,
    }

    impl PrefillLoadEstimator for FixedPrefillLoadEstimator {
        /// Ignores all inputs and returns the configured duration.
        fn predict_prefill_duration(
            &self,
            _batch_size: usize,
            _effective_isl: usize,
            _prefix: usize,
        ) -> anyhow::Result<Duration> {
            Ok(self.duration)
        }
    }

353
354
355
356
357
    #[allow(clippy::type_complexity)]
    fn make_scheduler(
        workers: HashMap<WorkerId, SimpleWorkerConfig>,
        threshold_frac: Option<f64>,
        monitor_worker_configs: bool,
358
        prefill_load_estimator: Option<Arc<dyn PrefillLoadEstimator>>,
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
    ) -> (
        Arc<LocalScheduler<NoopSequencePublisher, SimpleWorkerConfig, FcfsPolicy>>,
        Arc<ActiveSequencesMultiWorker<NoopSequencePublisher>>,
        watch::Sender<HashMap<WorkerId, SimpleWorkerConfig>>,
        CancellationToken,
    ) {
        let dp_range = workers
            .iter()
            .map(|(&id, cfg)| (id, (cfg.data_parallel_start_rank, cfg.data_parallel_size)))
            .collect();
        let slots = Arc::new(ActiveSequencesMultiWorker::new(
            NoopSequencePublisher,
            64,
            dp_range,
            false,
            0,
            "test",
        ));
        let (cfg_tx, cfg_rx) = watch::channel(workers);
        let cancel_token = CancellationToken::new();
        let scheduler = Arc::new(LocalScheduler::new(
            Arc::clone(&slots),
            cfg_rx,
            threshold_frac,
            64,
            DefaultWorkerSelector::new(None, "test"),
            FcfsPolicy,
386
387
            prefill_load_estimator,
            Duration::from_secs(60),
388
            true,
389
390
391
392
393
394
395
            cancel_token.clone(),
            "test",
            monitor_worker_configs,
        ));
        (scheduler, slots, cfg_tx, cancel_token)
    }

396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
    /// Starts replica sync on `slots` with a channel-backed subscriber and returns the
    /// sender used to inject events.
    fn start_replica_sync(
        slots: &Arc<ActiveSequencesMultiWorker<NoopSequencePublisher>>,
        cancel_token: &CancellationToken,
    ) -> mpsc::UnboundedSender<ActiveSequenceEvent> {
        let (event_tx, event_rx) = mpsc::unbounded_channel();
        let subscriber = TestSequenceSubscriber { rx: event_rx };
        slots.start_replica_sync(subscriber, cancel_token.clone());
        event_tx
    }

    /// Polls every 5 ms until the scheduler reports exactly `expected` pending requests.
    ///
    /// Panics if that does not happen within 250 ms.
    async fn wait_for_pending_count(
        scheduler: &Arc<LocalScheduler<NoopSequencePublisher, SimpleWorkerConfig, FcfsPolicy>>,
        expected: usize,
    ) {
        let poll_until_match = async {
            while scheduler.pending_count() != expected {
                tokio::time::sleep(Duration::from_millis(5)).await;
            }
        };

        tokio::time::timeout(Duration::from_millis(250), poll_until_match)
            .await
            .unwrap();
    }

421
422
423
424
425
426
427
428
429
430
    #[tokio::test]
    async fn test_schedule_books_request_into_active_sequences() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        );
431
        let (scheduler, _slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true, None);
432
433
434
435
436
437

        let response = scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![1, 2, 3, 4]),
438
439
440
441
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
442
443
444
445
446
447
                None,
                true,
                Some("adapter-a".to_string()),
                0.0,
                None,
                None,
448
                None,
449
                None,
450
451
452
453
454
455
456
457
458
459
460
461
462
            )
            .await
            .unwrap();

        assert_eq!(response.best_worker.worker_id, 0);
        assert_eq!(
            scheduler.get_active_lora_counts(),
            HashMap::from([(String::from("adapter-a"), 1)])
        );

        cancel_token.cancel();
    }

463
464
465
466
467
468
469
470
471
472
    #[tokio::test]
    async fn test_schedule_override_can_disable_prefill_tracking() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        );
473
        let (scheduler, slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true, None);
474
475
476
477
478
479

        scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![1, 2, 3, 4]),
480
481
482
483
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
484
485
486
487
488
489
490
491
492
                Some(&crate::config::RouterConfigOverride {
                    track_prefill_tokens: Some(false),
                    ..Default::default()
                }),
                true,
                None,
                0.0,
                None,
                None,
493
                None,
494
                None,
495
496
497
498
499
500
            )
            .await
            .unwrap();

        assert_eq!(
            slots
501
                .active_tokens(Instant::now())
502
503
504
505
506
507
508
509
                .get(&WorkerWithDpRank::new(0, 0))
                .copied(),
            Some(0)
        );

        cancel_token.cancel();
    }

510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
    /// With 48 effective cached tokens out of a 64-token request, the response reports the
    /// cached tokens and overlap for the chosen worker and only the remaining 16 tokens
    /// are tracked as active.
    #[tokio::test]
    async fn test_schedule_uses_weighted_cached_tokens_for_active_tracking() {
        let workers = HashMap::from([(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        )]);
        let (scheduler, slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true, None);
        let worker = WorkerWithDpRank::new(0, 0);

        let response = scheduler
            .schedule(
                Some("req-1".to_string()),       // maybe_request_id
                64,                              // isl_tokens
                Some(vec![1, 2, 3, 4]),          // token_seq
                TierOverlapBlocks::default(),    // tier_overlap_blocks
                HashMap::from([(worker, 0.75)]), // effective_overlap_blocks
                HashMap::from([(worker, 48)]),   // effective_cached_tokens
                HashMap::new(),                  // tree_sizes
                None,                            // router_config_override
                true,                            // update_states
                None,                            // lora_name
                0.0,                             // priority_jump
                None,                            // expected_output_tokens
                None,                            // pinned_worker
                None,                            // allowed_worker_ids
                None,                            // shared_cache_hits
            )
            .await
            .unwrap();

        assert_eq!(response.best_worker, worker);
        assert_eq!(response.cached_tokens, 48);
        assert_eq!(response.effective_overlap_blocks, 0.75);

        let tracked_tokens = slots.active_tokens(Instant::now()).get(&worker).copied();
        assert_eq!(
            tracked_tokens,
            Some(16),
            "weighted cached tokens should reduce tracked prefill load",
        );

        cancel_token.cancel();
    }

556
557
558
559
560
561
562
563
564
565
    #[tokio::test]
    async fn test_mark_prefill_completed_drains_pending_queue() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        );
566
567
        let (scheduler, _slots, _cfg_tx, cancel_token) =
            make_scheduler(workers, Some(0.5), true, None);
568
569
570
571
572
573

        scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![1, 2, 3, 4]),
574
575
576
577
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
578
579
580
581
582
583
                None,
                true,
                None,
                0.0,
                None,
                None,
584
                None,
585
                None,
586
587
588
589
590
591
592
593
594
595
596
597
            )
            .await
            .unwrap();

        let queued = {
            let scheduler = Arc::clone(&scheduler);
            tokio::spawn(async move {
                scheduler
                    .schedule(
                        Some("req-2".to_string()),
                        64,
                        Some(vec![5, 6, 7, 8]),
598
599
600
601
                        TierOverlapBlocks::default(),
                        HashMap::new(),
                        HashMap::new(),
                        HashMap::new(),
602
603
604
605
606
607
                        None,
                        true,
                        None,
                        0.0,
                        None,
                        None,
608
                        None,
609
                        None,
610
611
612
613
614
                    )
                    .await
            })
        };

615
        wait_for_pending_count(&scheduler, 1).await;
616
617
618
619
620
621
622
623

        scheduler.mark_prefill_completed("req-1").await.unwrap();
        queued.await.unwrap().unwrap();
        assert_eq!(scheduler.pending_count(), 0);

        cancel_token.cancel();
    }

624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
    #[tokio::test]
    async fn test_remote_mark_prefill_completed_drains_pending_queue() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        );
        let (scheduler, slots, _cfg_tx, cancel_token) =
            make_scheduler(workers, Some(0.5), true, None);
        let event_tx = start_replica_sync(&slots, &cancel_token);

        scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![1, 2, 3, 4]),
643
644
645
646
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
647
648
649
650
651
652
                None,
                true,
                None,
                0.0,
                None,
                None,
653
                None,
654
                None,
655
656
657
658
659
660
661
662
663
664
665
666
            )
            .await
            .unwrap();

        let queued = {
            let scheduler = Arc::clone(&scheduler);
            tokio::spawn(async move {
                scheduler
                    .schedule(
                        Some("req-2".to_string()),
                        64,
                        Some(vec![5, 6, 7, 8]),
667
668
669
670
                        TierOverlapBlocks::default(),
                        HashMap::new(),
                        HashMap::new(),
                        HashMap::new(),
671
672
673
674
675
676
                        None,
                        true,
                        None,
                        0.0,
                        None,
                        None,
677
                        None,
678
                        None,
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
                    )
                    .await
            })
        };

        wait_for_pending_count(&scheduler, 1).await;

        event_tx
            .send(ActiveSequenceEvent {
                request_id: "req-1".to_string(),
                worker: WorkerWithDpRank::new(0, 0),
                data: ActiveSequenceEventData::MarkPrefillCompleted,
                router_id: 1,
                lora_name: None,
            })
            .unwrap();

        tokio::time::timeout(Duration::from_millis(250), async {
            queued.await.unwrap().unwrap();
        })
        .await
        .unwrap();
        assert_eq!(scheduler.pending_count(), 0);

        cancel_token.cancel();
    }

    #[tokio::test]
    async fn test_remote_queue_update_notification_fires_after_drain() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        );
        let (scheduler, slots, _cfg_tx, cancel_token) =
            make_scheduler(workers, Some(0.5), true, None);
        let event_tx = start_replica_sync(&slots, &cancel_token);
        let mut queue_updates = scheduler.subscribe_queue_updates();

        scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![1, 2, 3, 4]),
726
727
728
729
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
730
731
732
733
734
735
                None,
                true,
                None,
                0.0,
                None,
                None,
736
                None,
737
                None,
738
739
740
741
742
743
744
745
746
747
748
749
            )
            .await
            .unwrap();

        let queued = {
            let scheduler = Arc::clone(&scheduler);
            tokio::spawn(async move {
                scheduler
                    .schedule(
                        Some("req-2".to_string()),
                        64,
                        Some(vec![5, 6, 7, 8]),
750
751
752
753
                        TierOverlapBlocks::default(),
                        HashMap::new(),
                        HashMap::new(),
                        HashMap::new(),
754
755
756
757
758
759
                        None,
                        true,
                        None,
                        0.0,
                        None,
                        None,
760
                        None,
761
                        None,
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
                    )
                    .await
            })
        };

        wait_for_pending_count(&scheduler, 1).await;

        event_tx
            .send(ActiveSequenceEvent {
                request_id: "req-1".to_string(),
                worker: WorkerWithDpRank::new(0, 0),
                data: ActiveSequenceEventData::Free,
                router_id: 1,
                lora_name: None,
            })
            .unwrap();

        tokio::time::timeout(Duration::from_millis(250), queue_updates.changed())
            .await
            .unwrap()
            .unwrap();
        assert_eq!(scheduler.pending_count(), 0);
        queued.await.unwrap().unwrap();

        cancel_token.cancel();
    }

    #[tokio::test]
    async fn test_remote_free_drains_pending_queue() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        );
        let (scheduler, slots, _cfg_tx, cancel_token) =
            make_scheduler(workers, Some(0.5), true, None);
        let event_tx = start_replica_sync(&slots, &cancel_token);

        scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![1, 2, 3, 4]),
808
809
810
811
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
812
813
814
815
816
817
                None,
                true,
                None,
                0.0,
                None,
                None,
818
                None,
819
                None,
820
821
822
823
824
825
826
827
828
829
830
831
            )
            .await
            .unwrap();

        let queued = {
            let scheduler = Arc::clone(&scheduler);
            tokio::spawn(async move {
                scheduler
                    .schedule(
                        Some("req-2".to_string()),
                        64,
                        Some(vec![5, 6, 7, 8]),
832
833
834
835
                        TierOverlapBlocks::default(),
                        HashMap::new(),
                        HashMap::new(),
                        HashMap::new(),
836
837
838
839
840
841
                        None,
                        true,
                        None,
                        0.0,
                        None,
                        None,
842
                        None,
843
                        None,
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
                    )
                    .await
            })
        };

        wait_for_pending_count(&scheduler, 1).await;

        event_tx
            .send(ActiveSequenceEvent {
                request_id: "req-1".to_string(),
                worker: WorkerWithDpRank::new(0, 0),
                data: ActiveSequenceEventData::Free,
                router_id: 1,
                lora_name: None,
            })
            .unwrap();

        tokio::time::timeout(Duration::from_millis(250), async {
            queued.await.unwrap().unwrap();
        })
        .await
        .unwrap();
        assert_eq!(scheduler.pending_count(), 0);

        cancel_token.cancel();
    }

871
872
873
874
875
876
877
878
879
880
    #[tokio::test]
    async fn test_free_updates_active_state() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(64),
                ..Default::default()
            },
        );
881
        let (scheduler, _slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true, None);
882
883
884
885
886
887

        scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![1, 2, 3, 4]),
888
889
890
891
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
892
893
894
895
896
897
                None,
                true,
                Some("adapter-a".to_string()),
                0.0,
                None,
                None,
898
                None,
899
                None,
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
            )
            .await
            .unwrap();
        assert_eq!(
            scheduler.get_active_lora_counts(),
            HashMap::from([(String::from("adapter-a"), 1)])
        );

        scheduler.free("req-1").await.unwrap();
        assert!(scheduler.get_active_lora_counts().is_empty());

        cancel_token.cancel();
    }

    /// Checks that `get_potential_loads` agrees with the underlying slot tracker.
    ///
    /// The expected loads are derived directly from
    /// `slots.potential_blocks_and_tokens` for the same token sequence, token
    /// count (128) and empty cached-token map, and are compared field by field
    /// against what the scheduler reports. Both lists are sorted by
    /// `(worker_id, dp_rank)` so the comparison does not depend on map order.
    #[tokio::test]
    async fn test_get_potential_loads_matches_slots() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(256),
                ..Default::default()
            },
        );
        workers.insert(
            1,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(256),
                ..Default::default()
            },
        );
        let (scheduler, slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true, None);
        let token_seq = vec![11, 22, 33, 44];
        let cached_tokens = HashMap::new();

        let (decode_blocks, prefill_tokens) =
            slots.potential_blocks_and_tokens(Some(&token_seq), 128, cached_tokens.clone());
        // Walk the decode-block map once; a worker missing from the prefill map
        // falls back to the full request size of 128 tokens.
        let mut expected: Vec<_> = decode_blocks
            .iter()
            .map(|(worker, &blocks)| PotentialLoad {
                worker_id: worker.worker_id,
                dp_rank: worker.dp_rank,
                potential_prefill_tokens: prefill_tokens.get(worker).copied().unwrap_or(128),
                potential_decode_blocks: blocks,
            })
            .collect();
        expected.sort_by_key(|load| (load.worker_id, load.dp_rank));

        let mut actual = scheduler.get_potential_loads(Some(token_seq), 128, cached_tokens, true);
        actual.sort_by_key(|load| (load.worker_id, load.dp_rank));

        assert_eq!(actual.len(), expected.len());
        for (actual, expected) in actual.iter().zip(expected.iter()) {
            assert_eq!(actual.worker_id, expected.worker_id);
            assert_eq!(actual.dp_rank, expected.dp_rank);
            assert_eq!(
                actual.potential_prefill_tokens,
                expected.potential_prefill_tokens
            );
            assert_eq!(
                actual.potential_decode_blocks,
                expected.potential_decode_blocks
            );
        }

        cancel_token.cancel();
    }

968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
    /// Checks that reported prefill load shrinks as (paused) time advances.
    ///
    /// The scheduler is built with a `FixedPrefillLoadEstimator` of 10 seconds
    /// and a 100-token request is scheduled. After advancing the paused clock
    /// by 6 seconds the single worker must report 40 potential prefill tokens.
    /// NOTE(review): 40 is consistent with a linear decay of the 100 tokens over
    /// the 10 s estimate (100 * (1 - 6/10)); confirm against the estimator.
    #[tokio::test(start_paused = true)]
    async fn test_get_potential_loads_uses_decayed_prefill_tokens() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(256),
                ..Default::default()
            },
        );
        let estimator: Arc<dyn PrefillLoadEstimator> = Arc::new(FixedPrefillLoadEstimator {
            duration: Duration::from_secs(10),
        });
        let (scheduler, _slots, _cfg_tx, cancel_token) =
            make_scheduler(workers, None, true, Some(estimator));

        scheduler
            .schedule(
                Some("req-1".to_string()),
                100,
                Some(vec![1, 2, 3, 4]),
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
                None,
                true,
                None,
                0.0,
                None,
                None,
                None,
                None,
            )
            .await
            .unwrap();

        // The runtime starts paused, so this moves the clock deterministically
        // without any real waiting.
        tokio::time::advance(Duration::from_secs(6)).await;

        let loads = scheduler.get_potential_loads(None, 0, HashMap::new(), true);
        assert_eq!(loads.len(), 1);
        assert_eq!(loads[0].potential_prefill_tokens, 40);

        cancel_token.cancel();
    }

1014
1015
1016
    #[tokio::test]
    async fn test_register_workers_uses_default_dp_fallback() {
        let (scheduler, _slots, _cfg_tx, cancel_token) =
1017
            make_scheduler(HashMap::new(), None, false, None);
1018
1019

        scheduler.register_workers(&HashSet::from([42]));
1020
        let loads = scheduler.get_potential_loads(None, 64, HashMap::new(), true);
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032

        assert_eq!(loads.len(), 1);
        assert_eq!(loads[0].worker_id, 42);
        assert_eq!(loads[0].dp_rank, 0);

        cancel_token.cancel();
    }

    /// Checks that config updates sent over the watch channel reshape the slots.
    ///
    /// Initially a single worker with the default config yields one potential
    /// load. A new config map is then sent through `cfg_tx`: worker 0 with
    /// `data_parallel_size: 2` plus a new default-config worker 1. The test
    /// polls until three potential loads are reported, failing if that does not
    /// happen within one second.
    #[tokio::test]
    async fn test_worker_watch_updates_slot_ranges() {
        let mut workers = HashMap::new();
        workers.insert(0, SimpleWorkerConfig::default());
        let (scheduler, _slots, cfg_tx, cancel_token) = make_scheduler(workers, None, true, None);

        assert_eq!(
            scheduler
                .get_potential_loads(None, 64, HashMap::new(), true)
                .len(),
            1
        );

        let mut updated_workers = HashMap::new();
        updated_workers.insert(
            0,
            SimpleWorkerConfig {
                data_parallel_size: 2,
                ..Default::default()
            },
        );
        updated_workers.insert(1, SimpleWorkerConfig::default());
        cfg_tx.send(updated_workers).unwrap();

        // The update is observed asynchronously, so yield between polls until
        // the new slot count shows up; the timeout bounds the wait.
        tokio::time::timeout(Duration::from_secs(1), async {
            loop {
                if scheduler
                    .get_potential_loads(None, 64, HashMap::new(), true)
                    .len()
                    == 3
                {
                    break;
                }
                tokio::task::yield_now().await;
            }
        })
        .await
        .unwrap();

        cancel_token.cancel();
    }
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080

    /// Checks the `get_potential_loads` variant called with `false` as its last
    /// argument.
    ///
    /// A 64-token request is scheduled first, then potential loads are queried
    /// for a 64-token request with the final flag set to `false`. The single
    /// worker must report exactly 64 potential prefill tokens.
    /// NOTE(review): presumably the flag disables counting already-active
    /// prefill tokens, so only the queried request's 64 tokens remain; confirm
    /// against `get_potential_loads`.
    #[tokio::test]
    async fn test_get_potential_loads_can_ignore_prefill_tokens() {
        let mut workers = HashMap::new();
        workers.insert(
            0,
            SimpleWorkerConfig {
                max_num_batched_tokens: Some(256),
                ..Default::default()
            },
        );
        let (scheduler, _slots, _cfg_tx, cancel_token) = make_scheduler(workers, None, true, None);

        scheduler
            .schedule(
                Some("req-1".to_string()),
                64,
                Some(vec![11, 22]),
                TierOverlapBlocks::default(),
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
                None,
                true,
                None,
                0.0,
                None,
                None,
                None,
                None,
            )
            .await
            .unwrap();

        let loads = scheduler.get_potential_loads(None, 64, HashMap::new(), false);
        assert_eq!(loads.len(), 1);
        assert_eq!(loads[0].potential_prefill_tokens, 64);

        cancel_token.cancel();
    }
1110
}