// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

4
use std::collections::HashSet;
5
6
use std::sync::{Arc, OnceLock};

7
use anyhow::Result;
8
use futures::StreamExt;
9
use tokio::sync::{OwnedSemaphorePermit, oneshot};
10
use tokio_util::sync::CancellationToken;
11
use tracing::Instrument;
12

13
14
15
16
use dynamo_kv_router::{
    config::{KvRouterConfig, RouterConfigOverride},
    protocols::{BlockExtraInfo, WorkerId},
};
17
18
19
use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
20
21
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
22
    },
23
    protocols::{EndpointId, annotated::Annotated, maybe_error::MaybeError},
24
25
26
27
};

use crate::{
    discovery::ModelManager,
28
    kv_router::KvPushRouter,
29
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
30
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
31
    protocols::common::timing::{RequestPhase, RequestTracker, WORKER_TYPE_PREFILL},
32
33
};

34
35
36
37
38
39
40
41
/// Errors that can occur during prefill routing
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// Prefill router has not been activated yet
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// TODO: Separate prefill worker error from prefill router error
42
    /// Error during prefill execution
43
    #[error("Prefill execution failed: {0}")]
44
45
46
47
    PrefillError(
        String,
        #[source] Option<Box<dyn std::error::Error + Send + Sync + 'static>>,
    ),
48
49
50
51
52
53

    /// Disaggregated params not found in prefill response
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

54
55
56
57
58
59
60
61
/// Result of the prefill phase in `generate()`.
///
/// Tells the decode stage which piece of prefill state to attach to the
/// decode request: bootstrap connection info (prefill still running in the
/// background) or the finished prefill result.
enum PrefillOutcome {
    /// Bootstrap optimization: prefill spawned in background, bootstrap info ready
    Bootstrap(BootstrapInfo),
    /// Synchronous prefill completed with result
    Completed(PrefillResult),
}

62
/// The inner router used by PrefillRouter
63
#[derive(Clone)]
64
65
66
67
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
68
69
    /// Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
    /// available in KV routing mode where the router has actual bookkeeping.
70
71
72
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

73
impl InnerPrefillRouter {
74
75
76
77
    /// Generate with optional direct routing to specific worker.
    /// For KvRouter, target_worker is ignored since prefill_worker_id is already set on the request.
    /// For SimpleRouter, target_worker triggers direct routing via router.direct().
    async fn generate_to_worker(
78
79
        &self,
        request: SingleIn<PreprocessedRequest>,
80
        target_worker: Option<u64>,
81
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
82
83
84
85
86
87
88
89
90
91
92
93
        match (self, target_worker) {
            // KvRouter: prefill_worker_id already set on request, KvPushRouter::select_worker uses it
            (InnerPrefillRouter::KvRouter(router), _) => router.generate(request).await,
            (InnerPrefillRouter::SimpleRouter(router), Some(worker_id)) => {
                router.direct(request, worker_id).await
            }
            (InnerPrefillRouter::SimpleRouter(router), None) => router.generate(request).await,
        }
    }

    /// Select next worker (for non-KV modes only)
    fn select_next_worker(&self) -> Option<u64> {
94
        match self {
95
96
            InnerPrefillRouter::SimpleRouter(router) => router.select_next_worker(),
            InnerPrefillRouter::KvRouter(_) => None,
97
98
99
100
        }
    }
}

101
102
103
/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
104
///
105
106
107
108
/// Modes:
/// - Query-only: `query_instance_id` annotation present → returns worker IDs without execution
/// - Pre-routed: `prefill_worker_id`/`decode_worker_id` set → routes to specified workers
/// - Normal: Worker IDs determined by router based on KV cache state
109
110
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
111
112
    model_manager: Arc<ModelManager>,
    endpoint_id: OnceLock<EndpointId>,
113
114
    cancel_token: CancellationToken,
    router_mode: RouterMode,
115
    enforce_disagg: bool,
116
117
    /// Model name used to look up the worker monitor for prefill client registration
    model_name: String,
118
119
    /// Namespace used to look up the correct WorkerSet's worker monitor
    namespace: String,
120
    is_eagle: bool,
121
122
123
124
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
125
126
127
    pub fn disabled(
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
128
        enforce_disagg: bool,
129
    ) -> Arc<Self> {
130
131
        Arc::new(Self {
            prefill_router: OnceLock::new(),
132
133
            model_manager,
            endpoint_id: OnceLock::new(),
134
135
            cancel_token: CancellationToken::new(),
            router_mode,
136
            enforce_disagg,
137
            model_name: String::new(), // Not used for disabled router
138
            namespace: String::new(),  // Not used for disabled router
139
            is_eagle: false,
140
141
142
        })
    }

143
    #[expect(clippy::too_many_arguments)]
144
145
146
147
148
149
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
150
        enforce_disagg: bool,
151
        model_name: String,
152
        namespace: String,
153
        is_eagle: bool,
154
155
156
157
158
159
    ) -> Arc<Self> {
        let prefill_router = OnceLock::new();
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router,
160
161
            model_manager: model_manager.clone(),
            endpoint_id: OnceLock::new(),
162
163
            cancel_token: cancel_token.clone(),
            router_mode,
164
            enforce_disagg,
165
            model_name,
166
            namespace,
167
            is_eagle,
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
        });

        // Spawn background task to wait for activation
        let router_clone = router.clone();
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    if let Err(e) = router_clone.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

211
        // Store endpoint_id for later use in resolve_prefill_worker
212
213
214
215
216
217
218
219
        let _ = self.endpoint_id.set(endpoint.id());

        // Start runtime config watcher for this endpoint (needed for get_disaggregated_endpoint)
        // This must be done before creating the router so bootstrap info is available
        model_manager
            .get_or_create_runtime_config_watcher(&endpoint)
            .await?;

220
        let inner_router = if self.router_mode.is_kv_routing() {
221
            // Create KV chooser using the endpoint (this is a prefill router)
222
            let kv_chooser = model_manager
223
224
225
226
227
                .kv_chooser_for(
                    &endpoint,
                    kv_cache_block_size,
                    kv_router_config,
                    WORKER_TYPE_PREFILL,
228
                    Some(self.model_name.clone()),
229
                    self.is_eagle,
230
                )
231
232
                .await?;

233
234
235
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

236
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
237
238
239
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
240
241
242
                monitor.set_prefill_client(client.clone());
            }

243
            // Build the PushRouter for prefill with KV mode using the shared client
244
245
246
247
248
249
250
251
252
253
254
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
255
256
257
            // Create client for simple router
            let client = endpoint.client().await?;

258
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
259
260
261
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
262
263
264
                monitor.set_prefill_client(client.clone());
            }

265
            // Create simple push router with the frontend's router mode
266
267
            // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
            // available in KV routing mode where the router has actual bookkeeping.
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

290
    /// Select a prefill worker and resolve its bootstrap connection info.
291
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
292
    /// Otherwise, query for the best worker (KV mode) or select next worker (non-KV modes).
293
    async fn resolve_prefill_worker(
294
        &self,
295
        req: &PreprocessedRequest,
296
        preselected_worker: Option<u64>,
297
    ) -> Option<(u64, u32, BootstrapInfo)> {
298
        let endpoint_id = self.endpoint_id.get()?;
299
        self.prefill_router.get()?;
300

301
        // Worker selection
302
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
303
            let dp_rank = req.routing.as_ref().and_then(|r| r.dp_rank).unwrap_or(0);
304
305
306
307
308
309
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
310
311
        } else {
            // Use shared worker selection logic (update_states=false for peek behavior)
312
            // Extract LORA name and priority jump from routing hints
313
            let lora_name = req.routing.as_ref().and_then(|r| r.lora_name.clone());
314
315
316
317
318
            let priority_jump = req
                .routing
                .as_ref()
                .and_then(|r| r.priority_jump)
                .unwrap_or(0.0);
319
320
321
322
            let allowed_worker_ids = req
                .routing
                .as_ref()
                .and_then(|r| r.allowed_worker_ids.clone());
323
            let (routing_token_ids, block_mm_infos) = req.block_mm_routing_info();
324
            match self
325
326
327
328
329
330
                .query_prefill_worker(
                    routing_token_ids,
                    block_mm_infos,
                    false,
                    lora_name,
                    priority_jump,
331
                    allowed_worker_ids,
332
                )
333
                .await
334
            {
335
                Ok((worker_id, dp_rank)) => (worker_id, dp_rank),
336
337
                Err(_) => return None,
            }
338
339
        };

340
341
342
343
        // Get bootstrap info from ModelManager (works for ANY mode)
        let endpoint = self
            .model_manager
            .get_disaggregated_endpoint(endpoint_id, worker_id)?;
344
345
346
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

347
        let bootstrap_room: u64 = rand::random_range(0..=i64::MAX.cast_unsigned());
348

349
        tracing::debug!(
350
351
352
353
354
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
355
            router_mode = ?self.router_mode,
356
357
358
359
360
361
362
363
364
365
366
367
368
369
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

370
371
372
373
374
375
    /// Execute prefill with the given router and extract structured result.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// If `phase_permit` is provided, it is dropped after the first output is received,
    /// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap
376
    /// optimization path to ensure `record_worker_full` completes before the phase changes.
377
378
    ///
    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
379
380
381
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
382
        target_worker: Option<u64>,
383
        phase_permit: Option<OwnedSemaphorePermit>,
384
    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
385
386
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
387
            .generate_to_worker(request, target_worker)
388
            .await
389
390
391
392
393
394
            .map_err(|e| {
                PrefillError::PrefillError(
                    "failed to route to prefill worker".to_string(),
                    Some(e.into()),
                )
            })?;
395

396
        // Drop phase permit now - routing is complete, record_worker_full was called in select_worker.
397
398
399
        // This unblocks set_phase(Decode) in the main task without waiting for prefill output.
        drop(phase_permit);

400
        let Some(first_output) = prefill_response.next().await else {
401
402
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
403
                None,
404
            ));
405
406
        };

407
408
409
410
411
412
413
        if let Some(err) = first_output.err() {
            return Err(PrefillError::PrefillError(
                "Prefill router returned error in output".to_string(),
                Some(Box::new(err)),
            ));
        }

414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
430

431
        let Some(output) = &first_output.data else {
432
433
434
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
435
436
437
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
438
439
440
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
441
442
        };

443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
        // Extract prefill worker ID and dp_rank from disaggregated_params
        let prefill_worker_info =
            disaggregated_params
                .get("worker_id")
                .and_then(|worker_id_json| {
                    let worker_id = worker_id_json
                        .get("prefill_worker_id")
                        .and_then(|v| v.as_u64())?;
                    let dp_rank = worker_id_json
                        .get("prefill_dp_rank")
                        .and_then(|v| v.as_u64())
                        .map(|r| r as u32)
                        .unwrap_or(0);
                    Some((worker_id, dp_rank))
                });
458
459
460
461
462
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
463
            prefill_worker_info,
464
465
466
        ))
    }

467
468
469
470
471
472
    /// Spawn prefill as a background task.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// The `phase_permit` is passed to the spawned task and dropped after the first output,
    /// allowing the main task's `set_phase(Decode)` to proceed.
473
474
475
476
    fn spawn_prefill_task(
        &self,
        prefill_request: SingleIn<PreprocessedRequest>,
        target_worker: Option<u64>,
477
        phase_permit: OwnedSemaphorePermit,
478
    ) {
479
        let router = self.prefill_router.get().cloned();
480
481
482
483
484
485
486
487
488
489
490
        // Capture current span to propagate trace context to the spawned task
        let span = tracing::Span::current();

        tokio::spawn(
            async move {
                match Self::execute_prefill(
                    router,
                    prefill_request,
                    target_worker,
                    Some(phase_permit),
                )
491
                .await
492
493
494
495
496
497
498
                {
                    Ok(_) => {
                        tracing::debug!("Prefill background task completed");
                    }
                    Err(e) => {
                        tracing::warn!("Prefill background task error: {e:?}");
                    }
499
500
                }
            }
501
502
            .instrument(span),
        );
503
504
    }

505
506
507
    /// Query the best prefill worker without executing a request.
    /// Returns (worker_id, dp_rank).
    ///
508
    /// This is the shared worker selection logic used by both `resolve_prefill_worker`
509
    /// and `query_route`.
510
511
512
513
514
515
516
    /// Register externally-provided workers in the prefill router's slot tracker.
    pub fn register_workers(&self, worker_ids: &HashSet<WorkerId>) {
        if let Some(InnerPrefillRouter::KvRouter(r)) = self.prefill_router.get() {
            r.chooser.register_workers(worker_ids);
        }
    }

517
518
519
    pub async fn query_prefill_worker(
        &self,
        token_ids: &[u32],
520
        block_mm_infos: Option<&[Option<BlockExtraInfo>]>,
521
522
523
        update_states: bool,
        lora_name: Option<String>,
        priority_jump: f64,
524
        allowed_worker_ids: Option<HashSet<WorkerId>>,
525
526
527
528
529
530
531
532
533
534
535
536
537
    ) -> Result<(u64, u32)> {
        let prefill_router = self
            .prefill_router
            .get()
            .ok_or_else(|| anyhow::anyhow!(PrefillError::NotActivated))?;

        match prefill_router {
            InnerPrefillRouter::KvRouter(r) => {
                let (worker, _overlap) = r
                    .chooser
                    .find_best_match(
                        None,
                        token_ids,
538
                        block_mm_infos,
539
540
541
542
                        None,
                        update_states,
                        lora_name,
                        priority_jump,
543
                        None,
544
                        allowed_worker_ids,
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
                    )
                    .await?;
                Ok((worker.worker_id, worker.dp_rank))
            }
            InnerPrefillRouter::SimpleRouter(r) => {
                let worker_id = if update_states {
                    r.select_next_worker()
                } else {
                    r.peek_next_worker()
                }
                .ok_or_else(|| anyhow::anyhow!("No workers available for prefill"))?;
                Ok((worker_id, 0))
            }
        }
    }

    /// Check if disaggregated mode is currently active (prefill router activated).
    ///
    /// Returns `true` once the inner prefill router has been set by activation.
    pub fn is_activated(&self) -> bool {
        self.prefill_router.get().is_some()
    }
565
566
}

567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
/// Cancels the router's token on drop so the background activation task
/// spawned by `PrefillRouter::new` is told to stop waiting.
impl Drop for PrefillRouter {
    fn drop(&mut self) {
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        // Signals the `cancelled()` branch of the activation task's select.
        self.cancel_token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
589
        let (mut req, context) = request.into_parts();
590
        let request_id = context.id().to_string();
591
        let engine_ctx = context.context();
592

Yan Ru Pei's avatar
Yan Ru Pei committed
593
594
595
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

596
597
        // If prefill router is not activated (no prefill workers discovered),
        // this is aggregated mode — route directly to decode.
598
        // With --enforce-disagg, fail instead of falling back.
599
        if self.prefill_router.get().is_none() {
600
601
602
            if self.enforce_disagg {
                return Err(anyhow::anyhow!(PrefillError::NotActivated));
            }
603
604
605
606
607
608
609
610
611
            return next.generate(context.map(|_| req)).await;
        }

        // Ensure tracker exists for routing decisions in disaggregated mode.
        // Create one if not provided by the upstream DeltaGenerator.
        if req.tracker.is_none() {
            req.tracker = Some(Arc::new(RequestTracker::new()));
        }
        let tracker = req.tracker.as_ref().unwrap();
612
        let prefill_phase_permit = tracker.set_phase(RequestPhase::Prefill).await;
613
614

        // Prepare prefill request with max_tokens = 1 (clone after tracker is set)
Yan Ru Pei's avatar
Yan Ru Pei committed
615
616
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
617

618
        // Try to resolve prefill worker upfront: if we can get bootstrap info early,
619
        // spawn prefill in background and proceed to decode immediately.
620
621
622
623
        let preselected_worker = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id);
624

625
626
627
628
629
630
631
        if self.router_mode.is_direct_routing() && preselected_worker.is_none() {
            return Err(anyhow::anyhow!(
                "Prefill worker ID required in Direct routing mode but none found in request. \
                 Expected prefill_worker_id to be set via x-prefill-instance-id header by external router (e.g., EPP)."
            ));
        }

632
633
        let prefill_result = async {
            if let Some((worker_id, dp_rank, bootstrap_info)) = self
634
                .resolve_prefill_worker(&prefill_req, preselected_worker)
635
                .await
636
            {
637
638
639
640
641
642
643
644
                // Bootstrap optimization path: spawn prefill in background
                // We successfully used the peeked worker, so we must now advance the router state
                // to ensure the next request gets a different worker.
                if !self.router_mode.is_kv_routing()
                    && let Some(router) = self.prefill_router.get()
                {
                    router.select_next_worker();
                }
645

646
647
648
649
                let routing = prefill_req.routing_mut();
                routing.prefill_worker_id = Some(worker_id);
                routing.dp_rank = Some(dp_rank);
                prefill_req.bootstrap_info = Some(bootstrap_info.clone());
650

651
652
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
653

654
                // Pass phase permit to spawned task - it drops after first output (record_worker_full complete)
655
656
                // This allows set_phase(Decode) below to proceed only after prefill routing is done
                self.spawn_prefill_task(prefill_context, Some(worker_id), prefill_phase_permit);
657

658
                Ok(PrefillOutcome::Bootstrap(bootstrap_info))
659
660
661
            } else {
                // Original prefill path: wait for prefill to complete
                tracing::debug!("Using original prefill path");
662

663
                // Drop the phase permit - we wait for completion
664
665
                // so there's no race with set_phase(Decode) below
                drop(prefill_phase_permit);
666

667
668
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
669

670
671
672
673
674
675
676
677
678
                // In Direct mode, pass preselected_worker so execute_prefill uses
                // router.direct() instead of router.generate() (which bails in Direct mode).
                let (result, _worker_info) = Self::execute_prefill(
                    self.prefill_router.get().cloned(),
                    prefill_context,
                    preselected_worker,
                    None,
                )
                .await?;
679

680
                Ok(PrefillOutcome::Completed(result))
681
682
683
            }
        }
        .await;
684
685
686
687
688
689
690
691
692
693
694
695

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
696
            Ok(outcome) => {
697
                tracing::debug!("Prefill completed, proceeding to decode");
698

699
700
                // Set phase to Decode for the decode request.
                // In bootstrap path, this blocks until the spawned prefill task drops its permit
701
                // (after first output / record_worker_full completes), ensuring correct phase for routing.
702
                if let Some(ref tracker) = req.tracker {
703
704
                    let _decode_permit = tracker.set_phase(RequestPhase::Decode).await;
                    // Permit is dropped immediately - decode proceeds, no need to hold it
705
706
                }

707
                let mut decode_req = req;
708

709
710
711
712
713
714
715
                match outcome {
                    PrefillOutcome::Bootstrap(info) => {
                        decode_req.bootstrap_info = Some(info);
                    }
                    PrefillOutcome::Completed(result) => {
                        decode_req.prefill_result = Some(result);
                    }
716
717
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
718
719
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
720

721
722
723
724
                // Set router_config_override for decode:
                // - overlap_score_weight = 0 (no KV cache overlap scoring for decode)
                // - assume_kv_reuse = false (generate random hashes since decode workers
                //   may already have blocks cached from prefill transfer)
725
726
727
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
728
                    assume_kv_reuse: Some(false),
729
730
731
732
                    ..existing_override.unwrap_or_default()
                });

                // Map the modified request through with preserved context
733
                let decode_request = context.map(|_| decode_req);
734
735
                next.generate(decode_request).await
            }
736
            Err(PrefillError::NotActivated) => {
737
738
                tracing::error!("Prefill router not activated, failing request");
                Err(anyhow::anyhow!(PrefillError::NotActivated))
739
            }
740
            Err(e) => {
741
742
                tracing::error!(error = %e, "Remote prefill failed, failing request");
                Err(anyhow::anyhow!(e))
743
744
745
746
            }
        }
    }
}