prefill_router.rs 28.7 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
// SPDX-License-Identifier: Apache-2.0

4
use std::collections::HashSet;
5
6
use std::sync::{Arc, OnceLock};

7
use anyhow::Result;
8
use futures::StreamExt;
9
use tokio::sync::{OwnedSemaphorePermit, oneshot};
10
use tokio_util::sync::CancellationToken;
11
use tracing::Instrument;
12
13
14
15

use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
16
17
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
18
    },
19
    protocols::{EndpointId, annotated::Annotated, maybe_error::MaybeError},
20
21
22
23
};

use crate::{
    discovery::ModelManager,
24
    kv_router::protocols::WorkerId,
25
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride, protocols::BlockExtraInfo},
26
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
27
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
28
    protocols::common::timing::{RequestPhase, RequestTracker, WORKER_TYPE_PREFILL},
29
30
};

31
32
33
34
35
36
37
38
/// Errors that can occur during prefill routing
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// Prefill router has not been activated yet
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// TODO: Separate prefill worker error from prefill router error
39
    /// Error during prefill execution
40
    #[error("Prefill execution failed: {0}")]
41
42
43
44
    PrefillError(
        String,
        #[source] Option<Box<dyn std::error::Error + Send + Sync + 'static>>,
    ),
45
46
47
48
49
50

    /// Disaggregated params not found in prefill response
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

51
52
53
54
55
56
57
58
/// Result of the prefill phase in `generate()`.
///
/// The variant decides which field of the decode request `generate()` populates.
enum PrefillOutcome {
    /// Bootstrap optimization: prefill spawned in background, bootstrap info ready.
    /// `generate()` stores the payload in the decode request's `bootstrap_info`.
    Bootstrap(BootstrapInfo),
    /// Synchronous prefill completed with result.
    /// `generate()` stores the payload in the decode request's `prefill_result`.
    Completed(PrefillResult),
}

59
/// The inner router used by PrefillRouter
60
#[derive(Clone)]
61
62
63
64
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
65
66
    /// Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
    /// available in KV routing mode where the router has actual bookkeeping.
67
68
69
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

70
impl InnerPrefillRouter {
71
72
73
74
    /// Generate with optional direct routing to specific worker.
    /// For KvRouter, target_worker is ignored since prefill_worker_id is already set on the request.
    /// For SimpleRouter, target_worker triggers direct routing via router.direct().
    async fn generate_to_worker(
75
76
        &self,
        request: SingleIn<PreprocessedRequest>,
77
        target_worker: Option<u64>,
78
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
79
80
81
82
83
84
85
86
87
88
89
90
        match (self, target_worker) {
            // KvRouter: prefill_worker_id already set on request, KvPushRouter::select_worker uses it
            (InnerPrefillRouter::KvRouter(router), _) => router.generate(request).await,
            (InnerPrefillRouter::SimpleRouter(router), Some(worker_id)) => {
                router.direct(request, worker_id).await
            }
            (InnerPrefillRouter::SimpleRouter(router), None) => router.generate(request).await,
        }
    }

    /// Select next worker (for non-KV modes only)
    fn select_next_worker(&self) -> Option<u64> {
91
        match self {
92
93
            InnerPrefillRouter::SimpleRouter(router) => router.select_next_worker(),
            InnerPrefillRouter::KvRouter(_) => None,
94
95
96
97
        }
    }
}

98
99
100
/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
101
///
102
103
104
105
/// Modes:
/// - Query-only: `query_instance_id` annotation present → returns worker IDs without execution
/// - Pre-routed: `prefill_worker_id`/`decode_worker_id` set → routes to specified workers
/// - Normal: Worker IDs determined by router based on KV cache state
106
107
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
108
109
    model_manager: Arc<ModelManager>,
    endpoint_id: OnceLock<EndpointId>,
110
111
    cancel_token: CancellationToken,
    router_mode: RouterMode,
112
    decode_fallback: bool,
113
114
    /// Model name used to look up the worker monitor for prefill client registration
    model_name: String,
115
116
    /// Namespace used to look up the correct WorkerSet's worker monitor
    namespace: String,
117
118
119
120
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
121
122
123
    pub fn disabled(
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
124
        decode_fallback: bool,
125
    ) -> Arc<Self> {
126
127
        Arc::new(Self {
            prefill_router: OnceLock::new(),
128
129
            model_manager,
            endpoint_id: OnceLock::new(),
130
131
            cancel_token: CancellationToken::new(),
            router_mode,
132
            decode_fallback,
133
            model_name: String::new(), // Not used for disabled router
134
            namespace: String::new(),  // Not used for disabled router
135
136
137
        })
    }

138
    #[allow(clippy::too_many_arguments)]
139
140
141
142
143
144
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
145
        decode_fallback: bool,
146
        model_name: String,
147
        namespace: String,
148
149
150
151
152
153
    ) -> Arc<Self> {
        let prefill_router = OnceLock::new();
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router,
154
155
            model_manager: model_manager.clone(),
            endpoint_id: OnceLock::new(),
156
157
            cancel_token: cancel_token.clone(),
            router_mode,
158
            decode_fallback,
159
            model_name,
160
            namespace,
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
        });

        // Spawn background task to wait for activation
        let router_clone = router.clone();
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    if let Err(e) = router_clone.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

204
        // Store endpoint_id for later use in resolve_prefill_worker
205
206
207
208
209
210
211
212
        let _ = self.endpoint_id.set(endpoint.id());

        // Start runtime config watcher for this endpoint (needed for get_disaggregated_endpoint)
        // This must be done before creating the router so bootstrap info is available
        model_manager
            .get_or_create_runtime_config_watcher(&endpoint)
            .await?;

213
        let inner_router = if self.router_mode.is_kv_routing() {
214
            // Create KV chooser using the endpoint (this is a prefill router)
215
            let kv_chooser = model_manager
216
217
218
219
220
221
                .kv_chooser_for(
                    &endpoint,
                    kv_cache_block_size,
                    kv_router_config,
                    WORKER_TYPE_PREFILL,
                )
222
223
                .await?;

224
225
226
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

227
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
228
229
230
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
231
232
233
                monitor.set_prefill_client(client.clone());
            }

234
            // Build the PushRouter for prefill with KV mode using the shared client
235
236
237
238
239
240
241
242
243
244
245
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
246
247
248
            // Create client for simple router
            let client = endpoint.client().await?;

249
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
250
251
252
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
253
254
255
                monitor.set_prefill_client(client.clone());
            }

256
            // Create simple push router with the frontend's router mode
257
258
            // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
            // available in KV routing mode where the router has actual bookkeeping.
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

281
    /// Select a prefill worker and resolve its bootstrap connection info.
282
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
283
    /// Otherwise, query for the best worker (KV mode) or select next worker (non-KV modes).
284
    async fn resolve_prefill_worker(
285
        &self,
286
        req: &PreprocessedRequest,
287
        preselected_worker: Option<u64>,
288
    ) -> Option<(u64, u32, BootstrapInfo)> {
289
        let endpoint_id = self.endpoint_id.get()?;
290
        self.prefill_router.get()?;
291

292
        // Worker selection
293
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
294
            let dp_rank = req.routing.as_ref().and_then(|r| r.dp_rank).unwrap_or(0);
295
296
297
298
299
300
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
301
302
        } else {
            // Use shared worker selection logic (update_states=false for peek behavior)
303
            // Extract LORA name and priority jump from routing hints
304
            let lora_name = req.routing.as_ref().and_then(|r| r.lora_name.clone());
305
306
307
308
309
            let priority_jump = req
                .routing
                .as_ref()
                .and_then(|r| r.priority_jump)
                .unwrap_or(0.0);
310
311
312
313
            let allowed_worker_ids = req
                .routing
                .as_ref()
                .and_then(|r| r.allowed_worker_ids.clone());
314
            let (routing_token_ids, block_mm_infos) = req.block_mm_routing_info();
315
            match self
316
317
318
319
320
321
                .query_prefill_worker(
                    routing_token_ids,
                    block_mm_infos,
                    false,
                    lora_name,
                    priority_jump,
322
                    allowed_worker_ids,
323
                )
324
                .await
325
            {
326
                Ok((worker_id, dp_rank)) => (worker_id, dp_rank),
327
328
                Err(_) => return None,
            }
329
330
        };

331
332
333
334
        // Get bootstrap info from ModelManager (works for ANY mode)
        let endpoint = self
            .model_manager
            .get_disaggregated_endpoint(endpoint_id, worker_id)?;
335
336
337
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

338
        let bootstrap_room: u64 = rand::random_range(0..=i64::MAX as u64);
339

340
        tracing::debug!(
341
342
343
344
345
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
346
            router_mode = ?self.router_mode,
347
348
349
350
351
352
353
354
355
356
357
358
359
360
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

361
362
363
364
365
366
    /// Execute prefill with the given router and extract structured result.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// If `phase_permit` is provided, it is dropped after the first output is received,
    /// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap
367
    /// optimization path to ensure `record_worker_full` completes before the phase changes.
368
369
    ///
    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
370
371
372
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
373
        target_worker: Option<u64>,
374
        phase_permit: Option<OwnedSemaphorePermit>,
375
    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
376
377
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
378
            .generate_to_worker(request, target_worker)
379
            .await
380
381
382
383
384
385
            .map_err(|e| {
                PrefillError::PrefillError(
                    "failed to route to prefill worker".to_string(),
                    Some(e.into()),
                )
            })?;
386

387
        // Drop phase permit now - routing is complete, record_worker_full was called in select_worker.
388
389
390
        // This unblocks set_phase(Decode) in the main task without waiting for prefill output.
        drop(phase_permit);

391
        let Some(first_output) = prefill_response.next().await else {
392
393
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
394
                None,
395
            ));
396
397
        };

398
399
400
401
402
403
404
        if let Some(err) = first_output.err() {
            return Err(PrefillError::PrefillError(
                "Prefill router returned error in output".to_string(),
                Some(Box::new(err)),
            ));
        }

405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
421

422
        let Some(output) = &first_output.data else {
423
424
425
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
426
427
428
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
429
430
431
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
432
433
        };

434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
        // Extract prefill worker ID and dp_rank from disaggregated_params
        let prefill_worker_info =
            disaggregated_params
                .get("worker_id")
                .and_then(|worker_id_json| {
                    let worker_id = worker_id_json
                        .get("prefill_worker_id")
                        .and_then(|v| v.as_u64())?;
                    let dp_rank = worker_id_json
                        .get("prefill_dp_rank")
                        .and_then(|v| v.as_u64())
                        .map(|r| r as u32)
                        .unwrap_or(0);
                    Some((worker_id, dp_rank))
                });
449
450
451
452
453
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
454
            prefill_worker_info,
455
456
457
        ))
    }

458
459
460
461
462
463
    /// Spawn prefill as a background task.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// The `phase_permit` is passed to the spawned task and dropped after the first output,
    /// allowing the main task's `set_phase(Decode)` to proceed.
464
465
466
467
    fn spawn_prefill_task(
        &self,
        prefill_request: SingleIn<PreprocessedRequest>,
        target_worker: Option<u64>,
468
        phase_permit: OwnedSemaphorePermit,
469
    ) {
470
        let router = self.prefill_router.get().cloned();
471
472
473
474
475
476
477
478
479
480
481
        // Capture current span to propagate trace context to the spawned task
        let span = tracing::Span::current();

        tokio::spawn(
            async move {
                match Self::execute_prefill(
                    router,
                    prefill_request,
                    target_worker,
                    Some(phase_permit),
                )
482
                .await
483
484
485
486
487
488
489
                {
                    Ok(_) => {
                        tracing::debug!("Prefill background task completed");
                    }
                    Err(e) => {
                        tracing::warn!("Prefill background task error: {e:?}");
                    }
490
491
                }
            }
492
493
            .instrument(span),
        );
494
495
    }

496
    /// Call the prefill router and extract structured prefill result, worker ID, and dp_rank.
497
498
499
    ///
    /// This is the synchronous prefill path - we wait for prefill to complete before proceeding.
    /// No phase permit is needed since `record_worker` completes before we return.
500
501
    ///
    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
502
503
504
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
505
    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
506
        // For call_prefill path, routing is handled by the router itself (no direct routing needed)
507
508
        // No phase permit needed - we wait for completion before changing phase
        Self::execute_prefill(self.prefill_router.get().cloned(), request, None, None).await
509
    }
510
511
512
513

    /// Query the best prefill worker without executing a request.
    /// Returns (worker_id, dp_rank).
    ///
514
    /// This is the shared worker selection logic used by both `resolve_prefill_worker`
515
516
517
518
    /// and `query_route`.
    pub async fn query_prefill_worker(
        &self,
        token_ids: &[u32],
519
        block_mm_infos: Option<&[Option<BlockExtraInfo>]>,
520
521
522
        update_states: bool,
        lora_name: Option<String>,
        priority_jump: f64,
523
        allowed_worker_ids: Option<HashSet<WorkerId>>,
524
525
526
527
528
529
530
531
532
533
534
535
536
    ) -> Result<(u64, u32)> {
        let prefill_router = self
            .prefill_router
            .get()
            .ok_or_else(|| anyhow::anyhow!(PrefillError::NotActivated))?;

        match prefill_router {
            InnerPrefillRouter::KvRouter(r) => {
                let (worker, _overlap) = r
                    .chooser
                    .find_best_match(
                        None,
                        token_ids,
537
                        block_mm_infos,
538
539
540
541
                        None,
                        update_states,
                        lora_name,
                        priority_jump,
542
                        None,
543
                        allowed_worker_ids,
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
                    )
                    .await?;
                Ok((worker.worker_id, worker.dp_rank))
            }
            InnerPrefillRouter::SimpleRouter(r) => {
                let worker_id = if update_states {
                    r.select_next_worker()
                } else {
                    r.peek_next_worker()
                }
                .ok_or_else(|| anyhow::anyhow!("No workers available for prefill"))?;
                Ok((worker_id, 0))
            }
        }
    }

    /// Check if disaggregated mode is currently active (prefill router activated).
    ///
    /// Returns `true` once the inner router has been stored in `prefill_router`.
    pub fn is_activated(&self) -> bool {
        self.prefill_router.get().is_some()
    }
564
565
}

566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
/// Cancels the router's `cancel_token` on drop.
///
/// The activation task spawned in `PrefillRouter::new` selects on this token, so
/// cancelling it ends that task if it is still waiting for an endpoint.
impl Drop for PrefillRouter {
    fn drop(&mut self) {
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        self.cancel_token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
588
        let (mut req, context) = request.into_parts();
589
        let request_id = context.id().to_string();
590
        let engine_ctx = context.context();
591

Yan Ru Pei's avatar
Yan Ru Pei committed
592
593
594
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

595
596
        // If prefill router is not activated (no prefill workers discovered),
        // this is aggregated mode — route directly to decode.
597
        if self.prefill_router.get().is_none() {
598
599
600
601
602
603
604
605
606
            return next.generate(context.map(|_| req)).await;
        }

        // Ensure tracker exists for routing decisions in disaggregated mode.
        // Create one if not provided by the upstream DeltaGenerator.
        if req.tracker.is_none() {
            req.tracker = Some(Arc::new(RequestTracker::new()));
        }
        let tracker = req.tracker.as_ref().unwrap();
607
        let prefill_phase_permit = tracker.set_phase(RequestPhase::Prefill).await;
608
609

        // Prepare prefill request with max_tokens = 1 (clone after tracker is set)
Yan Ru Pei's avatar
Yan Ru Pei committed
610
611
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
612

613
        // Try to resolve prefill worker upfront: if we can get bootstrap info early,
614
        // spawn prefill in background and proceed to decode immediately.
615
616
617
618
        let preselected_worker = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id);
619

620
621
        let prefill_result = async {
            if let Some((worker_id, dp_rank, bootstrap_info)) = self
622
                .resolve_prefill_worker(&prefill_req, preselected_worker)
623
                .await
624
            {
625
626
627
628
629
630
631
632
                // Bootstrap optimization path: spawn prefill in background
                // We successfully used the peeked worker, so we must now advance the router state
                // to ensure the next request gets a different worker.
                if !self.router_mode.is_kv_routing()
                    && let Some(router) = self.prefill_router.get()
                {
                    router.select_next_worker();
                }
633

634
635
636
637
                let routing = prefill_req.routing_mut();
                routing.prefill_worker_id = Some(worker_id);
                routing.dp_rank = Some(dp_rank);
                prefill_req.bootstrap_info = Some(bootstrap_info.clone());
638

639
640
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
641

642
                // Pass phase permit to spawned task - it drops after first output (record_worker_full complete)
643
644
                // This allows set_phase(Decode) below to proceed only after prefill routing is done
                self.spawn_prefill_task(prefill_context, Some(worker_id), prefill_phase_permit);
645

646
                Ok(PrefillOutcome::Bootstrap(bootstrap_info))
647
648
649
            } else {
                // Original prefill path: wait for prefill to complete
                tracing::debug!("Using original prefill path");
650

651
652
653
                // Drop the phase permit before calling call_prefill - we wait for completion
                // so there's no race with set_phase(Decode) below
                drop(prefill_phase_permit);
654

655
656
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
657

658
                let (result, _worker_info) = self.call_prefill(prefill_context).await?;
659

660
                Ok(PrefillOutcome::Completed(result))
661
662
663
            }
        }
        .await;
664
665
666
667
668
669
670
671
672
673
674
675

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
676
            Ok(outcome) => {
677
                tracing::debug!("Prefill completed, proceeding to decode");
678

679
680
                // Set phase to Decode for the decode request.
                // In bootstrap path, this blocks until the spawned prefill task drops its permit
681
                // (after first output / record_worker_full completes), ensuring correct phase for routing.
682
                if let Some(ref tracker) = req.tracker {
683
684
                    let _decode_permit = tracker.set_phase(RequestPhase::Decode).await;
                    // Permit is dropped immediately - decode proceeds, no need to hold it
685
686
                }

687
                let mut decode_req = req;
688

689
690
691
692
693
694
695
                match outcome {
                    PrefillOutcome::Bootstrap(info) => {
                        decode_req.bootstrap_info = Some(info);
                    }
                    PrefillOutcome::Completed(result) => {
                        decode_req.prefill_result = Some(result);
                    }
696
697
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
698
699
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
700

701
702
703
704
                // Set router_config_override for decode:
                // - overlap_score_weight = 0 (no KV cache overlap scoring for decode)
                // - assume_kv_reuse = false (generate random hashes since decode workers
                //   may already have blocks cached from prefill transfer)
705
706
707
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
708
                    assume_kv_reuse: Some(false),
709
710
711
712
                    ..existing_override.unwrap_or_default()
                });

                // Map the modified request through with preserved context
713
                let decode_request = context.map(|_| decode_req);
714
715
                next.generate(decode_request).await
            }
716
            Err(PrefillError::NotActivated) => {
717
                if !self.decode_fallback {
718
                    tracing::error!(
719
                        "No prefill workers discovered yet and decode fallback is disabled. Failing request."
720
721
722
                    );
                    return Err(anyhow::anyhow!(PrefillError::NotActivated));
                }
723
                tracing::debug!("No prefill workers discovered yet, falling back to decode-only");
724
725
                next.generate(context.map(|_| req)).await
            }
726
            Err(e) => {
727
                if !self.decode_fallback {
728
729
                    tracing::error!(
                        error = %e,
730
                        "Remote prefill failed and decode fallback is disabled. Failing request."
731
732
733
                    );
                    return Err(anyhow::anyhow!(e));
                }
734
735
736
737
                tracing::warn!(
                    error = %e,
                    "Remote prefill failed, falling back to decode-only. This may impact performance in disaggregated deployments. Verify prefill workers are healthy and accessible."
                );
738
739
740
741
742
                next.generate(context.map(|_| req)).await
            }
        }
    }
}