// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::sync::{Arc, OnceLock};

use anyhow::Result;
use futures::StreamExt;
use tokio::sync::{OwnedSemaphorePermit, oneshot};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;

use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
    },
    protocols::{EndpointId, annotated::Annotated, maybe_error::MaybeError},
};

use crate::{
    discovery::ModelManager,
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride, protocols::BlockExtraInfo},
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
    protocols::common::timing::{RequestPhase, RequestTracker, WORKER_TYPE_PREFILL},
};

/// Errors that can occur during prefill routing
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// Prefill router has not been activated yet
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// TODO: Separate prefill worker error from prefill router error
37
    /// Error during prefill execution
38
    #[error("Prefill execution failed: {0}")]
39
40
41
42
    PrefillError(
        String,
        #[source] Option<Box<dyn std::error::Error + Send + Sync + 'static>>,
    ),
43
44
45
46
47
48

    /// Disaggregated params not found in prefill response
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

/// The inner router used by PrefillRouter
50
#[derive(Clone)]
51
52
53
54
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
55
56
    /// Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
    /// available in KV routing mode where the router has actual bookkeeping.
57
58
59
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

impl InnerPrefillRouter {
61
62
63
64
    /// Generate with optional direct routing to specific worker.
    /// For KvRouter, target_worker is ignored since prefill_worker_id is already set on the request.
    /// For SimpleRouter, target_worker triggers direct routing via router.direct().
    async fn generate_to_worker(
65
66
        &self,
        request: SingleIn<PreprocessedRequest>,
67
        target_worker: Option<u64>,
68
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
69
70
71
72
73
74
75
76
77
78
79
80
        match (self, target_worker) {
            // KvRouter: prefill_worker_id already set on request, KvPushRouter::select_worker uses it
            (InnerPrefillRouter::KvRouter(router), _) => router.generate(request).await,
            (InnerPrefillRouter::SimpleRouter(router), Some(worker_id)) => {
                router.direct(request, worker_id).await
            }
            (InnerPrefillRouter::SimpleRouter(router), None) => router.generate(request).await,
        }
    }

    /// Select next worker (for non-KV modes only)
    fn select_next_worker(&self) -> Option<u64> {
81
        match self {
82
83
            InnerPrefillRouter::SimpleRouter(router) => router.select_next_worker(),
            InnerPrefillRouter::KvRouter(_) => None,
84
85
86
87
        }
    }
}

/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
91
///
92
93
94
95
/// Modes:
/// - Query-only: `query_instance_id` annotation present → returns worker IDs without execution
/// - Pre-routed: `prefill_worker_id`/`decode_worker_id` set → routes to specified workers
/// - Normal: Worker IDs determined by router based on KV cache state
96
97
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
98
99
    model_manager: Arc<ModelManager>,
    endpoint_id: OnceLock<EndpointId>,
100
101
    cancel_token: CancellationToken,
    router_mode: RouterMode,
102
    decode_fallback: bool,
103
104
    /// Model name used to look up the worker monitor for prefill client registration
    model_name: String,
105
106
    /// Namespace used to look up the correct WorkerSet's worker monitor
    namespace: String,
107
108
109
110
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
111
112
113
    pub fn disabled(
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
114
        decode_fallback: bool,
115
    ) -> Arc<Self> {
116
117
        Arc::new(Self {
            prefill_router: OnceLock::new(),
118
119
            model_manager,
            endpoint_id: OnceLock::new(),
120
121
            cancel_token: CancellationToken::new(),
            router_mode,
122
            decode_fallback,
123
            model_name: String::new(), // Not used for disabled router
124
            namespace: String::new(),  // Not used for disabled router
125
126
127
        })
    }

128
    #[allow(clippy::too_many_arguments)]
129
130
131
132
133
134
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
135
        decode_fallback: bool,
136
        model_name: String,
137
        namespace: String,
138
139
140
141
142
143
    ) -> Arc<Self> {
        let prefill_router = OnceLock::new();
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router,
144
145
            model_manager: model_manager.clone(),
            endpoint_id: OnceLock::new(),
146
147
            cancel_token: cancel_token.clone(),
            router_mode,
148
            decode_fallback,
149
            model_name,
150
            namespace,
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
        });

        // Spawn background task to wait for activation
        let router_clone = router.clone();
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    if let Err(e) = router_clone.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

194
195
196
197
198
199
200
201
202
        // Store endpoint_id for later use in build_bootstrap_info
        let _ = self.endpoint_id.set(endpoint.id());

        // Start runtime config watcher for this endpoint (needed for get_disaggregated_endpoint)
        // This must be done before creating the router so bootstrap info is available
        model_manager
            .get_or_create_runtime_config_watcher(&endpoint)
            .await?;

203
        let inner_router = if self.router_mode.is_kv_routing() {
204
            // Create KV chooser using the endpoint (this is a prefill router)
205
            let kv_chooser = model_manager
206
207
208
209
210
211
                .kv_chooser_for(
                    &endpoint,
                    kv_cache_block_size,
                    kv_router_config,
                    WORKER_TYPE_PREFILL,
                )
212
213
                .await?;

214
215
216
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

217
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
218
219
220
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
221
222
223
                monitor.set_prefill_client(client.clone());
            }

224
            // Build the PushRouter for prefill with KV mode using the shared client
225
226
227
228
229
230
231
232
233
234
235
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
236
237
238
            // Create client for simple router
            let client = endpoint.client().await?;

239
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
240
241
242
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
243
244
245
                monitor.set_prefill_client(client.clone());
            }

246
            // Create simple push router with the frontend's router mode
247
248
            // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
            // available in KV routing mode where the router has actual bookkeeping.
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

271
272
    /// Build bootstrap_info for disaggregated serving
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
273
    /// Otherwise, query for the best worker (KV mode) or select next worker (non-KV modes).
274
    async fn build_bootstrap_info(
275
        &self,
276
        req: &PreprocessedRequest,
277
        preselected_worker: Option<u64>,
278
    ) -> Option<(u64, u32, BootstrapInfo)> {
279
        let endpoint_id = self.endpoint_id.get()?;
280
        let _prefill_router = self.prefill_router.get()?;
281

282
        // Worker selection
283
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
284
            let dp_rank = req.routing.as_ref().and_then(|r| r.dp_rank).unwrap_or(0);
285
286
287
288
289
290
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
291
292
        } else {
            // Use shared worker selection logic (update_states=false for peek behavior)
293
            // Extract LORA name and priority jump from routing hints
294
            let lora_name = req.routing.as_ref().and_then(|r| r.lora_name.clone());
295
296
297
298
299
            let priority_jump = req
                .routing
                .as_ref()
                .and_then(|r| r.priority_jump)
                .unwrap_or(0.0);
300
            let (routing_token_ids, block_mm_infos) = req.block_mm_routing_info();
301
            match self
302
303
304
305
306
307
308
                .query_prefill_worker(
                    routing_token_ids,
                    block_mm_infos,
                    false,
                    lora_name,
                    priority_jump,
                )
309
                .await
310
            {
311
                Ok((worker_id, dp_rank)) => (worker_id, dp_rank),
312
313
                Err(_) => return None,
            }
314
315
        };

316
317
318
319
        // Get bootstrap info from ModelManager (works for ANY mode)
        let endpoint = self
            .model_manager
            .get_disaggregated_endpoint(endpoint_id, worker_id)?;
320
321
322
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

323
        let bootstrap_room: u64 = rand::random_range(0..=i64::MAX as u64);
324

325
        tracing::debug!(
326
327
328
329
330
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
331
            router_mode = ?self.router_mode,
332
333
334
335
336
337
338
339
340
341
342
343
344
345
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

346
347
348
349
350
351
    /// Execute prefill with the given router and extract structured result.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// If `phase_permit` is provided, it is dropped after the first output is received,
    /// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap
352
    /// optimization path to ensure `record_worker_full` completes before the phase changes.
353
354
    ///
    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
355
356
357
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
358
        target_worker: Option<u64>,
359
        phase_permit: Option<OwnedSemaphorePermit>,
360
    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
361
362
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
363
            .generate_to_worker(request, target_worker)
364
            .await
365
366
367
368
369
370
            .map_err(|e| {
                PrefillError::PrefillError(
                    "failed to route to prefill worker".to_string(),
                    Some(e.into()),
                )
            })?;
371

372
        // Drop phase permit now - routing is complete, record_worker_full was called in select_worker.
373
374
375
        // This unblocks set_phase(Decode) in the main task without waiting for prefill output.
        drop(phase_permit);

376
        let Some(first_output) = prefill_response.next().await else {
377
378
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
379
                None,
380
            ));
381
382
        };

383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
399

400
        if let Some(err) = first_output.err() {
401
402
403
404
            return Err(PrefillError::PrefillError(
                "Prefill router returned error in output".to_string(),
                Some(Box::new(err)),
            ));
405
406
407
        }

        let Some(output) = &first_output.data else {
408
409
410
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
411
412
413
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
414
415
416
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
417
418
        };

419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
        // Extract prefill worker ID and dp_rank from disaggregated_params
        let prefill_worker_info =
            disaggregated_params
                .get("worker_id")
                .and_then(|worker_id_json| {
                    let worker_id = worker_id_json
                        .get("prefill_worker_id")
                        .and_then(|v| v.as_u64())?;
                    let dp_rank = worker_id_json
                        .get("prefill_dp_rank")
                        .and_then(|v| v.as_u64())
                        .map(|r| r as u32)
                        .unwrap_or(0);
                    Some((worker_id, dp_rank))
                });
434
435
436
437
438
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
439
            prefill_worker_info,
440
441
442
        ))
    }

443
444
445
446
447
448
    /// Spawn prefill as a background task.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// The `phase_permit` is passed to the spawned task and dropped after the first output,
    /// allowing the main task's `set_phase(Decode)` to proceed.
449
450
451
452
    fn spawn_prefill_task(
        &self,
        prefill_request: SingleIn<PreprocessedRequest>,
        target_worker: Option<u64>,
453
        phase_permit: OwnedSemaphorePermit,
454
    ) {
455
        let router = self.prefill_router.get().cloned();
456
457
458
459
460
461
462
463
464
465
466
        // Capture current span to propagate trace context to the spawned task
        let span = tracing::Span::current();

        tokio::spawn(
            async move {
                match Self::execute_prefill(
                    router,
                    prefill_request,
                    target_worker,
                    Some(phase_permit),
                )
467
                .await
468
469
470
471
472
473
474
                {
                    Ok(_) => {
                        tracing::debug!("Prefill background task completed");
                    }
                    Err(e) => {
                        tracing::warn!("Prefill background task error: {e:?}");
                    }
475
476
                }
            }
477
478
            .instrument(span),
        );
479
480
    }

481
    /// Call the prefill router and extract structured prefill result, worker ID, and dp_rank.
482
483
484
    ///
    /// This is the synchronous prefill path - we wait for prefill to complete before proceeding.
    /// No phase permit is needed since `record_worker` completes before we return.
485
486
    ///
    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
487
488
489
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
490
    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
491
        // For call_prefill path, routing is handled by the router itself (no direct routing needed)
492
493
        // No phase permit needed - we wait for completion before changing phase
        Self::execute_prefill(self.prefill_router.get().cloned(), request, None, None).await
494
    }
495
496
497
498
499
500
501
502
503

    /// Query the best prefill worker without executing a request.
    /// Returns (worker_id, dp_rank).
    ///
    /// This is the shared worker selection logic used by both `build_bootstrap_info`
    /// and `query_route`.
    pub async fn query_prefill_worker(
        &self,
        token_ids: &[u32],
504
        block_mm_infos: Option<&[Option<BlockExtraInfo>]>,
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
        update_states: bool,
        lora_name: Option<String>,
        priority_jump: f64,
    ) -> Result<(u64, u32)> {
        let prefill_router = self
            .prefill_router
            .get()
            .ok_or_else(|| anyhow::anyhow!(PrefillError::NotActivated))?;

        match prefill_router {
            InnerPrefillRouter::KvRouter(r) => {
                let (worker, _overlap) = r
                    .chooser
                    .find_best_match(
                        None,
                        token_ids,
521
                        block_mm_infos,
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
                        None,
                        update_states,
                        lora_name,
                        priority_jump,
                    )
                    .await?;
                Ok((worker.worker_id, worker.dp_rank))
            }
            InnerPrefillRouter::SimpleRouter(r) => {
                let worker_id = if update_states {
                    r.select_next_worker()
                } else {
                    r.peek_next_worker()
                }
                .ok_or_else(|| anyhow::anyhow!("No workers available for prefill"))?;
                Ok((worker_id, 0))
            }
        }
    }

    /// Check if disaggregated mode is currently active (prefill router activated)
    pub fn is_activated(&self) -> bool {
        self.prefill_router.get().is_some()
    }
546
547
}

impl Drop for PrefillRouter {
    /// Signals the background activation task to stop by cancelling the router's token.
    fn drop(&mut self) {
        // Borrow only the field needed for shutdown.
        let Self { cancel_token, .. } = self;
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        cancel_token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
570
        let (mut req, context) = request.into_parts();
571
        let request_id = context.id().to_string();
572
        let engine_ctx = context.context();
573

Yan Ru Pei's avatar
Yan Ru Pei committed
574
575
576
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

577
578
        // If prefill router is not activated (no prefill workers discovered),
        // this is aggregated mode — route directly to decode.
579
        if self.prefill_router.get().is_none() {
580
581
582
583
584
585
586
587
588
            return next.generate(context.map(|_| req)).await;
        }

        // Ensure tracker exists for routing decisions in disaggregated mode.
        // Create one if not provided by the upstream DeltaGenerator.
        if req.tracker.is_none() {
            req.tracker = Some(Arc::new(RequestTracker::new()));
        }
        let tracker = req.tracker.as_ref().unwrap();
589
        let prefill_phase_permit = tracker.set_phase(RequestPhase::Prefill).await;
590
591

        // Prepare prefill request with max_tokens = 1 (clone after tracker is set)
Yan Ru Pei's avatar
Yan Ru Pei committed
592
593
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
594

595
596
        // Try build_bootstrap_info optimization: if we can get bootstrap info upfront,
        // spawn prefill in background and proceed to decode immediately.
597
598
599
600
        let preselected_worker = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id);
601

602
603
604
605
        let prefill_result = async {
            if let Some((worker_id, dp_rank, bootstrap_info)) = self
                .build_bootstrap_info(&prefill_req, preselected_worker)
                .await
606
            {
607
608
609
610
611
612
613
614
                // Bootstrap optimization path: spawn prefill in background
                // We successfully used the peeked worker, so we must now advance the router state
                // to ensure the next request gets a different worker.
                if !self.router_mode.is_kv_routing()
                    && let Some(router) = self.prefill_router.get()
                {
                    router.select_next_worker();
                }
615

616
617
618
619
                let routing = prefill_req.routing_mut();
                routing.prefill_worker_id = Some(worker_id);
                routing.dp_rank = Some(dp_rank);
                prefill_req.bootstrap_info = Some(bootstrap_info.clone());
620

621
622
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
623

624
                // Pass phase permit to spawned task - it drops after first output (record_worker_full complete)
625
626
                // This allows set_phase(Decode) below to proceed only after prefill routing is done
                self.spawn_prefill_task(prefill_context, Some(worker_id), prefill_phase_permit);
627

628
629
630
631
                Ok((None, Some(worker_id), Some(bootstrap_info)))
            } else {
                // Original prefill path: wait for prefill to complete
                tracing::debug!("Using original prefill path");
632

633
634
635
                // Drop the phase permit before calling call_prefill - we wait for completion
                // so there's no race with set_phase(Decode) below
                drop(prefill_phase_permit);
636

637
638
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
639

640
641
642
643
644
                let result = self.call_prefill(prefill_context).await;

                result.map(|(result, worker_info)| {
                    (Some(result), worker_info.map(|(id, _)| id), None)
                })
645
646
647
            }
        }
        .await;
648
649
650
651
652
653
654
655
656
657
658
659

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
660
661
            Ok((maybe_prefill_result, _prefill_worker_id, bootstrap_info)) => {
                tracing::debug!("Prefill completed, proceeding to decode");
662

663
664
                // Set phase to Decode for the decode request.
                // In bootstrap path, this blocks until the spawned prefill task drops its permit
665
                // (after first output / record_worker_full completes), ensuring correct phase for routing.
666
                if let Some(ref tracker) = req.tracker {
667
668
                    let _decode_permit = tracker.set_phase(RequestPhase::Decode).await;
                    // Permit is dropped immediately - decode proceeds, no need to hold it
669
670
                }

671
                let mut decode_req = req;
672

673
                // Update request with prefill result
674
                if let Some(prefill_result) = maybe_prefill_result {
675
676
677
                    decode_req.prefill_result = Some(prefill_result);
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
678
679
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
680

681
682
683
684
685
                // Inject bootstrap_info for decode worker
                if let Some(info) = bootstrap_info {
                    decode_req.bootstrap_info = Some(info);
                }

686
687
688
689
                // Set router_config_override for decode:
                // - overlap_score_weight = 0 (no KV cache overlap scoring for decode)
                // - assume_kv_reuse = false (generate random hashes since decode workers
                //   may already have blocks cached from prefill transfer)
690
691
692
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
693
                    assume_kv_reuse: Some(false),
694
695
696
697
                    ..existing_override.unwrap_or_default()
                });

                // Map the modified request through with preserved context
698
                let decode_request = context.map(|_| decode_req);
699
700
                next.generate(decode_request).await
            }
701
            Err(PrefillError::NotActivated) => {
702
                if !self.decode_fallback {
703
                    tracing::error!(
704
                        "No prefill workers discovered yet and decode fallback is disabled. Failing request."
705
706
707
                    );
                    return Err(anyhow::anyhow!(PrefillError::NotActivated));
                }
708
                tracing::debug!("No prefill workers discovered yet, falling back to decode-only");
709
710
                next.generate(context.map(|_| req)).await
            }
711
            Err(e) => {
712
                if !self.decode_fallback {
713
714
                    tracing::error!(
                        error = %e,
715
                        "Remote prefill failed and decode fallback is disabled. Failing request."
716
717
718
                    );
                    return Err(anyhow::anyhow!(e));
                }
719
720
721
722
                tracing::warn!(
                    error = %e,
                    "Remote prefill failed, falling back to decode-only. This may impact performance in disaggregated deployments. Verify prefill workers are healthy and accessible."
                );
723
724
725
726
727
                next.generate(context.map(|_| req)).await
            }
        }
    }
}