prefill_router.rs 28 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
4
5
// SPDX-License-Identifier: Apache-2.0

use std::sync::{Arc, OnceLock};

6
use anyhow::Result;
7
use futures::StreamExt;
8
use rand::Rng;
9
use tokio::sync::{OwnedSemaphorePermit, oneshot};
10
use tokio_util::sync::CancellationToken;
11
use tracing::Instrument;
12
13
14
15

use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
16
17
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
18
    },
19
    protocols::{EndpointId, annotated::Annotated, maybe_error::MaybeError},
20
21
22
23
};

use crate::{
    discovery::ModelManager,
24
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride, protocols::BlockExtraInfo},
25
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
26
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
27
    protocols::common::timing::{RequestPhase, RequestTracker, WORKER_TYPE_PREFILL},
28
29
};

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/// Errors that can occur during prefill routing.
///
/// The `#[error(...)]` attributes supply the `Display` text of each variant (via `thiserror`).
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// The prefill router has not been activated yet (no inner router is installed).
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// Error during prefill execution; the payload is a human-readable description.
    ///
    /// TODO: Separate prefill worker error from prefill router error
    #[error("Prefill execution failed: {0}")]
    PrefillError(String),

    /// Disaggregated params were not found in the prefill response; the payload
    /// describes which part of the response was missing.
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

47
/// The inner router used by PrefillRouter
48
#[derive(Clone)]
49
50
51
52
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
53
54
    /// Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
    /// available in KV routing mode where the router has actual bookkeeping.
55
56
57
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

58
impl InnerPrefillRouter {
59
60
61
62
    /// Generate with optional direct routing to specific worker.
    /// For KvRouter, target_worker is ignored since prefill_worker_id is already set on the request.
    /// For SimpleRouter, target_worker triggers direct routing via router.direct().
    async fn generate_to_worker(
63
64
        &self,
        request: SingleIn<PreprocessedRequest>,
65
        target_worker: Option<u64>,
66
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
67
68
69
70
71
72
73
74
75
76
77
78
        match (self, target_worker) {
            // KvRouter: prefill_worker_id already set on request, KvPushRouter::select_worker uses it
            (InnerPrefillRouter::KvRouter(router), _) => router.generate(request).await,
            (InnerPrefillRouter::SimpleRouter(router), Some(worker_id)) => {
                router.direct(request, worker_id).await
            }
            (InnerPrefillRouter::SimpleRouter(router), None) => router.generate(request).await,
        }
    }

    /// Select next worker (for non-KV modes only)
    fn select_next_worker(&self) -> Option<u64> {
79
        match self {
80
81
            InnerPrefillRouter::SimpleRouter(router) => router.select_next_worker(),
            InnerPrefillRouter::KvRouter(_) => None,
82
83
84
85
        }
    }
}

86
87
88
/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
89
///
90
91
92
93
/// Modes:
/// - Query-only: `query_instance_id` annotation present → returns worker IDs without execution
/// - Pre-routed: `prefill_worker_id`/`decode_worker_id` set → routes to specified workers
/// - Normal: Worker IDs determined by router based on KV cache state
94
95
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
96
97
    model_manager: Arc<ModelManager>,
    endpoint_id: OnceLock<EndpointId>,
98
99
    cancel_token: CancellationToken,
    router_mode: RouterMode,
100
    enforce_disagg: bool,
101
102
    /// Model name used to look up the worker monitor for prefill client registration
    model_name: String,
103
104
    /// Namespace used to look up the correct WorkerSet's worker monitor
    namespace: String,
105
106
107
108
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
109
110
111
112
113
    pub fn disabled(
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        enforce_disagg: bool,
    ) -> Arc<Self> {
114
115
        Arc::new(Self {
            prefill_router: OnceLock::new(),
116
117
            model_manager,
            endpoint_id: OnceLock::new(),
118
119
            cancel_token: CancellationToken::new(),
            router_mode,
120
            enforce_disagg,
121
            model_name: String::new(), // Not used for disabled router
122
            namespace: String::new(),  // Not used for disabled router
123
124
125
        })
    }

126
    #[allow(clippy::too_many_arguments)]
127
128
129
130
131
132
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
133
        enforce_disagg: bool,
134
        model_name: String,
135
        namespace: String,
136
137
138
139
140
141
    ) -> Arc<Self> {
        let prefill_router = OnceLock::new();
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router,
142
143
            model_manager: model_manager.clone(),
            endpoint_id: OnceLock::new(),
144
145
            cancel_token: cancel_token.clone(),
            router_mode,
146
            enforce_disagg,
147
            model_name,
148
            namespace,
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
        });

        // Spawn background task to wait for activation
        let router_clone = router.clone();
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    if let Err(e) = router_clone.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

192
193
194
195
196
197
198
199
200
        // Store endpoint_id for later use in build_bootstrap_info
        let _ = self.endpoint_id.set(endpoint.id());

        // Start runtime config watcher for this endpoint (needed for get_disaggregated_endpoint)
        // This must be done before creating the router so bootstrap info is available
        model_manager
            .get_or_create_runtime_config_watcher(&endpoint)
            .await?;

201
        let inner_router = if self.router_mode.is_kv_routing() {
202
            // Create KV chooser using the endpoint (this is a prefill router)
203
            let kv_chooser = model_manager
204
205
206
207
208
209
                .kv_chooser_for(
                    &endpoint,
                    kv_cache_block_size,
                    kv_router_config,
                    WORKER_TYPE_PREFILL,
                )
210
211
                .await?;

212
213
214
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

215
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
216
217
218
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
219
220
221
                monitor.set_prefill_client(client.clone());
            }

222
            // Build the PushRouter for prefill with KV mode using the shared client
223
224
225
226
227
228
229
230
231
232
233
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
234
235
236
            // Create client for simple router
            let client = endpoint.client().await?;

237
            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
238
239
240
            if let Some(monitor) =
                model_manager.get_worker_monitor_for_namespace(&self.model_name, &self.namespace)
            {
241
242
243
                monitor.set_prefill_client(client.clone());
            }

244
            // Create simple push router with the frontend's router mode
245
246
            // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
            // available in KV routing mode where the router has actual bookkeeping.
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

269
270
    /// Build bootstrap_info for disaggregated serving
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
271
    /// Otherwise, query for the best worker (KV mode) or select next worker (non-KV modes).
272
    async fn build_bootstrap_info(
273
        &self,
274
        req: &PreprocessedRequest,
275
        preselected_worker: Option<u64>,
276
    ) -> Option<(u64, u32, BootstrapInfo)> {
277
        let endpoint_id = self.endpoint_id.get()?;
278
        let _prefill_router = self.prefill_router.get()?;
279

280
        // Worker selection
281
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
282
            let dp_rank = req.routing.as_ref().and_then(|r| r.dp_rank).unwrap_or(0);
283
284
285
286
287
288
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
289
290
        } else {
            // Use shared worker selection logic (update_states=false for peek behavior)
291
            // Extract LORA name and priority jump from routing hints
292
            let lora_name = req.routing.as_ref().and_then(|r| r.lora_name.clone());
293
294
295
296
297
            let priority_jump = req
                .routing
                .as_ref()
                .and_then(|r| r.priority_jump)
                .unwrap_or(0.0);
298
            let (routing_token_ids, block_mm_infos) = req.block_mm_routing_info();
299
            match self
300
301
302
303
304
305
306
                .query_prefill_worker(
                    routing_token_ids,
                    block_mm_infos,
                    false,
                    lora_name,
                    priority_jump,
                )
307
                .await
308
            {
309
                Ok((worker_id, dp_rank)) => (worker_id, dp_rank),
310
311
                Err(_) => return None,
            }
312
313
        };

314
315
316
317
        // Get bootstrap info from ModelManager (works for ANY mode)
        let endpoint = self
            .model_manager
            .get_disaggregated_endpoint(endpoint_id, worker_id)?;
318
319
320
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

321
        let bootstrap_room: u64 = rand::rng().random();
322
323
324
325
326
327
328

        tracing::info!(
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
329
            router_mode = ?self.router_mode,
330
331
332
333
334
335
336
337
338
339
340
341
342
343
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

344
345
346
347
348
349
    /// Execute prefill with the given router and extract structured result.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// If `phase_permit` is provided, it is dropped after the first output is received,
    /// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap
350
    /// optimization path to ensure `record_worker_full` completes before the phase changes.
351
352
    ///
    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
353
354
355
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
356
        target_worker: Option<u64>,
357
        phase_permit: Option<OwnedSemaphorePermit>,
358
    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
359
360
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
361
            .generate_to_worker(request, target_worker)
362
363
364
            .await
            .map_err(|e| PrefillError::PrefillError(e.to_string()))?;

365
        // Drop phase permit now - routing is complete, record_worker_full was called in select_worker.
366
367
368
        // This unblocks set_phase(Decode) in the main task without waiting for prefill output.
        drop(phase_permit);

369
        let Some(first_output) = prefill_response.next().await else {
370
371
372
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
            ));
373
374
        };

375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
391

392
        if let Some(err) = first_output.err() {
393
394
395
            return Err(PrefillError::PrefillError(format!(
                "Prefill router returned error in output: {err:?}"
            )));
396
397
398
        }

        let Some(output) = &first_output.data else {
399
400
401
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
402
403
404
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
405
406
407
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
408
409
        };

410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
        // Extract prefill worker ID and dp_rank from disaggregated_params
        let prefill_worker_info =
            disaggregated_params
                .get("worker_id")
                .and_then(|worker_id_json| {
                    let worker_id = worker_id_json
                        .get("prefill_worker_id")
                        .and_then(|v| v.as_u64())?;
                    let dp_rank = worker_id_json
                        .get("prefill_dp_rank")
                        .and_then(|v| v.as_u64())
                        .map(|r| r as u32)
                        .unwrap_or(0);
                    Some((worker_id, dp_rank))
                });
425
426
427
428
429
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
430
            prefill_worker_info,
431
432
433
        ))
    }

434
435
436
437
438
439
    /// Spawn prefill as a background task.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// The `phase_permit` is passed to the spawned task and dropped after the first output,
    /// allowing the main task's `set_phase(Decode)` to proceed.
440
441
442
443
    fn spawn_prefill_task(
        &self,
        prefill_request: SingleIn<PreprocessedRequest>,
        target_worker: Option<u64>,
444
        phase_permit: OwnedSemaphorePermit,
445
    ) {
446
        let router = self.prefill_router.get().cloned();
447
448
449
450
451
452
453
454
455
456
457
        // Capture current span to propagate trace context to the spawned task
        let span = tracing::Span::current();

        tokio::spawn(
            async move {
                match Self::execute_prefill(
                    router,
                    prefill_request,
                    target_worker,
                    Some(phase_permit),
                )
458
                .await
459
460
461
462
463
464
465
                {
                    Ok(_) => {
                        tracing::debug!("Prefill background task completed");
                    }
                    Err(e) => {
                        tracing::warn!("Prefill background task error: {e:?}");
                    }
466
467
                }
            }
468
469
            .instrument(span),
        );
470
471
    }

472
    /// Call the prefill router and extract structured prefill result, worker ID, and dp_rank.
473
474
475
    ///
    /// This is the synchronous prefill path - we wait for prefill to complete before proceeding.
    /// No phase permit is needed since `record_worker` completes before we return.
476
477
    ///
    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
478
479
480
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
481
    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
482
        // For call_prefill path, routing is handled by the router itself (no direct routing needed)
483
484
        // No phase permit needed - we wait for completion before changing phase
        Self::execute_prefill(self.prefill_router.get().cloned(), request, None, None).await
485
    }
486
487
488
489
490
491
492
493
494

    /// Query the best prefill worker without executing a request.
    /// Returns (worker_id, dp_rank).
    ///
    /// This is the shared worker selection logic used by both `build_bootstrap_info`
    /// and `query_route`.
    pub async fn query_prefill_worker(
        &self,
        token_ids: &[u32],
495
        block_mm_infos: Option<&[Option<BlockExtraInfo>]>,
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
        update_states: bool,
        lora_name: Option<String>,
        priority_jump: f64,
    ) -> Result<(u64, u32)> {
        let prefill_router = self
            .prefill_router
            .get()
            .ok_or_else(|| anyhow::anyhow!(PrefillError::NotActivated))?;

        match prefill_router {
            InnerPrefillRouter::KvRouter(r) => {
                let (worker, _overlap) = r
                    .chooser
                    .find_best_match(
                        None,
                        token_ids,
512
                        block_mm_infos,
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
                        None,
                        update_states,
                        lora_name,
                        priority_jump,
                    )
                    .await?;
                Ok((worker.worker_id, worker.dp_rank))
            }
            InnerPrefillRouter::SimpleRouter(r) => {
                let worker_id = if update_states {
                    r.select_next_worker()
                } else {
                    r.peek_next_worker()
                }
                .ok_or_else(|| anyhow::anyhow!("No workers available for prefill"))?;
                Ok((worker_id, 0))
            }
        }
    }

    /// Report whether disaggregated mode is active, i.e. whether an inner prefill
    /// router has been installed.
    pub fn is_activated(&self) -> bool {
        let installed = self.prefill_router.get();
        installed.is_some()
    }
537
538
}

539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
impl Drop for PrefillRouter {
    /// Cancel the router's token on drop so the background activation task spawned by
    /// `new` stops waiting.
    fn drop(&mut self) {
        let Self { cancel_token, .. } = self;
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        cancel_token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
561
        let (mut req, context) = request.into_parts();
562
        let request_id = context.id().to_string();
563
        let engine_ctx = context.context();
564

Yan Ru Pei's avatar
Yan Ru Pei committed
565
566
567
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

568
569
        // If prefill router is not activated, skip directly to decode
        if self.prefill_router.get().is_none() {
570
571
572
573
574
575
576
577
578
579
580
581
            if self.enforce_disagg {
                return Err(anyhow::anyhow!(PrefillError::NotActivated));
            }
            return next.generate(context.map(|_| req)).await;
        }

        // Ensure tracker exists for routing decisions in disaggregated mode.
        // Create one if not provided by the upstream DeltaGenerator.
        if req.tracker.is_none() {
            req.tracker = Some(Arc::new(RequestTracker::new()));
        }
        let tracker = req.tracker.as_ref().unwrap();
582
        let prefill_phase_permit = tracker.set_phase(RequestPhase::Prefill).await;
583
584

        // Prepare prefill request with max_tokens = 1 (clone after tracker is set)
Yan Ru Pei's avatar
Yan Ru Pei committed
585
586
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
587

588
589
        // Try build_bootstrap_info optimization: if we can get bootstrap info upfront,
        // spawn prefill in background and proceed to decode immediately.
590
591
592
593
        let preselected_worker = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id);
594

595
596
597
598
        let prefill_result = async {
            if let Some((worker_id, dp_rank, bootstrap_info)) = self
                .build_bootstrap_info(&prefill_req, preselected_worker)
                .await
599
            {
600
601
602
603
604
605
606
607
                // Bootstrap optimization path: spawn prefill in background
                // We successfully used the peeked worker, so we must now advance the router state
                // to ensure the next request gets a different worker.
                if !self.router_mode.is_kv_routing()
                    && let Some(router) = self.prefill_router.get()
                {
                    router.select_next_worker();
                }
608

609
610
611
612
                let routing = prefill_req.routing_mut();
                routing.prefill_worker_id = Some(worker_id);
                routing.dp_rank = Some(dp_rank);
                prefill_req.bootstrap_info = Some(bootstrap_info.clone());
613

614
615
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
616

617
                // Pass phase permit to spawned task - it drops after first output (record_worker_full complete)
618
619
                // This allows set_phase(Decode) below to proceed only after prefill routing is done
                self.spawn_prefill_task(prefill_context, Some(worker_id), prefill_phase_permit);
620

621
622
623
624
                Ok((None, Some(worker_id), Some(bootstrap_info)))
            } else {
                // Original prefill path: wait for prefill to complete
                tracing::debug!("Using original prefill path");
625

626
627
628
                // Drop the phase permit before calling call_prefill - we wait for completion
                // so there's no race with set_phase(Decode) below
                drop(prefill_phase_permit);
629

630
631
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
632

633
634
635
636
637
                let result = self.call_prefill(prefill_context).await;

                result.map(|(result, worker_info)| {
                    (Some(result), worker_info.map(|(id, _)| id), None)
                })
638
639
640
            }
        }
        .await;
641
642
643
644
645
646
647
648
649
650
651
652

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
653
654
            Ok((maybe_prefill_result, _prefill_worker_id, bootstrap_info)) => {
                tracing::debug!("Prefill completed, proceeding to decode");
655

656
657
                // Set phase to Decode for the decode request.
                // In bootstrap path, this blocks until the spawned prefill task drops its permit
658
                // (after first output / record_worker_full completes), ensuring correct phase for routing.
659
                if let Some(ref tracker) = req.tracker {
660
661
                    let _decode_permit = tracker.set_phase(RequestPhase::Decode).await;
                    // Permit is dropped immediately - decode proceeds, no need to hold it
662
663
                }

664
                let mut decode_req = req;
665

666
                // Update request with prefill result
667
                if let Some(prefill_result) = maybe_prefill_result {
668
669
670
                    decode_req.prefill_result = Some(prefill_result);
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
671
672
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
673

674
675
676
677
678
                // Inject bootstrap_info for decode worker
                if let Some(info) = bootstrap_info {
                    decode_req.bootstrap_info = Some(info);
                }

679
680
681
682
                // Set router_config_override for decode:
                // - overlap_score_weight = 0 (no KV cache overlap scoring for decode)
                // - assume_kv_reuse = false (generate random hashes since decode workers
                //   may already have blocks cached from prefill transfer)
683
684
685
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
686
                    assume_kv_reuse: Some(false),
687
688
689
690
                    ..existing_override.unwrap_or_default()
                });

                // Map the modified request through with preserved context
691
                let decode_request = context.map(|_| decode_req);
692
693
                next.generate(decode_request).await
            }
694
695
696
697
698
699
700
701
702
703
            Err(PrefillError::NotActivated) => {
                if self.enforce_disagg {
                    tracing::error!(
                        "Prefill router not activated, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(PrefillError::NotActivated));
                }
                tracing::debug!("Prefill router not activated, falling back to decode-only");
                next.generate(context.map(|_| req)).await
            }
704
            Err(e) => {
705
706
707
708
709
710
711
                if self.enforce_disagg {
                    tracing::error!(
                        error = %e,
                        "Remote prefill failed, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(e));
                }
712
713
714
715
                tracing::warn!(
                    error = %e,
                    "Remote prefill failed, falling back to decode-only. This may impact performance in disaggregated deployments. Verify prefill workers are healthy and accessible."
                );
716
717
718
719
720
                next.generate(context.map(|_| req)).await
            }
        }
    }
}