prefill_router.rs 25.1 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
4
5
// SPDX-License-Identifier: Apache-2.0

use std::sync::{Arc, OnceLock};

6
use anyhow::Result;
7
use futures::StreamExt;
8
use rand::Rng;
9
10
11
12
13
14
use tokio::sync::oneshot;
use tokio_util::sync::CancellationToken;

use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
15
16
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
17
    },
18
    protocols::{EndpointId, annotated::Annotated, maybe_error::MaybeError},
19
20
21
22
};

use crate::{
    discovery::ModelManager,
23
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride},
24
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
25
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
26
    protocols::common::timing::{RequestPhase, RequestTracker},
27
    protocols::openai::nvext::WorkerIdInfo,
28
29
};

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/// Errors that can occur during prefill routing.
///
/// The `#[error(...)]` attribute on each variant supplies its `Display` text.
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// Prefill router has not been activated yet (no inner router is installed).
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// Error during prefill execution; the payload is the failure rendered as text.
    /// TODO: Separate prefill worker error from prefill router error
    #[error("Prefill execution failed: {0}")]
    PrefillError(String),

    /// Disaggregated params not found in prefill response; the payload names
    /// which part of the response was missing.
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

47
/// The inner router used by PrefillRouter
48
#[derive(Clone)]
49
50
51
52
53
54
55
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

56
impl InnerPrefillRouter {
57
58
59
60
    /// Generate with optional direct routing to specific worker.
    /// For KvRouter, target_worker is ignored since prefill_worker_id is already set on the request.
    /// For SimpleRouter, target_worker triggers direct routing via router.direct().
    async fn generate_to_worker(
61
62
        &self,
        request: SingleIn<PreprocessedRequest>,
63
        target_worker: Option<u64>,
64
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
65
66
67
68
69
70
71
72
73
74
75
76
        match (self, target_worker) {
            // KvRouter: prefill_worker_id already set on request, KvPushRouter::select_worker uses it
            (InnerPrefillRouter::KvRouter(router), _) => router.generate(request).await,
            (InnerPrefillRouter::SimpleRouter(router), Some(worker_id)) => {
                router.direct(request, worker_id).await
            }
            (InnerPrefillRouter::SimpleRouter(router), None) => router.generate(request).await,
        }
    }

    /// Select next worker (for non-KV modes only)
    fn select_next_worker(&self) -> Option<u64> {
77
        match self {
78
79
            InnerPrefillRouter::SimpleRouter(router) => router.select_next_worker(),
            InnerPrefillRouter::KvRouter(_) => None,
80
81
82
83
        }
    }
}

84
85
86
/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
87
88
89
///
/// Supports regular Dynamo and GAIE integrated mode via query_instance_id state machine:
/// - GAIE Stage 1: query_instance_id transitions "" -> "prefill" -> "decode", returns only worker IDs
90
/// - GAIE Stage 2: routing.prefill_worker_id/routing.decode_worker_id are set, full execution with specified workers
91
/// - Non-GAIE: like GAIE Stage 2 but the worker ids have to be determined.
92
93
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
94
95
    model_manager: Arc<ModelManager>,
    endpoint_id: OnceLock<EndpointId>,
96
97
    cancel_token: CancellationToken,
    router_mode: RouterMode,
98
    enforce_disagg: bool,
99
100
101
102
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
103
104
105
106
107
    pub fn disabled(
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        enforce_disagg: bool,
    ) -> Arc<Self> {
108
109
        Arc::new(Self {
            prefill_router: OnceLock::new(),
110
111
            model_manager,
            endpoint_id: OnceLock::new(),
112
113
            cancel_token: CancellationToken::new(),
            router_mode,
114
            enforce_disagg,
115
116
117
118
119
120
121
122
123
        })
    }

    /// Create a prefill router that activates once an endpoint arrives on `activation_rx`.
    ///
    /// A background task waits for either the endpoint or cancellation. On
    /// receipt of an endpoint it calls `activate`, which installs the inner
    /// router; activation failures are logged, not propagated.
    ///
    /// The task only holds a `Weak` reference to the router. A strong `Arc`
    /// would keep the router alive for as long as activation is pending, which
    /// would prevent `Drop` (and therefore the cancellation it triggers) from
    /// ever running in that window.
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
        enforce_disagg: bool,
    ) -> Arc<Self> {
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router: OnceLock::new(),
            model_manager: model_manager.clone(),
            endpoint_id: OnceLock::new(),
            cancel_token: cancel_token.clone(),
            router_mode,
            enforce_disagg,
        });

        // Spawn background task to wait for activation
        let weak_router = Arc::downgrade(&router);
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    // The router may have been dropped while we were waiting.
                    let Some(router) = weak_router.upgrade() else {
                        tracing::debug!("Prefill router dropped before activation, skipping");
                        return;
                    };

                    if let Err(e) = router.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

179
180
181
182
183
184
185
186
187
        // Store endpoint_id for later use in build_bootstrap_info
        let _ = self.endpoint_id.set(endpoint.id());

        // Start runtime config watcher for this endpoint (needed for get_disaggregated_endpoint)
        // This must be done before creating the router so bootstrap info is available
        model_manager
            .get_or_create_runtime_config_watcher(&endpoint)
            .await?;

188
        let inner_router = if self.router_mode.is_kv_routing() {
189
            // Create KV chooser using the endpoint
190
            let kv_chooser = model_manager
191
                .kv_chooser_for(&endpoint, kv_cache_block_size, kv_router_config)
192
193
                .await?;

194
195
196
197
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

            // Build the PushRouter for prefill with KV mode using the shared client
198
199
200
201
202
203
204
205
206
207
208
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
209
210
211
            // Create client for simple router
            let client = endpoint.client().await?;

212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
            // Create simple push router with the frontend's router mode
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

235
236
237
238
239
    /// Draw a random `u64` to use as the bootstrap room id for disaggregated serving.
    fn generate_bootstrap_room() -> u64 {
        let mut rng = rand::rng();
        rng.random::<u64>()
    }

240
241
    /// Build bootstrap_info for disaggregated serving
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
242
    /// Otherwise, query for the best worker (KV mode) or select next worker (non-KV modes).
243
    async fn build_bootstrap_info(
244
        &self,
245
        req: &PreprocessedRequest,
246
        preselected_worker: Option<u64>,
247
    ) -> Option<(u64, u32, BootstrapInfo)> {
248
        let endpoint_id = self.endpoint_id.get()?;
249
250
        let prefill_router = self.prefill_router.get()?;

251
        // Worker selection
252
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
253
            // GAIE Stage 2: use pre-selected worker
254
            let dp_rank = req.routing.as_ref().and_then(|r| r.dp_rank).unwrap_or(0);
255
256
257
258
259
260
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
261
262
263
264
265
266
        } else if self.router_mode.is_kv_routing() {
            // KV mode: use find_best_match
            let kv_router = match prefill_router {
                InnerPrefillRouter::KvRouter(r) => r,
                _ => return None,
            };
267
268
269
270
271
272
273
274
            match kv_router
                .chooser
                .find_best_match(None, &req.token_ids, None, false)
                .await
            {
                Ok((worker, _overlap)) => (worker.worker_id, worker.dp_rank),
                Err(_) => return None,
            }
275
276
277
278
        } else {
            // Non-KV mode: use PushRouter's stateful selection
            let worker_id = prefill_router.select_next_worker()?;
            (worker_id, 0)
279
280
        };

281
282
283
284
        // Get bootstrap info from ModelManager (works for ANY mode)
        let endpoint = self
            .model_manager
            .get_disaggregated_endpoint(endpoint_id, worker_id)?;
285
286
287
288
289
290
291
292
293
294
295
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

        let bootstrap_room = Self::generate_bootstrap_room();

        tracing::info!(
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
296
            router_mode = ?self.router_mode,
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

    /// Execute prefill with the given router and extract structured result
312
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization)
313
314
315
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
316
        target_worker: Option<u64>,
317
318
319
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
320
            .generate_to_worker(request, target_worker)
321
322
323
            .await
            .map_err(|e| PrefillError::PrefillError(e.to_string()))?;

324
        let Some(first_output) = prefill_response.next().await else {
325
326
327
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
            ));
328
329
        };

330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
346

347
        if let Some(err) = first_output.err() {
348
349
350
            return Err(PrefillError::PrefillError(format!(
                "Prefill router returned error in output: {err:?}"
            )));
351
352
353
        }

        let Some(output) = &first_output.data else {
354
355
356
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
357
358
359
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
360
361
362
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
363
364
        };

365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
        // Extract prefill worker ID from disaggregated_params
        let prefill_worker_id = disaggregated_params
            .get("worker_id")
            .and_then(|worker_id_json| {
                worker_id_json
                    .get("prefill_worker_id")
                    .and_then(|v| v.as_u64())
            });
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
            prefill_worker_id,
        ))
    }

    /// Spawn prefill as a background task
383
384
385
386
387
388
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization)
    fn spawn_prefill_task(
        &self,
        prefill_request: SingleIn<PreprocessedRequest>,
        target_worker: Option<u64>,
    ) {
389
390
391
        let router = self.prefill_router.get().cloned();

        tokio::spawn(async move {
392
            match Self::execute_prefill(router, prefill_request, target_worker).await {
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
                Ok(_) => {
                    tracing::debug!("Prefill background task completed");
                }
                Err(e) => {
                    tracing::warn!("Prefill background task error: {e:?}");
                }
            }
        });
    }

    /// Call the prefill router and extract structured prefill result and worker ID
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
408
409
        // For call_prefill path, routing is handled by the router itself (no direct routing needed)
        Self::execute_prefill(self.prefill_router.get().cloned(), request, None).await
410
411
412
    }
}

413
414
415
416
417
418
419
420
421
422
423
424
425
/// GAIE helper functions for preparing prefill requests
impl PrefillRouter {
    /// Prepare prefill request for GAIE flows
    /// - Stage 1: Sets query_instance_id:prefill annotation
    /// - Stage 2: Sets backend_instance_id to target prefill worker
    fn prepare_prefill_for_gaie(prefill_req: &mut PreprocessedRequest, is_gaie_stage1: bool) {
        if is_gaie_stage1 {
            // GAIE Stage 1: Set query_instance_id to "prefill" for prefill worker selection
            prefill_req
                .annotations
                .retain(|a| !a.starts_with("query_instance_id"));
            prefill_req
                .annotations
426
                .push(format!("query_instance_id:{}", RequestPhase::Prefill));
427
428
429
430
431
        } else if let Some(prefill_worker_id) = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id)
        {
432
433
            // GAIE Stage 2: Route to pre-selected prefill worker from the stage 1
            tracing::debug!(
434
                prefill_worker_id = prefill_worker_id,
435
436
                "GAIE Stage 2: Routing prefill to pre-selected worker"
            );
437
            prefill_req.routing_mut().backend_instance_id = Some(prefill_worker_id);
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
        }
    }

    /// Prepare decode request for GAIE Stage 1
    /// Extracts prefill_worker_id from prefill result and sets decode annotations
    fn prepare_decode_for_gaie_stage1(
        decode_req: &mut PreprocessedRequest,
        prefill_result: &PrefillResult,
    ) {
        let prefill_worker_id = prefill_result
            .disaggregated_params
            .get("worker_id")
            .and_then(|v| serde_json::from_value::<WorkerIdInfo>(v.clone()).ok())
            .and_then(|info| info.prefill_worker_id);

        if let Some(worker_id) = prefill_worker_id {
            decode_req
                .annotations
                .retain(|a| !a.starts_with("query_instance_id"));
            decode_req
                .annotations
459
                .push(format!("query_instance_id:{}", RequestPhase::Decode));
460
461
462
463
464
465
466
            decode_req
                .annotations
                .push(format!("prefill_worker_id:{worker_id}"));
        }
    }
}

467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
/// Dropping the router cancels its token, which ends the background activation
/// task if that task is still waiting.
impl Drop for PrefillRouter {
    fn drop(&mut self) {
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        let token = &self.cancel_token;
        token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
489
        let (mut req, context) = request.into_parts();
490
        let request_id = context.id().to_string();
491
        let engine_ctx = context.context();
492

493
494
495
496
497
498
        // GAIE Stage 1: the presence of the empty query_instance_id signals query-only mode
        // State machine: "" -> "prefill" -> "decode" (disagg) OR "" -> aggregated worker (agg fallback)
        let is_gaie_stage1 = req
            .get_annotation_value("query_instance_id")
            .is_some_and(|s| s.is_empty());

Yan Ru Pei's avatar
Yan Ru Pei committed
499
500
501
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
        // GAIE Stage 1: Check if prefill router is activated - if not, skip to decode
        if is_gaie_stage1 && self.prefill_router.get().is_none() {
            tracing::debug!("GAIE Stage 1: Prefill router not activated, skipping to decode");
            if self.enforce_disagg {
                return Err(anyhow::anyhow!(PrefillError::NotActivated));
            }
            // Fall back to decode-only
            return next.generate(context.map(|_| req)).await;
        }

        // Ensure tracker exists for routing decisions in disaggregated mode.
        // Create one if not provided by the upstream DeltaGenerator.
        if req.tracker.is_none() {
            req.tracker = Some(Arc::new(RequestTracker::new()));
        }
        let tracker = req.tracker.as_ref().unwrap();
        tracker.set_phase(RequestPhase::Prefill);
        tracker.record_prefill_start();

        // Prepare prefill request with max_tokens = 1 (clone after tracker is set)
Yan Ru Pei's avatar
Yan Ru Pei committed
522
523
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
524

525
526
        // Prepare prefill request for GAIE flows (Stage 1 or Stage 2)
        Self::prepare_prefill_for_gaie(&mut prefill_req, is_gaie_stage1);
527

528
        // Try build_bootstrap_info optimization (skip for GAIE Stage 1 which needs query-only flow)
529
530
531
532
533
        // For GAIE Stage 2, use prefill_worker_id if provided
        let preselected_worker = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id);
534
535
536

        let prefill_result = if !is_gaie_stage1
            && let Some((worker_id, dp_rank, bootstrap_info)) = self
537
538
                .build_bootstrap_info(&prefill_req, preselected_worker)
                .await
539
540
541
542
543
544
545
        {
            // Bootstrap optimization path: spawn prefill in background
            let routing = prefill_req.routing_mut();
            routing.prefill_worker_id = Some(worker_id);
            routing.backend_instance_id = Some(worker_id); // Route prefill to the SAME worker we got bootstrap_info from
            routing.dp_rank = Some(dp_rank);
            prefill_req.bootstrap_info = Some(bootstrap_info.clone());
546

547
548
            let prefill_context = Context::with_id(prefill_req, request_id.clone());
            engine_ctx.link_child(prefill_context.context());
549

550
            self.spawn_prefill_task(prefill_context, Some(worker_id));
551

552
            Ok((None, Some(worker_id), Some(bootstrap_info)))
553
        } else {
554
555
556
557
558
            // Original prefill path: wait for prefill to complete
            tracing::debug!(
                is_gaie_stage1 = is_gaie_stage1,
                "Using original prefill path"
            );
559

560
561
            let prefill_context = Context::with_id(prefill_req, request_id.clone());
            engine_ctx.link_child(prefill_context.context());
562

563
564
565
            self.call_prefill(prefill_context)
                .await
                .map(|(result, worker_id)| (Some(result), worker_id, None))
566
        };
567
568
569
570
571
572
573
574
575
576
577
578

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
579
580
            Ok((maybe_prefill_result, _prefill_worker_id, bootstrap_info)) => {
                tracing::debug!("Prefill completed, proceeding to decode");
581

582
583
584
585
586
                // Set phase to Decode for the decode request
                if let Some(ref tracker) = req.tracker {
                    tracker.set_phase(RequestPhase::Decode);
                }

587
                let mut decode_req = req;
588

589
590
591
592
593
594
595
                // Update request with prefill result
                if is_gaie_stage1 {
                    if let Some(ref prefill_result) = maybe_prefill_result {
                        Self::prepare_decode_for_gaie_stage1(&mut decode_req, prefill_result);
                    }
                } else if let Some(prefill_result) = maybe_prefill_result {
                    // Normal or GAIE Stage 2: Set prefill_result for decode
596
597
598
                    decode_req.prefill_result = Some(prefill_result);
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
599
600
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
601

602
603
604
605
606
                // Inject bootstrap_info for decode worker
                if let Some(info) = bootstrap_info {
                    decode_req.bootstrap_info = Some(info);
                }

607
608
609
610
611
612
613
                // Set router_config_override for decode: overlap_score_weight = 0
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
                    ..existing_override.unwrap_or_default()
                });

614
                // GAIE Stage 2: Route to pre-selected decode worker if specified
615
616
617
618
                if let Some(decode_worker_id) =
                    decode_req.routing.as_ref().and_then(|r| r.decode_worker_id)
                {
                    decode_req.routing_mut().backend_instance_id = Some(decode_worker_id);
619
620
621
622
623
624
                    tracing::debug!(
                        decode_worker_id = decode_worker_id,
                        "GAIE Stage 2: Routing decode to pre-selected worker"
                    );
                }

625
                // Map the modified request through with preserved context
626
                let decode_request = context.map(|_| decode_req);
627
628
                next.generate(decode_request).await
            }
629
630
631
632
633
634
635
636
637
638
            Err(PrefillError::NotActivated) => {
                if self.enforce_disagg {
                    tracing::error!(
                        "Prefill router not activated, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(PrefillError::NotActivated));
                }
                tracing::debug!("Prefill router not activated, falling back to decode-only");
                next.generate(context.map(|_| req)).await
            }
639
            Err(e) => {
640
641
642
643
644
645
646
                if self.enforce_disagg {
                    tracing::error!(
                        error = %e,
                        "Remote prefill failed, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(e));
                }
647
648
649
650
                tracing::warn!(
                    error = %e,
                    "Remote prefill failed, falling back to decode-only. This may impact performance in disaggregated deployments. Verify prefill workers are healthy and accessible."
                );
651
652
653
654
655
                next.generate(context.map(|_| req)).await
            }
        }
    }
}