prefill_router.rs 23.8 KB
Newer Older
1
2
3
4
5
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::sync::{Arc, OnceLock};

6
use anyhow::Result;
7
use futures::StreamExt;
8
use rand::Rng;
9
10
11
12
13
14
use tokio::sync::oneshot;
use tokio_util::sync::CancellationToken;

use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
15
16
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
17
18
19
20
21
22
    },
    protocols::{annotated::Annotated, maybe_error::MaybeError},
};

use crate::{
    discovery::ModelManager,
23
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride},
24
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
25
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
26
    protocols::common::timing::RequestPhase,
27
    protocols::openai::nvext::WorkerIdInfo,
28
29
};

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/// Errors that can occur during prefill routing
///
/// `NotActivated` is produced when no inner prefill router has been set yet; the other
/// variants carry a human-readable message describing what went wrong.
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// Prefill router has not been activated yet
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// Error during prefill execution
    /// (the inner router failed, the stream ended without output, or the first output
    /// carried an error)
    /// TODO: Separate prefill worker error from prefill router error
    #[error("Prefill execution failed: {0}")]
    PrefillError(String),

    /// Disaggregated params not found in prefill response
    /// (the first output had no data, or its data had no `disaggregated_params`)
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

47
/// The inner router used by PrefillRouter
48
#[derive(Clone)]
49
50
51
52
53
54
55
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

56
57
58
59
60
61
62
63
64
65
66
67
68
impl InnerPrefillRouter {
    /// Forward a prefill request to whichever concrete router this value wraps and
    /// return the resulting output stream (or the router's error).
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        match self {
            Self::KvRouter(kv) => kv.generate(request).await,
            Self::SimpleRouter(simple) => simple.generate(request).await,
        }
    }
}

69
70
71
/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
72
73
74
///
/// Supports regular Dynamo and GAIE integrated mode via query_instance_id state machine:
/// - GAIE Stage 1: query_instance_id transitions "" -> "prefill" -> "decode", returns only worker IDs
75
/// - GAIE Stage 2: routing.prefill_worker_id/routing.decode_worker_id are set, full execution with specified workers
76
/// - Non-GAIE: like GAIE Stage 2 but the worker ids have to be determined.
77
78
79
80
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
    cancel_token: CancellationToken,
    router_mode: RouterMode,
81
    enforce_disagg: bool,
82
83
84
85
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
86
    pub fn disabled(router_mode: RouterMode, enforce_disagg: bool) -> Arc<Self> {
87
88
89
90
        Arc::new(Self {
            prefill_router: OnceLock::new(),
            cancel_token: CancellationToken::new(),
            router_mode,
91
            enforce_disagg,
92
93
94
95
96
97
98
99
100
        })
    }

    /// Create a prefill router that activates once an endpoint arrives on `activation_rx`.
    ///
    /// A background task is spawned that waits for either the endpoint or cancellation of
    /// the router's token (which happens when the router is dropped). When the endpoint
    /// arrives, the inner router is built via `activate`; activation failures are logged
    /// and leave the router unactivated.
    ///
    /// The remaining arguments are stored (`router_mode`, `enforce_disagg`) or forwarded to
    /// `activate` (`model_manager`, `kv_cache_block_size`, `kv_router_config`).
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
        enforce_disagg: bool,
    ) -> Arc<Self> {
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router: OnceLock::new(),
            cancel_token: cancel_token.clone(),
            router_mode,
            enforce_disagg,
        });

        // Spawn background task to wait for activation.
        // The task only holds a weak reference: a strong `Arc` here would keep the router
        // alive for as long as the task is pending, so `Drop` (which cancels this very task)
        // could never run while the activation sender is still alive.
        let weak_router = Arc::downgrade(&router);
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    // The router may have been dropped in the meantime; nothing to activate then.
                    let Some(router) = weak_router.upgrade() else {
                        tracing::debug!("Prefill router dropped before activation");
                        return;
                    };

                    if let Err(e) = router.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

        let inner_router = if self.router_mode.is_kv_routing() {
155
            // Create KV chooser using the endpoint
156
            let kv_chooser = model_manager
157
                .kv_chooser_for(&endpoint, kv_cache_block_size, kv_router_config)
158
159
                .await?;

160
161
162
163
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

            // Build the PushRouter for prefill with KV mode using the shared client
164
165
166
167
168
169
170
171
172
173
174
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
175
176
177
            // Create client for simple router
            let client = endpoint.client().await?;

178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
            // Create simple push router with the frontend's router mode
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

201
202
203
204
205
    /// Produce a random `u64` used as the bootstrap room ID for disaggregated serving.
    /// The value is drawn from the `rand` crate's default generator.
    fn generate_bootstrap_room() -> u64 {
        let mut generator = rand::rng();
        generator.random::<u64>()
    }

206
207
208
    /// Build bootstrap_info for disaggregated serving
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
    /// Otherwise, query for the best worker.
209
    async fn build_bootstrap_info(
210
        &self,
211
        req: &PreprocessedRequest,
212
        preselected_worker: Option<u64>,
213
214
215
216
217
218
219
    ) -> Option<(u64, u32, BootstrapInfo)> {
        let prefill_router = self.prefill_router.get()?;

        // Only works with KvRouter
        let kv_router = match prefill_router {
            InnerPrefillRouter::KvRouter(r) => r,
            InnerPrefillRouter::SimpleRouter(_) => return None,
220
221
        };

222
223
        // Use pre-selected worker (GAIE Stage 2) or query for best worker
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
224
            let dp_rank = req.routing.as_ref().and_then(|r| r.dp_rank).unwrap_or(0);
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
        } else {
            match kv_router
                .chooser
                .find_best_match(None, &req.token_ids, None, false)
                .await
            {
                Ok((worker, _overlap)) => (worker.worker_id, worker.dp_rank),
                Err(_) => return None,
            }
240
241
        };

242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
        // Look up bootstrap endpoint from discovery
        let endpoint = kv_router
            .chooser
            .get_disaggregated_endpoint(worker_id)
            .await?;
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

        let bootstrap_room = Self::generate_bootstrap_room();

        tracing::info!(
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

    /// Execute prefill with the given router and extract structured result
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
            .generate(request)
            .await
            .map_err(|e| PrefillError::PrefillError(e.to_string()))?;

283
        let Some(first_output) = prefill_response.next().await else {
284
285
286
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
            ));
287
288
        };

289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
305

306
        if let Some(err) = first_output.err() {
307
308
309
            return Err(PrefillError::PrefillError(format!(
                "Prefill router returned error in output: {err:?}"
            )));
310
311
312
        }

        let Some(output) = &first_output.data else {
313
314
315
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
316
317
318
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
319
320
321
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
322
323
        };

324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
        // Extract prefill worker ID from disaggregated_params
        let prefill_worker_id = disaggregated_params
            .get("worker_id")
            .and_then(|worker_id_json| {
                worker_id_json
                    .get("prefill_worker_id")
                    .and_then(|v| v.as_u64())
            });
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
            prefill_worker_id,
        ))
    }

    /// Run prefill on a detached tokio task.
    ///
    /// The current inner router (if any) is cloned up front; the task's outcome is only
    /// logged, never returned to the caller.
    fn spawn_prefill_task(&self, prefill_request: SingleIn<PreprocessedRequest>) {
        let inner = self.prefill_router.get().cloned();

        tokio::spawn(async move {
            let outcome = Self::execute_prefill(inner, prefill_request).await;
            if let Err(e) = outcome {
                tracing::warn!("Prefill background task error: {e:?}");
            } else {
                tracing::debug!("Prefill background task completed");
            }
        });
    }

    /// Call the prefill router and extract structured prefill result and worker ID
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
        Self::execute_prefill(self.prefill_router.get().cloned(), request).await
363
364
365
    }
}

366
367
368
369
370
371
372
373
374
375
376
377
378
/// GAIE helper functions for preparing prefill requests
impl PrefillRouter {
    /// Prepare prefill request for GAIE flows
    /// - Stage 1: Sets query_instance_id:prefill annotation
    /// - Stage 2: Sets backend_instance_id to target prefill worker
    fn prepare_prefill_for_gaie(prefill_req: &mut PreprocessedRequest, is_gaie_stage1: bool) {
        if is_gaie_stage1 {
            // GAIE Stage 1: Set query_instance_id to "prefill" for prefill worker selection
            prefill_req
                .annotations
                .retain(|a| !a.starts_with("query_instance_id"));
            prefill_req
                .annotations
379
                .push(format!("query_instance_id:{}", RequestPhase::Prefill));
380
381
382
383
384
        } else if let Some(prefill_worker_id) = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id)
        {
385
386
            // GAIE Stage 2: Route to pre-selected prefill worker from the stage 1
            tracing::debug!(
387
                prefill_worker_id = prefill_worker_id,
388
389
                "GAIE Stage 2: Routing prefill to pre-selected worker"
            );
390
            prefill_req.routing_mut().backend_instance_id = Some(prefill_worker_id);
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
        }
    }

    /// Prepare decode request for GAIE Stage 1
    /// Extracts prefill_worker_id from prefill result and sets decode annotations
    fn prepare_decode_for_gaie_stage1(
        decode_req: &mut PreprocessedRequest,
        prefill_result: &PrefillResult,
    ) {
        let prefill_worker_id = prefill_result
            .disaggregated_params
            .get("worker_id")
            .and_then(|v| serde_json::from_value::<WorkerIdInfo>(v.clone()).ok())
            .and_then(|info| info.prefill_worker_id);

        if let Some(worker_id) = prefill_worker_id {
            decode_req
                .annotations
                .retain(|a| !a.starts_with("query_instance_id"));
            decode_req
                .annotations
412
                .push(format!("query_instance_id:{}", RequestPhase::Decode));
413
414
415
416
417
418
419
            decode_req
                .annotations
                .push(format!("prefill_worker_id:{worker_id}"));
        }
    }
}

420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
impl Drop for PrefillRouter {
    /// Cancel the router's token so a background activation task that is still waiting
    /// on it can exit.
    fn drop(&mut self) {
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        let Self { cancel_token, .. } = self;
        cancel_token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
        let (req, context) = request.into_parts();
        let request_id = context.id().to_string();
444
        let engine_ctx = context.context();
445

446
447
448
449
450
451
        // GAIE Stage 1: the presence of the empty query_instance_id signals query-only mode
        // State machine: "" -> "prefill" -> "decode" (disagg) OR "" -> aggregated worker (agg fallback)
        let is_gaie_stage1 = req
            .get_annotation_value("query_instance_id")
            .is_some_and(|s| s.is_empty());

Yan Ru Pei's avatar
Yan Ru Pei committed
452
453
454
455
456
457
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

        // Prepare prefill request with max_tokens = 1
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
458

459
460
        // Prepare prefill request for GAIE flows (Stage 1 or Stage 2)
        Self::prepare_prefill_for_gaie(&mut prefill_req, is_gaie_stage1);
461

462
        // Try build_bootstrap_info optimization (skip for GAIE Stage 1 which needs query-only flow)
463
464
465
466
467
        // For GAIE Stage 2, use prefill_worker_id if provided
        let preselected_worker = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id);
468
469
470
471
472
473
474
475
        let prefill_result = if !is_gaie_stage1 {
            if let Some((worker_id, dp_rank, bootstrap_info)) = self
                .build_bootstrap_info(&prefill_req, preselected_worker)
                .await
            {
                let bootstrap_room = bootstrap_info.bootstrap_room;

                // Prepare request with bootstrap_room and force routing to specific worker
476
477
478
                let routing = prefill_req.routing_mut();
                routing.backend_instance_id = Some(worker_id);
                routing.dp_rank = Some(dp_rank);
479
480
481
482
483
484
485
486
487
                let extra_args = prefill_req
                    .extra_args
                    .get_or_insert_with(|| serde_json::json!({}));
                if let Some(obj) = extra_args.as_object_mut() {
                    obj.insert(
                        "bootstrap_room".to_string(),
                        serde_json::json!(bootstrap_room),
                    );
                }
488

489
490
491
492
493
494
                // Set phase to Prefill and record prefill start time if tracking is enabled
                if let Some(ref tracker) = req.tracker {
                    tracker.set_phase(RequestPhase::Prefill);
                    tracker.record_prefill_start();
                }

495
496
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
497

498
                self.spawn_prefill_task(prefill_context);
499

500
501
502
503
504
                Ok((None, Some(worker_id), Some(bootstrap_info)))
            } else {
                // Fallback to original: Wait for prefill to complete
                tracing::debug!("Using original prefill path");

505
506
507
508
509
510
                // Set phase to Prefill and record prefill start time if tracking is enabled
                if let Some(ref tracker) = req.tracker {
                    tracker.set_phase(RequestPhase::Prefill);
                    tracker.record_prefill_start();
                }

511
512
513
514
515
516
517
518
519
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());

                self.call_prefill(prefill_context)
                    .await
                    .map(|(result, worker_id)| (Some(result), worker_id, None))
            }
        } else {
            // GAIE Stage 1: Use original path (no bootstrap optimization)
520
521
522
523
524
525
            // But first check if prefill router is activated - if not, skip to avoid setting phase
            if self.prefill_router.get().is_none() {
                tracing::debug!("GAIE Stage 1: Prefill router not activated, skipping to decode");
                Err(PrefillError::NotActivated)
            } else {
                tracing::debug!("Using original prefill path (GAIE Stage 1)");
526

527
528
529
530
531
532
533
534
535
536
537
538
539
                // Set phase to Prefill and record prefill start time if tracking is enabled
                if let Some(ref tracker) = req.tracker {
                    tracker.set_phase(RequestPhase::Prefill);
                    tracker.record_prefill_start();
                }

                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());

                self.call_prefill(prefill_context)
                    .await
                    .map(|(result, worker_id)| (Some(result), worker_id, None))
            }
540
        };
541
542
543
544
545
546
547
548
549
550
551
552

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
553
554
            Ok((maybe_prefill_result, _prefill_worker_id, bootstrap_info)) => {
                tracing::debug!("Prefill completed, proceeding to decode");
555

556
557
558
559
560
                // Set phase to Decode for the decode request
                if let Some(ref tracker) = req.tracker {
                    tracker.set_phase(RequestPhase::Decode);
                }

561
                let mut decode_req = req;
562

563
564
565
566
567
568
569
                // Update request with prefill result
                if is_gaie_stage1 {
                    if let Some(ref prefill_result) = maybe_prefill_result {
                        Self::prepare_decode_for_gaie_stage1(&mut decode_req, prefill_result);
                    }
                } else if let Some(prefill_result) = maybe_prefill_result {
                    // Normal or GAIE Stage 2: Set prefill_result for decode
570
571
572
                    decode_req.prefill_result = Some(prefill_result);
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
573
574
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
575

576
577
578
579
580
                // Inject bootstrap_info for decode worker
                if let Some(info) = bootstrap_info {
                    decode_req.bootstrap_info = Some(info);
                }

581
582
583
584
585
586
587
                // Set router_config_override for decode: overlap_score_weight = 0
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
                    ..existing_override.unwrap_or_default()
                });

588
                // GAIE Stage 2: Route to pre-selected decode worker if specified
589
590
591
592
                if let Some(decode_worker_id) =
                    decode_req.routing.as_ref().and_then(|r| r.decode_worker_id)
                {
                    decode_req.routing_mut().backend_instance_id = Some(decode_worker_id);
593
594
595
596
597
598
                    tracing::debug!(
                        decode_worker_id = decode_worker_id,
                        "GAIE Stage 2: Routing decode to pre-selected worker"
                    );
                }

599
                // Map the modified request through with preserved context
600
                let decode_request = context.map(|_| decode_req);
601
602
                next.generate(decode_request).await
            }
603
604
605
606
607
608
609
610
611
612
            Err(PrefillError::NotActivated) => {
                if self.enforce_disagg {
                    tracing::error!(
                        "Prefill router not activated, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(PrefillError::NotActivated));
                }
                tracing::debug!("Prefill router not activated, falling back to decode-only");
                next.generate(context.map(|_| req)).await
            }
613
            Err(e) => {
614
615
616
617
618
619
620
                if self.enforce_disagg {
                    tracing::error!(
                        error = %e,
                        "Remote prefill failed, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(e));
                }
621
622
623
624
                tracing::warn!(
                    error = %e,
                    "Remote prefill failed, falling back to decode-only. This may impact performance in disaggregated deployments. Verify prefill workers are healthy and accessible."
                );
625
626
627
628
629
                next.generate(context.map(|_| req)).await
            }
        }
    }
}