prefill_router.rs 22 KB
Newer Older
1
2
3
4
5
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::sync::{Arc, OnceLock};

6
use anyhow::Result;
7
use futures::StreamExt;
8
use rand::Rng;
9
10
11
12
13
14
use tokio::sync::oneshot;
use tokio_util::sync::CancellationToken;

use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
15
16
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
17
18
19
20
21
22
    },
    protocols::{annotated::Annotated, maybe_error::MaybeError},
};

use crate::{
    discovery::ModelManager,
23
    kv_router::{KvPushRouter, KvRouterConfig, QueryInstanceType, RouterConfigOverride},
24
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
25
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
26
    protocols::openai::nvext::WorkerIdInfo,
27
28
};

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/// Errors that can occur during prefill routing
///
/// Returned by `PrefillRouter::execute_prefill` / `call_prefill`. The operator's
/// `generate` matches on these to choose between failing the request (when
/// disaggregation is enforced) and falling back to decode-only.
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// Prefill router has not been activated yet
    ///
    /// Produced when no inner router has been installed at the time of the call.
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// Error during prefill execution
    ///
    /// Covers a failed call into the inner router, a response stream that ends
    /// without any output, and a first output that carries an error.
    /// TODO: Separate prefill worker error from prefill router error
    #[error("Prefill execution failed: {0}")]
    PrefillError(String),

    /// Disaggregated params not found in prefill response
    ///
    /// Produced when the first output has no `data`, or its `data` has no
    /// `disaggregated_params`.
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

46
/// The inner router used by PrefillRouter
47
#[derive(Clone)]
48
49
50
51
52
53
54
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

55
56
57
58
59
60
61
62
63
64
65
66
67
impl InnerPrefillRouter {
    /// Forward a prefill request to whichever router this value wraps.
    ///
    /// Returns the wrapped router's response stream, or its error unchanged.
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        match self {
            Self::KvRouter(kv) => kv.generate(request).await,
            Self::SimpleRouter(simple) => simple.generate(request).await,
        }
    }
}

68
69
70
/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
71
72
73
74
75
///
/// Supports regular Dynamo and GAIE integrated mode via query_instance_id state machine:
/// - GAIE Stage 1: query_instance_id transitions "" -> "prefill" -> "decode", returns only worker IDs
/// - GAIE Stage 2: target_prefill_worker_id/target_decode_worker_id are set, full execution with specified workers
/// - Non-GAIE: like GAIE Stage 2 but the worker ids have to be determined.
76
77
78
79
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
    cancel_token: CancellationToken,
    router_mode: RouterMode,
80
    enforce_disagg: bool,
81
82
83
84
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
85
    pub fn disabled(router_mode: RouterMode, enforce_disagg: bool) -> Arc<Self> {
86
87
88
89
        Arc::new(Self {
            prefill_router: OnceLock::new(),
            cancel_token: CancellationToken::new(),
            router_mode,
90
            enforce_disagg,
91
92
93
94
95
96
97
98
99
        })
    }

    /// Create a prefill router that activates once an endpoint arrives on `activation_rx`.
    ///
    /// A background task waits for either the endpoint or cancellation of the router's
    /// token. On receiving an endpoint it calls `activate` with the remaining arguments;
    /// activation failures are logged, not returned.
    ///
    /// The task holds only a `Weak` reference to the router. With a strong `Arc` the
    /// router could never be dropped while the task was still waiting, so the
    /// cancel-on-drop logic would never fire and the router would leak for as long as
    /// the sender side stayed alive without sending.
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
        enforce_disagg: bool,
    ) -> Arc<Self> {
        let prefill_router = OnceLock::new();
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router,
            cancel_token: cancel_token.clone(),
            router_mode,
            enforce_disagg,
        });

        // Spawn background task to wait for activation. Only a weak handle is moved in,
        // so dropping the last external `Arc` runs `Drop` and cancels this task.
        let weak_router = Arc::downgrade(&router);
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    // The router may have been dropped in the same instant the endpoint arrived.
                    let Some(router) = weak_router.upgrade() else {
                        tracing::debug!("Prefill router dropped before activation");
                        return;
                    };

                    if let Err(e) = router.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

        let inner_router = if self.router_mode.is_kv_routing() {
154
            // Create KV chooser using the endpoint
155
            let kv_chooser = model_manager
156
                .kv_chooser_for(&endpoint, kv_cache_block_size, kv_router_config)
157
158
                .await?;

159
160
161
162
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

            // Build the PushRouter for prefill with KV mode using the shared client
163
164
165
166
167
168
169
170
171
172
173
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
174
175
176
            // Create client for simple router
            let client = endpoint.client().await?;

177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
            // Create simple push router with the frontend's router mode
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

200
201
202
203
204
    /// Draw a random 64-bit bootstrap room ID for disaggregated serving.
    ///
    /// The value is random rather than tracked, so uniqueness is probabilistic.
    fn generate_bootstrap_room() -> u64 {
        let mut rng = rand::rng();
        rng.random::<u64>()
    }

205
206
207
    /// Build bootstrap_info for disaggregated serving
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
    /// Otherwise, query for the best worker.
208
    async fn build_bootstrap_info(
209
        &self,
210
        req: &PreprocessedRequest,
211
        preselected_worker: Option<u64>,
212
213
214
215
216
217
218
    ) -> Option<(u64, u32, BootstrapInfo)> {
        let prefill_router = self.prefill_router.get()?;

        // Only works with KvRouter
        let kv_router = match prefill_router {
            InnerPrefillRouter::KvRouter(r) => r,
            InnerPrefillRouter::SimpleRouter(_) => return None,
219
220
        };

221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
        // Use pre-selected worker (GAIE Stage 2) or query for best worker
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
            let dp_rank = req.dp_rank.unwrap_or(0);
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
        } else {
            match kv_router
                .chooser
                .find_best_match(None, &req.token_ids, None, false)
                .await
            {
                Ok((worker, _overlap)) => (worker.worker_id, worker.dp_rank),
                Err(_) => return None,
            }
239
240
        };

241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
        // Look up bootstrap endpoint from discovery
        let endpoint = kv_router
            .chooser
            .get_disaggregated_endpoint(worker_id)
            .await?;
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

        let bootstrap_room = Self::generate_bootstrap_room();

        tracing::info!(
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

    /// Execute prefill with the given router and extract structured result
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
            .generate(request)
            .await
            .map_err(|e| PrefillError::PrefillError(e.to_string()))?;

282
        let Some(first_output) = prefill_response.next().await else {
283
284
285
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
            ));
286
287
        };

288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
304

305
        if let Some(err) = first_output.err() {
306
307
308
            return Err(PrefillError::PrefillError(format!(
                "Prefill router returned error in output: {err:?}"
            )));
309
310
311
        }

        let Some(output) = &first_output.data else {
312
313
314
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
315
316
317
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
318
319
320
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
321
322
        };

323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
        // Extract prefill worker ID from disaggregated_params
        let prefill_worker_id = disaggregated_params
            .get("worker_id")
            .and_then(|worker_id_json| {
                worker_id_json
                    .get("prefill_worker_id")
                    .and_then(|v| v.as_u64())
            });
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
            prefill_worker_id,
        ))
    }

    /// Fire-and-forget prefill: run it on a detached task and only log the outcome.
    ///
    /// The inner router is captured (cloned) at call time; the task's result is discarded.
    fn spawn_prefill_task(&self, prefill_request: SingleIn<PreprocessedRequest>) {
        let router = self.prefill_router.get().cloned();

        tokio::spawn(async move {
            if let Err(e) = Self::execute_prefill(router, prefill_request).await {
                tracing::warn!("Prefill background task error: {e:?}");
            } else {
                tracing::debug!("Prefill background task completed");
            }
        });
    }

    /// Call the prefill router and extract structured prefill result and worker ID
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
        Self::execute_prefill(self.prefill_router.get().cloned(), request).await
362
363
364
    }
}

365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
/// GAIE helper functions for preparing prefill requests
impl PrefillRouter {
    /// Adjust a prefill request for the GAIE flows.
    ///
    /// - Stage 1: replaces any `query_instance_id*` annotation with
    ///   `query_instance_id:<Prefill>` so that a prefill worker gets selected.
    /// - Stage 2: when `target_prefill_worker_id` is set, pins `backend_instance_id` to it.
    /// - Otherwise the request is left untouched.
    fn prepare_prefill_for_gaie(prefill_req: &mut PreprocessedRequest, is_gaie_stage1: bool) {
        if is_gaie_stage1 {
            let annotations = &mut prefill_req.annotations;
            annotations.retain(|a| !a.starts_with("query_instance_id"));
            annotations.push(format!("query_instance_id:{}", QueryInstanceType::Prefill));
            return;
        }

        let Some(prefill_worker_id) = prefill_req.target_prefill_worker_id else {
            return;
        };

        // GAIE Stage 2: the worker was already chosen during stage 1.
        tracing::debug!(
            target_prefill_worker_id = prefill_worker_id,
            "GAIE Stage 2: Routing prefill to pre-selected worker"
        );
        prefill_req.backend_instance_id = Some(prefill_worker_id);
    }

    /// Adjust the decode request for GAIE Stage 1.
    ///
    /// Reads `worker_id` from the prefill result's disaggregated params as a
    /// `WorkerIdInfo`. When it yields a prefill worker ID, any `query_instance_id*`
    /// annotation is replaced with `query_instance_id:<Decode>` and a
    /// `prefill_worker_id:<id>` annotation is appended. If the ID cannot be obtained,
    /// the request is left unchanged.
    fn prepare_decode_for_gaie_stage1(
        decode_req: &mut PreprocessedRequest,
        prefill_result: &PrefillResult,
    ) {
        let Some(raw_ids) = prefill_result.disaggregated_params.get("worker_id") else {
            return;
        };
        let Ok(info) = serde_json::from_value::<WorkerIdInfo>(raw_ids.clone()) else {
            return;
        };
        let Some(worker_id) = info.prefill_worker_id else {
            return;
        };

        let annotations = &mut decode_req.annotations;
        annotations.retain(|a| !a.starts_with("query_instance_id"));
        annotations.push(format!("query_instance_id:{}", QueryInstanceType::Decode));
        annotations.push(format!("prefill_worker_id:{worker_id}"));
    }
}

415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
/// Stops the background activation task when the router is destroyed.
///
/// `new` spawns a task that selects on `cancel_token.cancelled()`; cancelling the token
/// here lets that task exit instead of continuing to wait on the activation channel.
///
/// NOTE(review): `drop` only runs once every `Arc<PrefillRouter>` has been released,
/// including any clone owned by the activation task itself — confirm the task cannot
/// keep the router alive on its own.
impl Drop for PrefillRouter {
    fn drop(&mut self) {
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        // Wakes the `cancelled()` branch of the activation task's `select!`.
        self.cancel_token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
        let (req, context) = request.into_parts();
        let request_id = context.id().to_string();
439
        let engine_ctx = context.context();
440

441
442
443
444
445
446
        // GAIE Stage 1: the presence of the empty query_instance_id signals query-only mode
        // State machine: "" -> "prefill" -> "decode" (disagg) OR "" -> aggregated worker (agg fallback)
        let is_gaie_stage1 = req
            .get_annotation_value("query_instance_id")
            .is_some_and(|s| s.is_empty());

Yan Ru Pei's avatar
Yan Ru Pei committed
447
448
449
450
451
452
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

        // Prepare prefill request with max_tokens = 1
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
453

454
455
        // Prepare prefill request for GAIE flows (Stage 1 or Stage 2)
        Self::prepare_prefill_for_gaie(&mut prefill_req, is_gaie_stage1);
456

457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
        // Try build_bootstrap_info optimization (skip for GAIE Stage 1 which needs query-only flow)
        // For GAIE Stage 2, use target_prefill_worker_id if provided
        let preselected_worker = prefill_req.target_prefill_worker_id;
        let prefill_result = if !is_gaie_stage1 {
            if let Some((worker_id, dp_rank, bootstrap_info)) = self
                .build_bootstrap_info(&prefill_req, preselected_worker)
                .await
            {
                let bootstrap_room = bootstrap_info.bootstrap_room;

                // Prepare request with bootstrap_room and force routing to specific worker
                prefill_req.backend_instance_id = Some(worker_id);
                prefill_req.dp_rank = Some(dp_rank);
                let extra_args = prefill_req
                    .extra_args
                    .get_or_insert_with(|| serde_json::json!({}));
                if let Some(obj) = extra_args.as_object_mut() {
                    obj.insert(
                        "bootstrap_room".to_string(),
                        serde_json::json!(bootstrap_room),
                    );
                }
479

480
481
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
482

483
                self.spawn_prefill_task(prefill_context);
484

485
486
487
488
489
490
491
492
493
494
495
496
497
498
                Ok((None, Some(worker_id), Some(bootstrap_info)))
            } else {
                // Fallback to original: Wait for prefill to complete
                tracing::debug!("Using original prefill path");

                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());

                self.call_prefill(prefill_context)
                    .await
                    .map(|(result, worker_id)| (Some(result), worker_id, None))
            }
        } else {
            // GAIE Stage 1: Use original path (no bootstrap optimization)
499
500
            let prefill_context = Context::with_id(prefill_req, request_id.clone());
            engine_ctx.link_child(prefill_context.context());
501

502
503
504
505
            self.call_prefill(prefill_context)
                .await
                .map(|(result, worker_id)| (Some(result), worker_id, None))
        };
506
507
508
509
510
511
512
513
514
515
516
517

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
518
519
            Ok((maybe_prefill_result, _prefill_worker_id, bootstrap_info)) => {
                tracing::debug!("Prefill completed, proceeding to decode");
520
521

                let mut decode_req = req;
522

523
524
525
526
527
528
529
                // Update request with prefill result
                if is_gaie_stage1 {
                    if let Some(ref prefill_result) = maybe_prefill_result {
                        Self::prepare_decode_for_gaie_stage1(&mut decode_req, prefill_result);
                    }
                } else if let Some(prefill_result) = maybe_prefill_result {
                    // Normal or GAIE Stage 2: Set prefill_result for decode
530
531
532
                    decode_req.prefill_result = Some(prefill_result);
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
533
534
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
535

536
537
538
539
540
                // Inject bootstrap_info for decode worker
                if let Some(info) = bootstrap_info {
                    decode_req.bootstrap_info = Some(info);
                }

541
542
543
544
545
546
547
                // Set router_config_override for decode: overlap_score_weight = 0
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
                    ..existing_override.unwrap_or_default()
                });

548
549
550
551
552
553
554
555
556
                // GAIE Stage 2: Route to pre-selected decode worker if specified
                if let Some(decode_worker_id) = decode_req.target_decode_worker_id {
                    decode_req.backend_instance_id = Some(decode_worker_id);
                    tracing::debug!(
                        decode_worker_id = decode_worker_id,
                        "GAIE Stage 2: Routing decode to pre-selected worker"
                    );
                }

557
                // Map the modified request through with preserved context
558
                let decode_request = context.map(|_| decode_req);
559
560
                next.generate(decode_request).await
            }
561
562
563
564
565
566
567
568
569
570
            Err(PrefillError::NotActivated) => {
                if self.enforce_disagg {
                    tracing::error!(
                        "Prefill router not activated, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(PrefillError::NotActivated));
                }
                tracing::debug!("Prefill router not activated, falling back to decode-only");
                next.generate(context.map(|_| req)).await
            }
571
            Err(e) => {
572
573
574
575
576
577
578
                if self.enforce_disagg {
                    tracing::error!(
                        error = %e,
                        "Remote prefill failed, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(e));
                }
579
580
581
582
                tracing::warn!(
                    error = %e,
                    "Remote prefill failed, falling back to decode-only. This may impact performance in disaggregated deployments. Verify prefill workers are healthy and accessible."
                );
583
584
585
586
587
                next.generate(context.map(|_| req)).await
            }
        }
    }
}