prefill_router.rs 23.5 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
4
5
// SPDX-License-Identifier: Apache-2.0

use std::sync::{Arc, OnceLock};

6
use anyhow::Result;
7
use futures::StreamExt;
8
use rand::Rng;
9
use tokio::sync::{OwnedSemaphorePermit, oneshot};
10
11
12
13
14
use tokio_util::sync::CancellationToken;

use dynamo_runtime::{
    component::Endpoint,
    pipeline::{
15
16
        AsyncEngine, AsyncEngineContextProvider, Context, ManyOut, Operator, PushRouter,
        RouterMode, ServerStreamingEngine, SingleIn, async_trait,
17
    },
18
    protocols::{EndpointId, annotated::Annotated, maybe_error::MaybeError},
19
20
21
22
};

use crate::{
    discovery::ModelManager,
23
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride},
24
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
25
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
26
    protocols::common::timing::{RequestPhase, RequestTracker},
27
28
};

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/// Errors that can occur during prefill routing.
///
/// The `#[error(...)]` attribute on each variant supplies its `Display` text
/// (generated by `thiserror`).
#[derive(Debug, thiserror::Error)]
pub enum PrefillError {
    /// Prefill router has not been activated yet
    /// (returned when no inner router is available to execute the prefill).
    #[error("Prefill router not yet activated")]
    NotActivated,

    /// Error during prefill execution; the payload is a human-readable description.
    /// TODO: Separate prefill worker error from prefill router error
    #[error("Prefill execution failed: {0}")]
    PrefillError(String),

    /// Disaggregated params not found in prefill response; the payload says
    /// which part of the response was missing.
    #[error("No disaggregated params in prefill response: {0}")]
    NoDisaggregatedParams(String),
}

46
/// The inner router used by PrefillRouter
47
#[derive(Clone)]
48
49
50
51
52
53
54
enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter.
    /// Held behind an `Arc`, so cloning this enum only clones the handle.
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct).
    /// Wraps a plain `PushRouter` over preprocessed requests and annotated engine outputs.
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
}

55
impl InnerPrefillRouter {
56
57
58
59
    /// Generate with optional direct routing to specific worker.
    /// For KvRouter, target_worker is ignored since prefill_worker_id is already set on the request.
    /// For SimpleRouter, target_worker triggers direct routing via router.direct().
    async fn generate_to_worker(
60
61
        &self,
        request: SingleIn<PreprocessedRequest>,
62
        target_worker: Option<u64>,
63
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
64
65
66
67
68
69
70
71
72
73
74
75
        match (self, target_worker) {
            // KvRouter: prefill_worker_id already set on request, KvPushRouter::select_worker uses it
            (InnerPrefillRouter::KvRouter(router), _) => router.generate(request).await,
            (InnerPrefillRouter::SimpleRouter(router), Some(worker_id)) => {
                router.direct(request, worker_id).await
            }
            (InnerPrefillRouter::SimpleRouter(router), None) => router.generate(request).await,
        }
    }

    /// Select next worker (for non-KV modes only)
    fn select_next_worker(&self) -> Option<u64> {
76
        match self {
77
78
            InnerPrefillRouter::SimpleRouter(router) => router.select_next_worker(),
            InnerPrefillRouter::KvRouter(_) => None,
79
80
        }
    }
81
82
83
84
85
86
87
88

    /// Peek next worker without incrementing state (for non-KV modes only)
    fn peek_next_worker(&self) -> Option<u64> {
        match self {
            InnerPrefillRouter::SimpleRouter(router) => router.peek_next_worker(),
            InnerPrefillRouter::KvRouter(_) => None,
        }
    }
89
90
}

91
92
93
/// PrefillRouter is a forward-only operator that sits between Migration and the decode router.
/// It optionally calls a prefill worker before routing to decode, extracting disaggregated_params
/// from the prefill response and injecting them into the decode request.
94
///
95
96
97
98
/// Modes:
/// - Query-only: `query_instance_id` annotation present → returns worker IDs without execution
/// - Pre-routed: `prefill_worker_id`/`decode_worker_id` set → routes to specified workers
/// - Normal: Worker IDs determined by router based on KV cache state
99
100
pub struct PrefillRouter {
    prefill_router: OnceLock<InnerPrefillRouter>,
101
102
    model_manager: Arc<ModelManager>,
    endpoint_id: OnceLock<EndpointId>,
103
104
    cancel_token: CancellationToken,
    router_mode: RouterMode,
105
    enforce_disagg: bool,
106
107
108
109
}

impl PrefillRouter {
    /// Create a disabled prefill router that will never activate (passthrough only)
110
111
112
113
114
    pub fn disabled(
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        enforce_disagg: bool,
    ) -> Arc<Self> {
115
116
        Arc::new(Self {
            prefill_router: OnceLock::new(),
117
118
            model_manager,
            endpoint_id: OnceLock::new(),
119
120
            cancel_token: CancellationToken::new(),
            router_mode,
121
            enforce_disagg,
122
123
124
125
126
127
128
129
130
        })
    }

    /// Create a prefill router that activates once an endpoint arrives on `activation_rx`.
    ///
    /// A background task waits on `activation_rx`. When an endpoint is received
    /// it calls `activate` with `model_manager`, `kv_cache_block_size` and
    /// `kv_router_config`. Until then (or if the sender is dropped) the router
    /// stays inactive.
    ///
    /// The background task holds only a `Weak` reference to the router. A strong
    /// reference would keep the router alive for as long as the task is pending,
    /// so `Drop` (which cancels the task) could never run and the cancellation
    /// branch would be unreachable.
    pub fn new(
        activation_rx: oneshot::Receiver<Endpoint>,
        model_manager: Arc<ModelManager>,
        router_mode: RouterMode,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
        enforce_disagg: bool,
    ) -> Arc<Self> {
        let cancel_token = CancellationToken::new();

        let router = Arc::new(Self {
            prefill_router: OnceLock::new(),
            model_manager: model_manager.clone(),
            endpoint_id: OnceLock::new(),
            cancel_token: cancel_token.clone(),
            router_mode,
            enforce_disagg,
        });

        // Spawn background task to wait for activation. Only a weak handle is
        // captured so that dropping the last external `Arc` triggers `Drop`,
        // which cancels `cancel_token` and ends this task.
        let weak_router = Arc::downgrade(&router);
        tokio::spawn(async move {
            tokio::select! {
                result = activation_rx => {
                    let Ok(endpoint) = result else {
                        tracing::debug!("Prefill router activation channel closed without receiving endpoint");
                        return;
                    };

                    // The router may have been dropped between the endpoint
                    // arriving and this task being polled.
                    let Some(router) = weak_router.upgrade() else {
                        tracing::debug!("Prefill router dropped before activation, skipping");
                        return;
                    };

                    if let Err(e) = router.activate(
                        endpoint,
                        model_manager,
                        kv_cache_block_size,
                        kv_router_config,
                    ).await {
                        tracing::error!(error = %e, "Failed to activate prefill router");
                    }
                }
                _ = cancel_token.cancelled() => {
                    tracing::debug!("Prefill router activation cancelled");
                }
            }
        });

        router
    }

    /// Activate the prefill router with the provided endpoint
    async fn activate(
        &self,
        endpoint: Endpoint,
        model_manager: Arc<ModelManager>,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
    ) -> Result<()> {
        tracing::info!(
            router_mode = ?self.router_mode,
            "Activating prefill router"
        );

186
187
188
189
190
191
192
193
194
        // Store endpoint_id for later use in build_bootstrap_info
        let _ = self.endpoint_id.set(endpoint.id());

        // Start runtime config watcher for this endpoint (needed for get_disaggregated_endpoint)
        // This must be done before creating the router so bootstrap info is available
        model_manager
            .get_or_create_runtime_config_watcher(&endpoint)
            .await?;

195
        let inner_router = if self.router_mode.is_kv_routing() {
196
            // Create KV chooser using the endpoint
197
            let kv_chooser = model_manager
198
                .kv_chooser_for(&endpoint, kv_cache_block_size, kv_router_config)
199
200
                .await?;

201
202
203
204
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();

            // Build the PushRouter for prefill with KV mode using the shared client
205
206
207
208
209
210
211
212
213
214
215
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                RouterMode::KV,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            // Wrap it in KvPushRouter
            InnerPrefillRouter::KvRouter(Arc::new(KvPushRouter::new(push_router, kv_chooser)))
        } else {
216
217
218
            // Create client for simple router
            let client = endpoint.client().await?;

219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
            // Create simple push router with the frontend's router mode
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

            InnerPrefillRouter::SimpleRouter(Arc::new(push_router))
        };

        // Set the router (ignore error if already set)
        let _ = self.prefill_router.set(inner_router);

        tracing::info!(
            router_mode = ?self.router_mode,
            "Prefill router activated successfully"
        );

        Ok(())
    }

242
243
    /// Build bootstrap_info for disaggregated serving
    /// If preselected_worker is provided (GAIE Stage 2), use it directly.
244
    /// Otherwise, query for the best worker (KV mode) or select next worker (non-KV modes).
245
    async fn build_bootstrap_info(
246
        &self,
247
        req: &PreprocessedRequest,
248
        preselected_worker: Option<u64>,
249
    ) -> Option<(u64, u32, BootstrapInfo)> {
250
        let endpoint_id = self.endpoint_id.get()?;
251
252
        let prefill_router = self.prefill_router.get()?;

253
        // Worker selection
254
        let (worker_id, dp_rank) = if let Some(id) = preselected_worker {
255
            let dp_rank = req.routing.as_ref().and_then(|r| r.dp_rank).unwrap_or(0);
256
257
258
259
260
261
            tracing::debug!(
                worker_id = id,
                dp_rank = dp_rank,
                "Using pre-selected prefill worker for bootstrap"
            );
            (id, dp_rank)
262
263
264
265
266
267
        } else if self.router_mode.is_kv_routing() {
            // KV mode: use find_best_match
            let kv_router = match prefill_router {
                InnerPrefillRouter::KvRouter(r) => r,
                _ => return None,
            };
268
269
270
271
272
273
274
275
            match kv_router
                .chooser
                .find_best_match(None, &req.token_ids, None, false)
                .await
            {
                Ok((worker, _overlap)) => (worker.worker_id, worker.dp_rank),
                Err(_) => return None,
            }
276
277
        } else {
            // Non-KV mode: use PushRouter's stateful selection
278
279
280
            // We use peek_next_worker instead of select_next_worker to avoid double-incrementing the counter
            // if we fall back to the original path.
            let worker_id = prefill_router.peek_next_worker()?;
281
            (worker_id, 0)
282
283
        };

284
285
286
287
        // Get bootstrap info from ModelManager (works for ANY mode)
        let endpoint = self
            .model_manager
            .get_disaggregated_endpoint(endpoint_id, worker_id)?;
288
289
290
        let host = endpoint.bootstrap_host?;
        let port = endpoint.bootstrap_port?;

291
        let bootstrap_room: u64 = rand::rng().random();
292
293
294
295
296
297
298

        tracing::info!(
            worker_id = worker_id,
            dp_rank = dp_rank,
            bootstrap_host = %host,
            bootstrap_port = port,
            bootstrap_room = bootstrap_room,
299
            router_mode = ?self.router_mode,
300
301
302
303
304
305
306
307
308
309
310
311
312
313
            "Built bootstrap_info upfront before prefill"
        );

        Some((
            worker_id,
            dp_rank,
            BootstrapInfo {
                bootstrap_host: host,
                bootstrap_port: port,
                bootstrap_room,
            },
        ))
    }

314
315
316
317
318
319
320
    /// Execute prefill with the given router and extract structured result.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// If `phase_permit` is provided, it is dropped after the first output is received,
    /// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap
    /// optimization path to ensure `record_worker` completes before the phase changes.
321
322
323
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
324
        target_worker: Option<u64>,
325
        phase_permit: Option<OwnedSemaphorePermit>,
326
327
328
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
329
            .generate_to_worker(request, target_worker)
330
331
332
            .await
            .map_err(|e| PrefillError::PrefillError(e.to_string()))?;

333
334
335
336
        // Drop phase permit now - routing is complete, record_worker was called in select_worker.
        // This unblocks set_phase(Decode) in the main task without waiting for prefill output.
        drop(phase_permit);

337
        let Some(first_output) = prefill_response.next().await else {
338
339
340
            return Err(PrefillError::PrefillError(
                "Prefill router returned no output (stream ended)".to_string(),
            ));
341
342
        };

343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
        let mut prompt_tokens_details = first_output
            .data
            .as_ref()
            .and_then(|o| o.completion_usage.as_ref())
            .and_then(|u| u.prompt_tokens_details.clone());

        while let Some(next) = prefill_response.next().await {
            if let Some(o) = next.data.as_ref()
                && prompt_tokens_details.is_none()
            {
                prompt_tokens_details = o
                    .completion_usage
                    .as_ref()
                    .and_then(|u| u.prompt_tokens_details.clone());
            }
        }
359

360
        if let Some(err) = first_output.err() {
361
362
363
            return Err(PrefillError::PrefillError(format!(
                "Prefill router returned error in output: {err:?}"
            )));
364
365
366
        }

        let Some(output) = &first_output.data else {
367
368
369
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output has no data field".to_string(),
            ));
370
371
372
        };

        let Some(disaggregated_params) = output.disaggregated_params.clone() else {
373
374
375
            return Err(PrefillError::NoDisaggregatedParams(
                "Prefill router output missing disaggregated_params".to_string(),
            ));
376
377
        };

378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
        // Extract prefill worker ID from disaggregated_params
        let prefill_worker_id = disaggregated_params
            .get("worker_id")
            .and_then(|worker_id_json| {
                worker_id_json
                    .get("prefill_worker_id")
                    .and_then(|v| v.as_u64())
            });
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
            prefill_worker_id,
        ))
    }

395
396
397
398
399
400
    /// Spawn prefill as a background task.
    ///
    /// Uses direct routing to target_worker when specified (for non-KV modes with bootstrap optimization).
    ///
    /// The `phase_permit` is passed to the spawned task and dropped after the first output,
    /// allowing the main task's `set_phase(Decode)` to proceed.
401
402
403
404
    fn spawn_prefill_task(
        &self,
        prefill_request: SingleIn<PreprocessedRequest>,
        target_worker: Option<u64>,
405
        phase_permit: OwnedSemaphorePermit,
406
    ) {
407
408
409
        let router = self.prefill_router.get().cloned();

        tokio::spawn(async move {
410
411
412
            match Self::execute_prefill(router, prefill_request, target_worker, Some(phase_permit))
                .await
            {
413
414
415
416
417
418
419
420
421
422
                Ok(_) => {
                    tracing::debug!("Prefill background task completed");
                }
                Err(e) => {
                    tracing::warn!("Prefill background task error: {e:?}");
                }
            }
        });
    }

423
424
425
426
    /// Call the prefill router and extract structured prefill result and worker ID.
    ///
    /// This is the synchronous prefill path - we wait for prefill to complete before proceeding.
    /// No phase permit is needed since `record_worker` completes before we return.
427
428
429
430
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
431
        // For call_prefill path, routing is handled by the router itself (no direct routing needed)
432
433
        // No phase permit needed - we wait for completion before changing phase
        Self::execute_prefill(self.prefill_router.get().cloned(), request, None, None).await
434
435
436
    }
}

437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
/// Cancels the router's cancellation token when the router is dropped.
impl Drop for PrefillRouter {
    fn drop(&mut self) {
        tracing::debug!("Dropping PrefillRouter, cancelling background activation task");
        // The activation task spawned in `new` selects on this token's
        // `cancelled()` future, so cancelling lets that task exit.
        self.cancel_token.cancel();
    }
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
        SingleIn<PreprocessedRequest>,
        ManyOut<Annotated<LLMEngineOutput>>,
    > for PrefillRouter
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<LLMEngineOutput>>,
    ) -> Result<ManyOut<Annotated<LLMEngineOutput>>> {
        // Extract request data while preserving context
459
        let (mut req, context) = request.into_parts();
460
        let request_id = context.id().to_string();
461
        let engine_ctx = context.context();
462

Yan Ru Pei's avatar
Yan Ru Pei committed
463
464
465
        // Save original max_tokens for decode
        let original_max_tokens = req.stop_conditions.max_tokens;

466
467
        // If prefill router is not activated, skip directly to decode
        if self.prefill_router.get().is_none() {
468
469
470
471
472
473
474
475
476
477
478
479
            if self.enforce_disagg {
                return Err(anyhow::anyhow!(PrefillError::NotActivated));
            }
            return next.generate(context.map(|_| req)).await;
        }

        // Ensure tracker exists for routing decisions in disaggregated mode.
        // Create one if not provided by the upstream DeltaGenerator.
        if req.tracker.is_none() {
            req.tracker = Some(Arc::new(RequestTracker::new()));
        }
        let tracker = req.tracker.as_ref().unwrap();
480
        let prefill_phase_permit = tracker.set_phase(RequestPhase::Prefill).await;
481
482
483
        tracker.record_prefill_start();

        // Prepare prefill request with max_tokens = 1 (clone after tracker is set)
Yan Ru Pei's avatar
Yan Ru Pei committed
484
485
        let mut prefill_req = req.clone();
        prefill_req.stop_conditions.max_tokens = Some(1);
486

487
488
        // Try build_bootstrap_info optimization: if we can get bootstrap info upfront,
        // spawn prefill in background and proceed to decode immediately.
489
490
491
492
        let preselected_worker = prefill_req
            .routing
            .as_ref()
            .and_then(|r| r.prefill_worker_id);
493

494
495
496
        let prefill_result = if let Some((worker_id, dp_rank, bootstrap_info)) = self
            .build_bootstrap_info(&prefill_req, preselected_worker)
            .await
497
498
        {
            // Bootstrap optimization path: spawn prefill in background
499
500
501
502
503
504
505
506
            // We successfully used the peeked worker, so we must now advance the router state
            // to ensure the next request gets a different worker.
            if !self.router_mode.is_kv_routing()
                && let Some(router) = self.prefill_router.get()
            {
                router.select_next_worker();
            }

507
508
509
510
            let routing = prefill_req.routing_mut();
            routing.prefill_worker_id = Some(worker_id);
            routing.dp_rank = Some(dp_rank);
            prefill_req.bootstrap_info = Some(bootstrap_info.clone());
511

512
513
            let prefill_context = Context::with_id(prefill_req, request_id.clone());
            engine_ctx.link_child(prefill_context.context());
514

515
516
517
            // Pass phase permit to spawned task - it drops after first output (record_worker complete)
            // This allows set_phase(Decode) below to proceed only after prefill routing is done
            self.spawn_prefill_task(prefill_context, Some(worker_id), prefill_phase_permit);
518

519
            Ok((None, Some(worker_id), Some(bootstrap_info)))
520
        } else {
521
            // Original prefill path: wait for prefill to complete
522
523
524
525
526
            tracing::debug!("Using original prefill path");

            // Drop the phase permit before calling call_prefill - we wait for completion
            // so there's no race with set_phase(Decode) below
            drop(prefill_phase_permit);
527

528
529
            let prefill_context = Context::with_id(prefill_req, request_id.clone());
            engine_ctx.link_child(prefill_context.context());
530

531
532
533
            self.call_prefill(prefill_context)
                .await
                .map(|(result, worker_id)| (Some(result), worker_id, None))
534
        };
535
536
537
538
539
540
541
542
543
544
545
546

        // Abort if cancelled during prefill
        if engine_ctx.is_stopped() || engine_ctx.is_killed() {
            tracing::debug!("Abort entering decode after context is stopped or killed");
            return Err(anyhow::anyhow!(
                "Context id {} is stopped or killed",
                engine_ctx.id()
            ));
        }

        // Handle prefill result
        match prefill_result {
547
548
            Ok((maybe_prefill_result, _prefill_worker_id, bootstrap_info)) => {
                tracing::debug!("Prefill completed, proceeding to decode");
549

550
551
552
                // Set phase to Decode for the decode request.
                // In bootstrap path, this blocks until the spawned prefill task drops its permit
                // (after first output / record_worker completes), ensuring correct phase for routing.
553
                if let Some(ref tracker) = req.tracker {
554
555
                    let _decode_permit = tracker.set_phase(RequestPhase::Decode).await;
                    // Permit is dropped immediately - decode proceeds, no need to hold it
556
557
                }

558
                let mut decode_req = req;
559

560
                // Update request with prefill result
561
                if let Some(prefill_result) = maybe_prefill_result {
562
563
564
                    decode_req.prefill_result = Some(prefill_result);
                }

Yan Ru Pei's avatar
Yan Ru Pei committed
565
566
                // Restore original max_tokens for decode
                decode_req.stop_conditions.max_tokens = original_max_tokens;
567

568
569
570
571
572
                // Inject bootstrap_info for decode worker
                if let Some(info) = bootstrap_info {
                    decode_req.bootstrap_info = Some(info);
                }

573
574
575
576
577
578
579
580
                // Set router_config_override for decode: overlap_score_weight = 0
                let existing_override = decode_req.router_config_override.take();
                decode_req.router_config_override = Some(RouterConfigOverride {
                    overlap_score_weight: Some(0.0),
                    ..existing_override.unwrap_or_default()
                });

                // Map the modified request through with preserved context
581
                let decode_request = context.map(|_| decode_req);
582
583
                next.generate(decode_request).await
            }
584
585
586
587
588
589
590
591
592
593
            Err(PrefillError::NotActivated) => {
                if self.enforce_disagg {
                    tracing::error!(
                        "Prefill router not activated, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(PrefillError::NotActivated));
                }
                tracing::debug!("Prefill router not activated, falling back to decode-only");
                next.generate(context.map(|_| req)).await
            }
594
            Err(e) => {
595
596
597
598
599
600
601
                if self.enforce_disagg {
                    tracing::error!(
                        error = %e,
                        "Remote prefill failed, but disaggregated mode is enforced. Failing request."
                    );
                    return Err(anyhow::anyhow!(e));
                }
602
603
604
605
                tracing::warn!(
                    error = %e,
                    "Remote prefill failed, falling back to decode-only. This may impact performance in disaggregated deployments. Verify prefill workers are healthy and accessible."
                );
606
607
608
609
610
                next.generate(context.map(|_| req)).await
            }
        }
    }
}