migration.rs 51.4 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
// SPDX-License-Identifier: Apache-2.0

4
use std::error::Error as StdError;
5
6
7
8
9
10
use std::sync::Arc;

use anyhow::{Error, Result};
use futures::{stream, stream::StreamExt};

use crate::{
11
    http::service::metrics::Metrics, model_card::ModelDeploymentCard, preprocessor::BackendOutput,
12
    protocols::common::llm_backend::PreprocessedRequest,
13
14
};

15
use dynamo_runtime::error::{self, BackendError, DynamoError, ErrorType};
16
17
use dynamo_runtime::pipeline::{
    AsyncEngineContext, AsyncEngineContextProvider, Context, ManyOut, Operator, ResponseStream,
18
    ServerStreamingEngine, SingleIn, async_trait,
19
};
20
use dynamo_runtime::protocols::{annotated::Annotated, maybe_error::MaybeError};
21

22
23
24
25
26
27
28
29
/// Check if an error chain indicates the request should be migrated.
fn is_migratable(err: &(dyn StdError + 'static)) -> bool {
    const MIGRATABLE: &[ErrorType] = &[
        ErrorType::CannotConnect,
        ErrorType::Disconnected,
        ErrorType::ConnectionTimeout,
        ErrorType::Backend(BackendError::EngineShutdown),
    ];
30
    const NON_MIGRATABLE: &[ErrorType] = &[ErrorType::Cancelled, ErrorType::ResourceExhausted];
31
32
33
    error::match_error_chain(err, MIGRATABLE, NON_MIGRATABLE)
}

34
35
pub struct Migration {
    migration_limit: u32,
36
    max_seq_len: Option<u32>,
37
38
    model_name: Arc<String>,
    metrics: Arc<Metrics>,
39
40
41
}

impl Migration {
42
43
44
45
46
47
48
49
50
51
52
53
    pub fn new(
        migration_limit: u32,
        max_seq_len: Option<u32>,
        model_name: String,
        metrics: Arc<Metrics>,
    ) -> Arc<Self> {
        tracing::debug!(
            "model {} migration limit {} max_seq_len {:?}",
            model_name,
            migration_limit,
            max_seq_len
        );
54
        Arc::new(Self {
55
            migration_limit,
56
            max_seq_len,
57
            model_name: Arc::new(model_name),
58
            metrics,
59
        })
60
    }
61
62
63
64

    pub fn from_mdc(
        mdc: &ModelDeploymentCard,
        migration_limit: u32,
65
        max_seq_len: Option<u32>,
66
67
        metrics: Arc<Metrics>,
    ) -> Arc<Self> {
68
69
70
71
72
73
        Self::new(
            migration_limit,
            max_seq_len,
            mdc.display_name.clone(),
            metrics,
        )
74
    }
75
76
77
78
79
80
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
81
        ManyOut<Annotated<BackendOutput>>,
82
        SingleIn<PreprocessedRequest>,
83
        ManyOut<Annotated<BackendOutput>>,
84
85
86
87
88
    > for Migration
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
89
90
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
    ) -> Result<ManyOut<Annotated<BackendOutput>>> {
91
92
        let (preprocessed_request, context) = request.transfer(());
        let engine_ctx = context.context();
93
        let engine_ctx_ = engine_ctx.clone();
94
95
96
97
98
        let retry_manager = RetryManager::build(
            engine_ctx,
            preprocessed_request,
            next,
            self.migration_limit,
99
            self.max_seq_len,
100
101
102
103
            self.model_name.clone(),
            self.metrics.clone(),
        )
        .await?;
104
105
106
107
108
        let response_stream = stream::unfold(retry_manager, move |mut retry_manager| async move {
            retry_manager
                .next()
                .await
                .map(|response| (response, retry_manager))
109
110
        })
        .fuse();
111
        Ok(ResponseStream::new(Box::pin(response_stream), engine_ctx_))
112
113
114
115
    }
}

struct RetryManager {
116
    context: Arc<dyn AsyncEngineContext>,
117
    request: PreprocessedRequest,
118
119
    next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
    next_stream: Option<ManyOut<Annotated<BackendOutput>>>,
120
    retries_left: u32,
121
    max_seq_len: Option<u32>,
122
123
    model_name: Arc<String>,
    metrics: Arc<Metrics>,
124
125
126
127
}

impl RetryManager {
    pub async fn build(
128
        context: Arc<dyn AsyncEngineContext>,
129
        preprocessed_request: PreprocessedRequest,
130
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
131
        mut retries_left: u32,
132
        max_seq_len: Option<u32>,
133
134
        model_name: Arc<String>,
        metrics: Arc<Metrics>,
135
    ) -> Result<Self> {
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
        // Disable migration for structured-output (guided-decoding) requests.
        // Inference backends initialize the guided-decoding FSM (finite state machine) fresh
        // for every new request and only advance it on newly-generated tokens, not on
        // context/prompt tokens. Migrating a partial structured-output response would replay
        // already-generated tokens as context, causing the FSM to restart from the schema
        // root and producing duplicated or nested JSON. This applies to all backends
        // (vLLM, SGLang, TRT-LLM) equally. Propagate the error cleanly instead.
        if preprocessed_request
            .sampling_options
            .guided_decoding
            .is_some()
        {
            if retries_left > 0 {
                tracing::warn!(
                    "Guided-decoding request: migration disabled — FSM state is not transferable (applies to all backends)"
                );
            }
            retries_left = 0;
        }
155
        let mut slf = Self {
156
            context,
157
158
159
160
            request: preprocessed_request,
            next_generate: next,
            next_stream: None,
            retries_left: retries_left + 1, // +1 to account for the initial attempt
161
            max_seq_len,
162
163
            model_name,
            metrics,
164
165
        };
        slf.new_stream().await?;
166
        slf.exceed_max_seq_len(0); // disable migration if prompt len > max_seq_len
167
168
169
        Ok(slf)
    }

170
    pub async fn next(&mut self) -> Option<Annotated<BackendOutput>> {
171
172
173
174
175
        loop {
            let response_stream = match self.next_stream.as_mut() {
                Some(stream) => stream,
                None => {
                    tracing::error!("next() called with next_stream is None - should not happen");
176
                    return Some(Annotated::from_err(DynamoError::msg("next_stream is None")));
177
178
179
                }
            };
            if let Some(response) = response_stream.next().await {
180
                // Check if this is a migratable error that should trigger stream recreation.
181
                if let Some(err) = response.err()
182
                    && is_migratable(&err)
183
                {
184
                    tracing::warn!("Stream disconnected... recreating stream... {}", err);
185
                    self.metrics.inc_migration_ongoing_request(&self.model_name);
186
187
188
189
                    if let Err(err) = self.new_stream().await {
                        tracing::warn!("Cannot recreate stream: {:#}", err);
                    } else {
                        continue;
190
191
192
193
194
195
196
197
198
199
                    }
                }
                self.track_response(&response);
                return Some(response);
            }
            return None;
        }
    }

    async fn new_stream(&mut self) -> Result<()> {
200
        let mut response_stream: Option<Result<ManyOut<Annotated<BackendOutput>>>> = None;
201
202
        while self.retries_left > 0 {
            self.retries_left -= 1;
203
204
            let request = Context::with_id(self.request.clone(), self.context.id().to_string());
            self.context.link_child(request.context());
205
206
            if self.context.is_stopped() || self.context.is_killed() {
                tracing::debug!("Abort creating new stream after context is stopped or killed");
207
208
209
210
211
212
213
214
                return Err(DynamoError::builder()
                    .error_type(ErrorType::Cancelled)
                    .message(format!(
                        "Context id {} is stopped or killed",
                        self.context.id()
                    ))
                    .build()
                    .into());
215
            }
216
            response_stream = Some(self.next_generate.generate(request).await);
217
            if let Some(err) = response_stream.as_ref().unwrap().as_ref().err()
218
                && is_migratable(err.as_ref())
219
            {
220
                tracing::warn!("Creating new stream... retrying... {}", err);
221
                self.metrics.inc_migration_new_request(&self.model_name);
222
                continue;
223
224
225
226
227
228
229
230
            }
            break;
        }
        match response_stream {
            Some(Ok(next_stream)) => {
                self.next_stream = Some(next_stream);
                Ok(())
            }
231
            Some(Err(err)) => Err(err), // should propagate original error if any
232
            None => Err(Error::msg(
233
                "Migration limit exhausted", // should propagate original error if any
234
235
236
237
            )),
        }
    }

238
    fn track_response(&mut self, response: &Annotated<BackendOutput>) {
239
240
241
242
243
244
245
        if self.retries_left == 0 {
            return;
        }
        let llm_engine_output = match response.data.as_ref() {
            Some(output) => output,
            None => return,
        };
246
247
248
        if self.exceed_max_seq_len(llm_engine_output.token_ids.len() as u32) {
            return;
        }
249
250
251
252
        if let Some(max_tokens) = self.request.stop_conditions.max_tokens {
            self.request.stop_conditions.max_tokens =
                Some(max_tokens.saturating_sub(llm_engine_output.token_ids.len() as u32));
        }
253
254
255
256
        for token_id in llm_engine_output.token_ids.iter() {
            self.request.token_ids.push(*token_id);
        }
    }
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277

    /// Returns `true` if the tracked request token length plus `new_output_len`
    /// exceeds the configured max_seq_len, in which case migration is disabled.
    fn exceed_max_seq_len(&mut self, new_output_len: u32) -> bool {
        if let Some(max_seq_len) = self.max_seq_len {
            let total_len = self.request.token_ids.len() as u32 + new_output_len;
            if total_len > max_seq_len {
                tracing::warn!(
                    "Sequence length {} exceeds migration max_seq_len {}, \
                     disabling migration",
                    total_len,
                    max_seq_len
                );
                self.metrics
                    .inc_migration_max_seq_len_exceeded(&self.model_name);
                self.retries_left = 0; // disable migration
                return true;
            }
        }
        false
    }
278
279
280
281
282
}

#[cfg(test)]
mod tests {
    use super::*;
283
    use crate::http::service::metrics::Metrics;
284
285
286
    use crate::protocols::common::{
        GuidedDecodingOptions, OutputOptions, SamplingOptions, StopConditions,
    };
287
    use dynamo_runtime::error::{DynamoError, ErrorType};
288
    use dynamo_runtime::pipeline::AsyncEngine;
289
    use dynamo_runtime::pipeline::context::Controller;
290
291
292
    use std::sync::atomic::{AtomicU32, Ordering};
    use tokio::sync::mpsc;

293
294
    const TEST_MODEL: &str = "test-model";

295
    // Helper to create a mock preprocessed request
296
    fn create_mock_request(max_tokens: u32) -> PreprocessedRequest {
297
298
299
300
        PreprocessedRequest::builder()
            .model("mock".to_string())
            .token_ids(vec![1, 2, 3])
            .stop_conditions(StopConditions {
301
302
                max_tokens: Some(max_tokens),
                ..Default::default()
303
304
305
306
307
308
309
            })
            .sampling_options(SamplingOptions::default())
            .output_options(OutputOptions::default())
            .eos_token_ids(vec![])
            .annotations(vec![])
            .build()
            .unwrap()
310
311
312
    }

    // Helper to create mock LLM engine output
313
314
    fn create_mock_output(token_id: u32) -> Annotated<BackendOutput> {
        Annotated::from_data(BackendOutput {
315
            token_ids: vec![token_id],
316
317
            tokens: vec![],
            text: Some(format!("token_{token_id}")),
318
319
            cum_log_probs: None,
            log_probs: None,
Greg Clark's avatar
Greg Clark committed
320
            top_logprobs: None,
321
            finish_reason: None,
322
            stop_reason: None,
323
            index: None,
324
            disaggregated_params: None,
325
            completion_usage: None,
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
        })
    }

    #[derive(Debug, Clone)]
    enum MockBehavior {
        /// Always succeeds with all responses
        Success,
        /// Fails on first call with NoResponders error, then succeeds on subsequent calls
        FailThenSuccess,
        /// Succeeds initially, fails mid-stream with specific error, then succeeds on retry
        MidStreamFail { fail_after: usize },
        /// Succeeds initially, fails mid-stream with specific error, then always fails on retry attempts
        MidStreamFailAlways { fail_after: usize },
        /// Succeeds initially, fails mid-stream, then always fails with stream error on retry attempts
        MidStreamFailAlwaysStreamError { fail_after: usize },
        /// Always fails with NoResponders error (same as FailThenSuccess first call)
        AlwaysFail,
    }

    // Unified mock server streaming engine that can simulate different scenarios
    struct MockEngine {
        behavior: MockBehavior,
        num_responses: usize,
        token_offset: u32,
        call_count: Arc<AtomicU32>,
351
        context_id: String,
352
353
354
    }

    impl MockEngine {
355
356
357
358
359
360
        fn new(
            behavior: MockBehavior,
            num_responses: usize,
            token_offset: u32,
            context_id: String,
        ) -> Self {
361
362
363
364
365
            Self {
                behavior,
                num_responses,
                token_offset,
                call_count: Arc::new(AtomicU32::new(0)),
366
                context_id,
367
368
369
370
371
372
            }
        }
    }

    #[async_trait]
    impl
373
374
        AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<BackendOutput>>, anyhow::Error>
        for MockEngine
375
376
377
378
    {
        async fn generate(
            &self,
            request: SingleIn<PreprocessedRequest>,
379
        ) -> Result<ManyOut<Annotated<BackendOutput>>> {
380
            let call_num = self.call_count.fetch_add(1, Ordering::SeqCst);
381
382
383
384
385
386
387
388
            let (preprocessed_request, context) = request.transfer(());

            // Assert that the context_id matches the expected one
            assert_eq!(
                context.id().to_string(),
                self.context_id,
                "Context ID mismatch"
            );
389
390
391
392
393
394
395
396

            // Calculate how many responses we've already generated based on request token_ids
            // Initial request has [1, 2, 3], so anything beyond that are generated responses
            let initial_tokens = 3; // [1, 2, 3]
            let responses_already_generated = preprocessed_request
                .token_ids
                .len()
                .saturating_sub(initial_tokens);
397
398
399
400
401
402
403
404
405
406
407
408

            // Assert that max_tokens reflects the expected remaining tokens
            let expected_max_tokens =
                self.num_responses
                    .saturating_sub(responses_already_generated) as u32;
            assert_eq!(
                preprocessed_request.stop_conditions.max_tokens,
                Some(expected_max_tokens),
                "max_tokens should be {} but got {:?}",
                expected_max_tokens,
                preprocessed_request.stop_conditions.max_tokens
            );
409
410
411
412
413
414
415
416
417
418

            match &self.behavior {
                MockBehavior::Success => {
                    // Always succeed with remaining responses
                    self.send_responses(responses_already_generated, self.num_responses)
                        .await
                }
                MockBehavior::FailThenSuccess => {
                    if call_num == 0 {
                        // First call - return "No responders available" error to trigger retry
419
420
421
422
423
424
                        return Err(anyhow::anyhow!(
                            DynamoError::builder()
                                .error_type(ErrorType::CannotConnect)
                                .message("no responders")
                                .build()
                        ));
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
                    } else {
                        // Subsequent calls - succeed with remaining responses
                        self.send_responses(responses_already_generated, self.num_responses)
                            .await
                    }
                }
                MockBehavior::MidStreamFail { fail_after } => {
                    let (tx, rx) = mpsc::channel(1);
                    let token_offset = self.token_offset;
                    let fail_after = *fail_after;
                    let num_responses = self.num_responses;

                    if call_num == 0 {
                        // First call - send some responses then an error to simulate disconnection
                        tokio::spawn(async move {
                            // Send responses from current position to fail_after
                            for i in responses_already_generated..fail_after.min(num_responses) {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                            // Send the specific error that triggers retry logic
448
449
450
451
452
453
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
                            let _ = tx.send(error_response).await;
                        });
                    } else {
                        // Second call - send remaining responses from where we left off
                        tokio::spawn(async move {
                            for i in responses_already_generated..num_responses {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                        });
                    }

                    let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
469
                    let ctx = Arc::new(Controller::new(self.context_id.clone()));
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
                    Ok(dynamo_runtime::pipeline::ResponseStream::new(
                        Box::pin(stream),
                        ctx,
                    ))
                }
                MockBehavior::MidStreamFailAlways { fail_after } => {
                    if call_num == 0 {
                        // First call - send some responses then an error to simulate disconnection
                        let (tx, rx) = mpsc::channel(1);
                        let token_offset = self.token_offset;
                        let fail_after = *fail_after;
                        let num_responses = self.num_responses;

                        tokio::spawn(async move {
                            // Send responses from current position to fail_after
                            for i in responses_already_generated..fail_after.min(num_responses) {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                            // Send the specific error that triggers retry logic
492
493
494
495
496
497
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
498
499
500
501
                            let _ = tx.send(error_response).await;
                        });

                        let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
502
                        let ctx = Arc::new(Controller::new(self.context_id.clone()));
503
504
505
506
507
508
                        Ok(dynamo_runtime::pipeline::ResponseStream::new(
                            Box::pin(stream),
                            ctx,
                        ))
                    } else {
                        // Subsequent calls - always fail with NoResponders error (same as AlwaysFail)
509
510
511
512
513
514
                        Err(anyhow::anyhow!(
                            DynamoError::builder()
                                .error_type(ErrorType::CannotConnect)
                                .message("no responders")
                                .build()
                        ))
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
                    }
                }
                MockBehavior::MidStreamFailAlwaysStreamError { fail_after } => {
                    let (tx, rx) = mpsc::channel(1);
                    let token_offset = self.token_offset;
                    let fail_after = *fail_after;
                    let num_responses = self.num_responses;

                    if call_num == 0 {
                        // First call - send some responses then an error to simulate disconnection
                        tokio::spawn(async move {
                            // Send responses from current position to fail_after
                            for i in responses_already_generated..fail_after.min(num_responses) {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                            // Send the specific error that triggers retry logic
534
535
536
537
538
539
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
540
541
542
543
                            let _ = tx.send(error_response).await;
                        });

                        let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
544
                        let ctx = Arc::new(Controller::new(self.context_id.clone()));
545
546
547
548
549
550
551
552
                        Ok(dynamo_runtime::pipeline::ResponseStream::new(
                            Box::pin(stream),
                            ctx,
                        ))
                    } else {
                        // Subsequent calls - immediately send stream error (no successful responses)
                        tokio::spawn(async move {
                            // Send the stream error immediately
553
554
555
556
557
558
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
559
560
561
562
                            let _ = tx.send(error_response).await;
                        });

                        let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
563
                        let ctx = Arc::new(Controller::new(self.context_id.clone()));
564
565
566
567
568
569
570
571
                        Ok(dynamo_runtime::pipeline::ResponseStream::new(
                            Box::pin(stream),
                            ctx,
                        ))
                    }
                }
                MockBehavior::AlwaysFail => {
                    // Always fail with NoResponders error (same as FailThenSuccess first call)
572
573
574
575
576
577
                    Err(anyhow::anyhow!(
                        DynamoError::builder()
                            .error_type(ErrorType::CannotConnect)
                            .message("no responders")
                            .build()
                    ))
578
579
580
581
582
583
584
585
586
587
                }
            }
        }
    }

    impl MockEngine {
        async fn send_responses(
            &self,
            start: usize,
            end: usize,
588
        ) -> Result<ManyOut<Annotated<BackendOutput>>> {
589
590
591
592
593
594
595
596
597
598
599
600
601
            let (tx, rx) = mpsc::channel(1);
            let token_offset = self.token_offset;

            tokio::spawn(async move {
                for i in start..end {
                    let response = create_mock_output(token_offset + 1 + i as u32);
                    if tx.send(response).await.is_err() {
                        break;
                    }
                }
            });

            let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
602
            let ctx = Arc::new(Controller::new(self.context_id.clone()));
603
604
605
606
607
608
609
610
611
612
613
614
615
            Ok(dynamo_runtime::pipeline::ResponseStream::new(
                Box::pin(stream),
                ctx,
            ))
        }
    }

    /// Test case 1: No migration needed
    /// Tests the normal case where the RetryManager successfully processes all responses
    /// from a single stream without any failures or need for retries/migration.
    /// Expected behavior: All 10 responses should be received successfully.
    #[tokio::test]
    async fn test_retry_manager_no_migration() {
616
        dynamo_runtime::logging::init();
617
        let context_id = uuid::Uuid::new_v4().to_string();
618
        let request = create_mock_request(10);
619
620
621
622
623
624
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::Success,
            10,
            100,
            context_id.clone(),
        ));
625
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
626
627
            mock_engine;

628
        let ctx = Arc::new(Controller::new(context_id.clone()));
629
630
631
632
633
634
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            0,
635
            None,
636
637
638
639
640
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");
641
642
643
644
645
646
647
648
649
650
651
652
653

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        assert_eq!(responses.len(), 10);
        for (i, response) in responses.iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103, ..., 110
            }
        }
654
655
656

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
657
658
659
660
661
662
663
664
665
666
    }

    /// Test case 2: New request migration
    /// Tests the scenario where a worker becomes unreachable for new requests initially,
    /// triggering the RetryManager to retry the request. The MockEngine with FailThenSuccess
    /// fails on the first call with a "No responders available" error, then succeeds
    /// on subsequent calls, simulating a worker becoming available after initial failure.
    /// Expected behavior: All 10 responses should be received successfully after retry.
    #[tokio::test]
    async fn test_retry_manager_new_request_migration() {
667
        dynamo_runtime::logging::init();
668
        let context_id = uuid::Uuid::new_v4().to_string();
669
        let request = create_mock_request(10);
670
671
672
673
674
675
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::FailThenSuccess,
            10,
            100,
            context_id.clone(),
        ));
676
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
677
678
            mock_engine;

679
        let ctx = Arc::new(Controller::new(context_id.clone()));
680
681
682
683
684
685
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
686
            None,
687
688
689
690
691
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");
692
693
694
695
696
697
698
699
700
701
702
703
704

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        assert_eq!(responses.len(), 10);
        for (i, response) in responses.iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103, ..., 110
            }
        }
705
706
707

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 1);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
708
709
710
711
712
713
714
715
716
717
    }

    /// Test case 3: Ongoing request migration
    /// Tests the scenario where a worker fails mid-stream during an ongoing request.
    /// This simulates a connection being lost after partial response delivery, requiring
    /// the RetryManager to detect the failure (via "Stream ended before generation completed" error),
    /// create a new stream, and continue from where it left off.
    /// Expected behavior: 5 responses from first stream + 5 responses from retry stream = 10 total.
    #[tokio::test]
    async fn test_retry_manager_ongoing_request_migration() {
718
719
        dynamo_runtime::logging::init();

720
        let context_id = uuid::Uuid::new_v4().to_string();
721
        let request = create_mock_request(10);
722
723
724
725
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 5 },
            10,
            100,
726
            context_id.clone(),
727
        ));
728
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
729
730
            mock_engine;

731
        let ctx = Arc::new(Controller::new(context_id.clone()));
732
733
734
735
736
737
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
738
            None,
739
740
741
742
743
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Should have received all 10 responses (5 from first stream + 5 from second stream)
        assert_eq!(responses.len(), 10);

        // Check that we received responses from both streams
        for (i, response) in responses.iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103, ..., 110
            }
        }
760
761
762

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);
763
764
765
766
767
768
769
770
    }

    /// Test case 4: New request migration - indefinite failure
    /// Tests the scenario where a worker becomes unreachable for new requests indefinitely.
    /// The RetryManager should exhaust all retries and return the original error from the first attempt.
    /// Expected behavior: Should receive an error after all retries are exhausted, with the original error.
    #[tokio::test]
    async fn test_retry_manager_new_request_migration_indefinite_failure() {
771
        dynamo_runtime::logging::init();
772
        let context_id = uuid::Uuid::new_v4().to_string();
773
        let request = create_mock_request(0);
774
775
776
777
778
779
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::AlwaysFail,
            0,
            100,
            context_id.clone(),
        ));
780
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
781
782
783
            mock_engine;

        // Should fail to build due to initial stream creation failure after exhausting all 3 retries
784
        let ctx = Arc::new(Controller::new(context_id.clone()));
785
786
787
788
789
790
        let metrics = Arc::new(Metrics::new());
        let retry_manager_result = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
791
            None,
792
793
794
795
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await;
796
797
798
799
800

        assert!(retry_manager_result.is_err());
        if let Err(error) = retry_manager_result {
            assert!(error.to_string().contains("no responders"));
        }
801
802
803

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 4); // 3 retries + 1 final failure
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
804
805
806
807
808
809
810
811
    }

    /// Test case 5: Ongoing request migration - indefinite failure
    /// Tests the scenario where a worker fails mid-stream indefinitely during ongoing requests.
    /// The RetryManager should exhaust all retries and return the original stream disconnection error.
    /// Expected behavior: Should receive some responses from first stream, then error after retries exhausted.
    #[tokio::test]
    async fn test_retry_manager_ongoing_request_migration_indefinite_failure() {
812
        dynamo_runtime::logging::init();
813
        let context_id = uuid::Uuid::new_v4().to_string();
814
        let request = create_mock_request(10);
815
816
817
818
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFailAlways { fail_after: 3 },
            10,
            100,
819
            context_id.clone(),
820
        ));
821
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
822
823
            mock_engine;

824
        let ctx = Arc::new(Controller::new(context_id.clone()));
825
826
827
828
829
830
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
831
            None,
832
833
834
835
836
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        ) // 3 retries
        .await
        .expect("Failed to build RetryManager");
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855

        let mut responses = Vec::new();

        // Collect all responses (both successful and error responses)
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Should have received 4 total responses: 3 successful + 1 error
        assert_eq!(responses.len(), 4);

        // First 3 responses should be successful with tokens 101, 102, 103
        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103
            }
        }

856
        // 4th response should be a Disconnected error after retries are exhausted
857
        let error_response = &responses[3];
858
859
        let err = error_response.err().expect("expected error response");
        assert_eq!(err.error_type(), ErrorType::Disconnected);
860
861
862

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 3); // 2 retries + 1 final failure
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1); // initial ongoing failure retry
863
864
865
866
867
868
869
870
    }

    /// Test case 6: Ongoing request migration - indefinite failure with stream errors
    /// Tests the scenario where a worker fails mid-stream indefinitely during ongoing requests,
    /// and all retry attempts also fail with stream errors instead of NATS errors.
    /// Expected behavior: Should receive some responses from first stream, then error after retries exhausted.
    #[tokio::test]
    async fn test_retry_manager_ongoing_request_migration_indefinite_failure_stream_error() {
871
        dynamo_runtime::logging::init();
872
        let context_id = uuid::Uuid::new_v4().to_string();
873
        let request = create_mock_request(10);
874
875
876
877
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFailAlwaysStreamError { fail_after: 3 },
            10,
            100,
878
            context_id.clone(),
879
        ));
880
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
881
882
            mock_engine;

883
        let ctx = Arc::new(Controller::new(context_id.clone()));
884
885
886
887
888
889
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
890
            None,
891
892
893
894
895
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        ) // 3 retries
        .await
        .expect("Failed to build RetryManager");
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914

        let mut responses = Vec::new();

        // Collect all responses (both successful and error responses)
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Should have received 4 total responses: 3 successful + 1 error
        assert_eq!(responses.len(), 4);

        // First 3 responses should be successful with tokens 101, 102, 103
        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103
            }
        }

915
        // 4th response should be a Disconnected error after retries are exhausted
916
        let error_response = &responses[3];
917
918
        let err = error_response.err().expect("expected error response");
        assert_eq!(err.error_type(), ErrorType::Disconnected);
919
920
921

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 4); // 3 retries + 1 final failure
922
    }
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938

    /// Test case 7: Request cancelled when creating new stream
    /// Tests the scenario where context.stop_generating() is called when creating a new stream.
    /// The RetryManager should detect that the context is stopped and abort creating new streams.
    /// Expected behavior: Should fail to build RetryManager with "Context is stopped or killed" error.
    #[tokio::test]
    async fn test_retry_manager_context_stopped_before_stream() {
        dynamo_runtime::logging::init();
        let context_id = uuid::Uuid::new_v4().to_string();
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::Success,
            10,
            100,
            context_id.clone(),
        ));
939
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
940
941
942
943
944
945
946
947
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));

        // Stop the context before building RetryManager
        ctx.stop_generating();

        // Should fail to build due to stopped context
948
949
950
951
952
953
        let metrics = Arc::new(Metrics::new());
        let retry_manager_result = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
954
            None,
955
956
957
958
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await;
959
960
961
962
963
964
965
966

        assert!(retry_manager_result.is_err());
        if let Err(error) = retry_manager_result {
            assert!(
                error
                    .to_string()
                    .contains(&format!("Context id {} is stopped or killed", context_id))
            );
967
968
969
970
971
972
973
974
975
            // Verify the error is a typed DynamoError with Cancelled type
            let dynamo_err = error
                .downcast_ref::<DynamoError>()
                .expect("Error should be a DynamoError");
            assert_eq!(
                dynamo_err.error_type(),
                ErrorType::Cancelled,
                "Stopped/killed context should produce a Cancelled error"
            );
976
        }
977
978
979

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
980
    }
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031

    /// Test case 8: No migration for guided-decoding (structured-output) requests
    ///
    /// Bug (#7634): When a worker crashes mid-stream during a structured-output
    /// (json_schema) request, migration appends already-generated token IDs back onto
    /// token_ids and replays the request to a new worker. However, backends initialize
    /// the guided-decoding FSM fresh for every new request and only advance it on newly-
    /// generated tokens — not on context/prompt tokens. This causes the FSM to restart
    /// from the schema root while treating already-generated tokens as context, producing
    /// duplicated or nested JSON in the final response.
    ///
    /// Fix: Disable migration for structured-output requests by zeroing retries_left in
    /// RetryManager::build() when guided_decoding is set, propagating the error cleanly.
    ///
    /// Expected behavior BEFORE fix: All 10 responses received (migration happened — wrong)
    /// Expected behavior AFTER fix: 3 successful + 1 error (migration blocked — correct)
    #[tokio::test]
    async fn test_retry_manager_no_migration_for_guided_decoding() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        let mut request = create_mock_request(10);
        // Set guided decoding (json_schema structured output) on the request
        request.sampling_options.guided_decoding = Some(GuidedDecodingOptions::new(
            Some(serde_json::json!({"type": "object", "properties": {"name": {"type": "string"}}})),
            None,
            None,
            None,
            None,
            None,
        ));

        // MidStreamFail after 3 tokens: without the fix, migration would succeed and
        // deliver all 10 responses; with the fix, migration is blocked and an error
        // is returned after the 3 partial responses.
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 3 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3, // migration_limit=3 — should be ignored for guided-decoding requests
1032
            None,
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Must receive 3 successful tokens + 1 Disconnected error, NOT all 10.
        // Before the fix this assertion fails because migration proceeds and returns 10.
        assert_eq!(
            responses.len(),
            4,
            "Expected 3 successful + 1 error response (migration must be blocked for \
             guided-decoding), but got {} responses",
            responses.len()
        );

        // First 3 responses should be successful
        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(
                response.err().is_none(),
                "Response {} should be successful",
                i
            );
        }

        // Last response must be the stream-disconnection error
        let last = responses.last().unwrap();
        let err = last
            .err()
            .expect("Last response should be a Disconnected error");
        assert_eq!(
            err.error_type(),
            ErrorType::Disconnected,
            "Error type should be Disconnected"
        );
    }
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280

    /// Test case 9: max_seq_len exceeded limit + 1 disables migration
    ///
    /// Boundary test: prompt has 3 tokens, max_seq_len = 5. After 2 generated tokens the
    /// total is 5 (== max_seq_len) — still migratable. The 3rd generated token would push
    /// the total to 6 (> max_seq_len), which disables migration and stops caching.
    /// The failure is placed right at that point (fail_after: 3) so we see the error
    /// propagated instead of retried.
    #[tokio::test]
    async fn test_retry_manager_max_seq_len_exceeded() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        // Prompt = [1, 2, 3] (len 3). max_seq_len = 5.
        // Token 101 → total 4 ≤ 5: tracked.
        // Token 102 → total 5 ≤ 5: tracked.
        // Token 103 → would-be 6 > 5: NOT tracked, migration disabled.
        // Error follows immediately (fail_after: 3) → not retried.
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 3 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
            Some(5), // prompt(3) + 3 generated = 6 > 5 → disables migration
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // 3 successful tokens + 1 Disconnected error (migration disabled at token 103).
        assert_eq!(
            responses.len(),
            4,
            "Expected 3 successful + 1 error (migration disabled by max_seq_len)"
        );

        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none(), "Response {} should be OK", i);
        }

        let err = responses[3]
            .err()
            .expect("Last response should be Disconnected error");
        assert_eq!(err.error_type(), ErrorType::Disconnected);

        // Migration was attempted but blocked because max_seq_len set retries_left to 0.
        // The ongoing metric is still incremented (it counts attempts, not successes).
        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);
        // max_seq_len limit was triggered once (at token 103).
        assert_eq!(
            metrics.get_migration_max_seq_len_exceeded_count(TEST_MODEL),
            1
        );
    }

    /// Test case 10: Migration succeeds when sequence length is at max_seq_len
    ///
    /// Boundary test: prompt has 3 tokens, max_seq_len = 5. After 2 generated tokens
    /// the total is exactly 5 (== max_seq_len). The failure occurs at that point
    /// (fail_after: 2). Because we use strict inequality (> not >=), the request is
    /// still migratable and the retry succeeds.
    #[tokio::test]
    async fn test_retry_manager_max_seq_len_at_limit() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        // Prompt = [1, 2, 3] (len 3). max_seq_len = 5.
        // Token 101 → total 4 ≤ 5: tracked.
        // Token 102 → total 5 == 5: tracked (still migratable — strict >).
        // Error (fail_after: 2) → migration succeeds, retry delivers remaining tokens.
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 2 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
            Some(5), // prompt(3) + 2 generated = 5 == max_seq_len → still migratable
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Migration succeeds — all 10 responses delivered
        assert_eq!(responses.len(), 10);
        for response in &responses {
            assert!(response.err().is_none());
        }

        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);

        // Tracked token_ids must equal exactly max_seq_len (5).
        // The 2 tokens from the first stream were tracked (prompt 3 + gen 2 = 5).
        // After migration the retry stream delivers remaining tokens, but the first
        // new token would push to 6 > 5, so tracking stops and no more are appended.
        assert_eq!(
            retry_manager.request.token_ids.len(),
            5,
            "tracked token_ids should be exactly max_seq_len"
        );

        // The limit was triggered once (first token of the retry stream exceeded 5).
        assert_eq!(
            metrics.get_migration_max_seq_len_exceeded_count(TEST_MODEL),
            1
        );
    }

    /// Test case 11: Prompt length alone exceeds max_seq_len
    ///
    /// When the prompt tokens already exceed max_seq_len, migration is disabled
    /// in RetryManager::build before any tokens are generated. A mid-stream
    /// failure should propagate the error without attempting migration.
    #[tokio::test]
    async fn test_retry_manager_max_seq_len_exceeded_by_prompt() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        // Prompt = [1, 2, 3] (len 3). max_seq_len = 2, so prompt alone exceeds the limit.
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 3 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
            Some(2), // prompt(3) > max_seq_len(2) → migration disabled at build time
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // 3 successful tokens + 1 Disconnected error (migration disabled from the start).
        assert_eq!(
            responses.len(),
            4,
            "Expected 3 successful + 1 error (migration disabled by prompt exceeding max_seq_len)"
        );

        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none(), "Response {} should be OK", i);
        }

        let err = responses[3]
            .err()
            .expect("Last response should be Disconnected error");
        assert_eq!(err.error_type(), ErrorType::Disconnected);

        // max_seq_len was exceeded at build time (prompt len 3 > 2).
        assert_eq!(
            metrics.get_migration_max_seq_len_exceeded_count(TEST_MODEL),
            1
        );
        // Migration was attempted but blocked (retries_left was already 0).
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);
    }
1281
}