migration.rs 50.8 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
// SPDX-License-Identifier: Apache-2.0

4
use std::error::Error as StdError;
5
6
7
8
9
10
use std::sync::Arc;

use anyhow::{Error, Result};
use futures::{stream, stream::StreamExt};

use crate::{
11
    http::service::metrics::Metrics, model_card::ModelDeploymentCard, preprocessor::BackendOutput,
12
    protocols::common::llm_backend::PreprocessedRequest,
13
14
};

15
use dynamo_runtime::error::{self, BackendError, DynamoError, ErrorType};
16
17
use dynamo_runtime::pipeline::{
    AsyncEngineContext, AsyncEngineContextProvider, Context, ManyOut, Operator, ResponseStream,
18
    ServerStreamingEngine, SingleIn, async_trait,
19
};
20
use dynamo_runtime::protocols::{annotated::Annotated, maybe_error::MaybeError};
21

22
23
24
25
26
27
28
29
/// Check if an error chain indicates the request should be migrated.
fn is_migratable(err: &(dyn StdError + 'static)) -> bool {
    const MIGRATABLE: &[ErrorType] = &[
        ErrorType::CannotConnect,
        ErrorType::Disconnected,
        ErrorType::ConnectionTimeout,
        ErrorType::Backend(BackendError::EngineShutdown),
    ];
30
    const NON_MIGRATABLE: &[ErrorType] = &[ErrorType::Cancelled, ErrorType::ResourceExhausted];
31
32
33
    error::match_error_chain(err, MIGRATABLE, NON_MIGRATABLE)
}

34
35
pub struct Migration {
    migration_limit: u32,
36
    max_seq_len: Option<u32>,
37
38
    model_name: Arc<String>,
    metrics: Arc<Metrics>,
39
40
41
}

impl Migration {
42
43
44
45
46
47
48
49
50
51
52
53
    pub fn new(
        migration_limit: u32,
        max_seq_len: Option<u32>,
        model_name: String,
        metrics: Arc<Metrics>,
    ) -> Arc<Self> {
        tracing::debug!(
            "model {} migration limit {} max_seq_len {:?}",
            model_name,
            migration_limit,
            max_seq_len
        );
54
        Arc::new(Self {
55
            migration_limit,
56
            max_seq_len,
57
            model_name: Arc::new(model_name),
58
            metrics,
59
        })
60
    }
61
62
63
64

    pub fn from_mdc(
        mdc: &ModelDeploymentCard,
        migration_limit: u32,
65
        max_seq_len: Option<u32>,
66
67
        metrics: Arc<Metrics>,
    ) -> Arc<Self> {
68
69
70
71
72
73
        Self::new(
            migration_limit,
            max_seq_len,
            mdc.display_name.clone(),
            metrics,
        )
74
    }
75
76
77
78
79
80
}

#[async_trait]
impl
    Operator<
        SingleIn<PreprocessedRequest>,
81
        ManyOut<Annotated<BackendOutput>>,
82
        SingleIn<PreprocessedRequest>,
83
        ManyOut<Annotated<BackendOutput>>,
84
85
86
87
88
    > for Migration
{
    async fn generate(
        &self,
        request: SingleIn<PreprocessedRequest>,
89
90
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
    ) -> Result<ManyOut<Annotated<BackendOutput>>> {
91
92
        let (preprocessed_request, context) = request.transfer(());
        let engine_ctx = context.context();
93
        let engine_ctx_ = engine_ctx.clone();
94
95
96
97
98
        let retry_manager = RetryManager::build(
            engine_ctx,
            preprocessed_request,
            next,
            self.migration_limit,
99
            self.max_seq_len,
100
101
102
103
            self.model_name.clone(),
            self.metrics.clone(),
        )
        .await?;
104
105
106
107
108
        let response_stream = stream::unfold(retry_manager, move |mut retry_manager| async move {
            retry_manager
                .next()
                .await
                .map(|response| (response, retry_manager))
109
110
        })
        .fuse();
111
        Ok(ResponseStream::new(Box::pin(response_stream), engine_ctx_))
112
113
114
115
    }
}

struct RetryManager {
116
    context: Arc<dyn AsyncEngineContext>,
117
    request: PreprocessedRequest,
118
119
    next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
    next_stream: Option<ManyOut<Annotated<BackendOutput>>>,
120
    retries_left: u32,
121
    max_seq_len: Option<u32>,
122
123
    model_name: Arc<String>,
    metrics: Arc<Metrics>,
124
125
126
127
}

impl RetryManager {
    pub async fn build(
128
        context: Arc<dyn AsyncEngineContext>,
129
        preprocessed_request: PreprocessedRequest,
130
        next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
131
        mut retries_left: u32,
132
        max_seq_len: Option<u32>,
133
134
        model_name: Arc<String>,
        metrics: Arc<Metrics>,
135
    ) -> Result<Self> {
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
        // Disable migration for structured-output (guided-decoding) requests.
        // Inference backends initialize the guided-decoding FSM (finite state machine) fresh
        // for every new request and only advance it on newly-generated tokens, not on
        // context/prompt tokens. Migrating a partial structured-output response would replay
        // already-generated tokens as context, causing the FSM to restart from the schema
        // root and producing duplicated or nested JSON. This applies to all backends
        // (vLLM, SGLang, TRT-LLM) equally. Propagate the error cleanly instead.
        if preprocessed_request
            .sampling_options
            .guided_decoding
            .is_some()
        {
            if retries_left > 0 {
                tracing::warn!(
                    "Guided-decoding request: migration disabled — FSM state is not transferable (applies to all backends)"
                );
            }
            retries_left = 0;
        }
155
        let mut slf = Self {
156
            context,
157
158
159
160
            request: preprocessed_request,
            next_generate: next,
            next_stream: None,
            retries_left: retries_left + 1, // +1 to account for the initial attempt
161
            max_seq_len,
162
163
            model_name,
            metrics,
164
165
        };
        slf.new_stream().await?;
166
        slf.exceed_max_seq_len(0); // disable migration if prompt len > max_seq_len
167
168
169
        Ok(slf)
    }

170
    pub async fn next(&mut self) -> Option<Annotated<BackendOutput>> {
171
172
173
174
175
        loop {
            let response_stream = match self.next_stream.as_mut() {
                Some(stream) => stream,
                None => {
                    tracing::error!("next() called with next_stream is None - should not happen");
176
                    return Some(Annotated::from_err(DynamoError::msg("next_stream is None")));
177
178
179
                }
            };
            if let Some(response) = response_stream.next().await {
180
                // Check if this is a migratable error that should trigger stream recreation.
181
                if let Some(err) = response.err()
182
                    && is_migratable(&err)
183
                {
184
                    tracing::warn!("Stream disconnected... recreating stream... {}", err);
185
                    self.metrics.inc_migration_ongoing_request(&self.model_name);
186
187
188
189
                    if let Err(err) = self.new_stream().await {
                        tracing::warn!("Cannot recreate stream: {:#}", err);
                    } else {
                        continue;
190
191
192
193
194
195
196
197
198
199
                    }
                }
                self.track_response(&response);
                return Some(response);
            }
            return None;
        }
    }

    async fn new_stream(&mut self) -> Result<()> {
200
        let mut response_stream: Option<Result<ManyOut<Annotated<BackendOutput>>>> = None;
201
202
        while self.retries_left > 0 {
            self.retries_left -= 1;
203
204
            let request = Context::with_id(self.request.clone(), self.context.id().to_string());
            self.context.link_child(request.context());
205
206
207
208
209
210
211
            if self.context.is_stopped() || self.context.is_killed() {
                tracing::debug!("Abort creating new stream after context is stopped or killed");
                return Err(Error::msg(format!(
                    "Context id {} is stopped or killed",
                    self.context.id()
                )));
            }
212
            response_stream = Some(self.next_generate.generate(request).await);
213
            if let Some(err) = response_stream.as_ref().unwrap().as_ref().err()
214
                && is_migratable(err.as_ref())
215
            {
216
                tracing::warn!("Creating new stream... retrying... {}", err);
217
                self.metrics.inc_migration_new_request(&self.model_name);
218
                continue;
219
220
221
222
223
224
225
226
            }
            break;
        }
        match response_stream {
            Some(Ok(next_stream)) => {
                self.next_stream = Some(next_stream);
                Ok(())
            }
227
            Some(Err(err)) => Err(err), // should propagate original error if any
228
            None => Err(Error::msg(
229
                "Migration limit exhausted", // should propagate original error if any
230
231
232
233
            )),
        }
    }

234
    fn track_response(&mut self, response: &Annotated<BackendOutput>) {
235
236
237
238
239
240
241
        if self.retries_left == 0 {
            return;
        }
        let llm_engine_output = match response.data.as_ref() {
            Some(output) => output,
            None => return,
        };
242
243
244
        if self.exceed_max_seq_len(llm_engine_output.token_ids.len() as u32) {
            return;
        }
245
246
247
248
        if let Some(max_tokens) = self.request.stop_conditions.max_tokens {
            self.request.stop_conditions.max_tokens =
                Some(max_tokens.saturating_sub(llm_engine_output.token_ids.len() as u32));
        }
249
250
251
252
        for token_id in llm_engine_output.token_ids.iter() {
            self.request.token_ids.push(*token_id);
        }
    }
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273

    /// Returns `true` if the tracked request token length plus `new_output_len`
    /// exceeds the configured max_seq_len, in which case migration is disabled.
    fn exceed_max_seq_len(&mut self, new_output_len: u32) -> bool {
        if let Some(max_seq_len) = self.max_seq_len {
            let total_len = self.request.token_ids.len() as u32 + new_output_len;
            if total_len > max_seq_len {
                tracing::warn!(
                    "Sequence length {} exceeds migration max_seq_len {}, \
                     disabling migration",
                    total_len,
                    max_seq_len
                );
                self.metrics
                    .inc_migration_max_seq_len_exceeded(&self.model_name);
                self.retries_left = 0; // disable migration
                return true;
            }
        }
        false
    }
274
275
276
277
278
}

#[cfg(test)]
mod tests {
    use super::*;
279
    use crate::http::service::metrics::Metrics;
280
281
282
    use crate::protocols::common::{
        GuidedDecodingOptions, OutputOptions, SamplingOptions, StopConditions,
    };
283
    use dynamo_runtime::error::{DynamoError, ErrorType};
284
    use dynamo_runtime::pipeline::AsyncEngine;
285
    use dynamo_runtime::pipeline::context::Controller;
286
287
288
    use std::sync::atomic::{AtomicU32, Ordering};
    use tokio::sync::mpsc;

289
290
    const TEST_MODEL: &str = "test-model";

291
    // Helper to create a mock preprocessed request
292
    fn create_mock_request(max_tokens: u32) -> PreprocessedRequest {
293
294
295
296
        PreprocessedRequest::builder()
            .model("mock".to_string())
            .token_ids(vec![1, 2, 3])
            .stop_conditions(StopConditions {
297
298
                max_tokens: Some(max_tokens),
                ..Default::default()
299
300
301
302
303
304
305
            })
            .sampling_options(SamplingOptions::default())
            .output_options(OutputOptions::default())
            .eos_token_ids(vec![])
            .annotations(vec![])
            .build()
            .unwrap()
306
307
308
    }

    // Helper to create mock LLM engine output
309
310
    fn create_mock_output(token_id: u32) -> Annotated<BackendOutput> {
        Annotated::from_data(BackendOutput {
311
            token_ids: vec![token_id],
312
313
            tokens: vec![],
            text: Some(format!("token_{token_id}")),
314
315
            cum_log_probs: None,
            log_probs: None,
Greg Clark's avatar
Greg Clark committed
316
            top_logprobs: None,
317
            finish_reason: None,
318
            stop_reason: None,
319
            index: None,
320
            disaggregated_params: None,
321
            completion_usage: None,
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
        })
    }

    #[derive(Debug, Clone)]
    enum MockBehavior {
        /// Always succeeds with all responses
        Success,
        /// Fails on first call with NoResponders error, then succeeds on subsequent calls
        FailThenSuccess,
        /// Succeeds initially, fails mid-stream with specific error, then succeeds on retry
        MidStreamFail { fail_after: usize },
        /// Succeeds initially, fails mid-stream with specific error, then always fails on retry attempts
        MidStreamFailAlways { fail_after: usize },
        /// Succeeds initially, fails mid-stream, then always fails with stream error on retry attempts
        MidStreamFailAlwaysStreamError { fail_after: usize },
        /// Always fails with NoResponders error (same as FailThenSuccess first call)
        AlwaysFail,
    }

    // Unified mock server streaming engine that can simulate different scenarios
    struct MockEngine {
        behavior: MockBehavior,
        num_responses: usize,
        token_offset: u32,
        call_count: Arc<AtomicU32>,
347
        context_id: String,
348
349
350
    }

    impl MockEngine {
351
352
353
354
355
356
        fn new(
            behavior: MockBehavior,
            num_responses: usize,
            token_offset: u32,
            context_id: String,
        ) -> Self {
357
358
359
360
361
            Self {
                behavior,
                num_responses,
                token_offset,
                call_count: Arc::new(AtomicU32::new(0)),
362
                context_id,
363
364
365
366
367
368
            }
        }
    }

    #[async_trait]
    impl
369
370
        AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<BackendOutput>>, anyhow::Error>
        for MockEngine
371
372
373
374
    {
        async fn generate(
            &self,
            request: SingleIn<PreprocessedRequest>,
375
        ) -> Result<ManyOut<Annotated<BackendOutput>>> {
376
            let call_num = self.call_count.fetch_add(1, Ordering::SeqCst);
377
378
379
380
381
382
383
384
            let (preprocessed_request, context) = request.transfer(());

            // Assert that the context_id matches the expected one
            assert_eq!(
                context.id().to_string(),
                self.context_id,
                "Context ID mismatch"
            );
385
386
387
388
389
390
391
392

            // Calculate how many responses we've already generated based on request token_ids
            // Initial request has [1, 2, 3], so anything beyond that are generated responses
            let initial_tokens = 3; // [1, 2, 3]
            let responses_already_generated = preprocessed_request
                .token_ids
                .len()
                .saturating_sub(initial_tokens);
393
394
395
396
397
398
399
400
401
402
403
404

            // Assert that max_tokens reflects the expected remaining tokens
            let expected_max_tokens =
                self.num_responses
                    .saturating_sub(responses_already_generated) as u32;
            assert_eq!(
                preprocessed_request.stop_conditions.max_tokens,
                Some(expected_max_tokens),
                "max_tokens should be {} but got {:?}",
                expected_max_tokens,
                preprocessed_request.stop_conditions.max_tokens
            );
405
406
407
408
409
410
411
412
413
414

            match &self.behavior {
                MockBehavior::Success => {
                    // Always succeed with remaining responses
                    self.send_responses(responses_already_generated, self.num_responses)
                        .await
                }
                MockBehavior::FailThenSuccess => {
                    if call_num == 0 {
                        // First call - return "No responders available" error to trigger retry
415
416
417
418
419
420
                        return Err(anyhow::anyhow!(
                            DynamoError::builder()
                                .error_type(ErrorType::CannotConnect)
                                .message("no responders")
                                .build()
                        ));
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
                    } else {
                        // Subsequent calls - succeed with remaining responses
                        self.send_responses(responses_already_generated, self.num_responses)
                            .await
                    }
                }
                MockBehavior::MidStreamFail { fail_after } => {
                    let (tx, rx) = mpsc::channel(1);
                    let token_offset = self.token_offset;
                    let fail_after = *fail_after;
                    let num_responses = self.num_responses;

                    if call_num == 0 {
                        // First call - send some responses then an error to simulate disconnection
                        tokio::spawn(async move {
                            // Send responses from current position to fail_after
                            for i in responses_already_generated..fail_after.min(num_responses) {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                            // Send the specific error that triggers retry logic
444
445
446
447
448
449
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
                            let _ = tx.send(error_response).await;
                        });
                    } else {
                        // Second call - send remaining responses from where we left off
                        tokio::spawn(async move {
                            for i in responses_already_generated..num_responses {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                        });
                    }

                    let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
465
                    let ctx = Arc::new(Controller::new(self.context_id.clone()));
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
                    Ok(dynamo_runtime::pipeline::ResponseStream::new(
                        Box::pin(stream),
                        ctx,
                    ))
                }
                MockBehavior::MidStreamFailAlways { fail_after } => {
                    if call_num == 0 {
                        // First call - send some responses then an error to simulate disconnection
                        let (tx, rx) = mpsc::channel(1);
                        let token_offset = self.token_offset;
                        let fail_after = *fail_after;
                        let num_responses = self.num_responses;

                        tokio::spawn(async move {
                            // Send responses from current position to fail_after
                            for i in responses_already_generated..fail_after.min(num_responses) {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                            // Send the specific error that triggers retry logic
488
489
490
491
492
493
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
494
495
496
497
                            let _ = tx.send(error_response).await;
                        });

                        let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
498
                        let ctx = Arc::new(Controller::new(self.context_id.clone()));
499
500
501
502
503
504
                        Ok(dynamo_runtime::pipeline::ResponseStream::new(
                            Box::pin(stream),
                            ctx,
                        ))
                    } else {
                        // Subsequent calls - always fail with NoResponders error (same as AlwaysFail)
505
506
507
508
509
510
                        Err(anyhow::anyhow!(
                            DynamoError::builder()
                                .error_type(ErrorType::CannotConnect)
                                .message("no responders")
                                .build()
                        ))
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
                    }
                }
                MockBehavior::MidStreamFailAlwaysStreamError { fail_after } => {
                    let (tx, rx) = mpsc::channel(1);
                    let token_offset = self.token_offset;
                    let fail_after = *fail_after;
                    let num_responses = self.num_responses;

                    if call_num == 0 {
                        // First call - send some responses then an error to simulate disconnection
                        tokio::spawn(async move {
                            // Send responses from current position to fail_after
                            for i in responses_already_generated..fail_after.min(num_responses) {
                                let response = create_mock_output(token_offset + 1 + i as u32);
                                if tx.send(response).await.is_err() {
                                    break;
                                }
                            }
                            // Send the specific error that triggers retry logic
530
531
532
533
534
535
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
536
537
538
539
                            let _ = tx.send(error_response).await;
                        });

                        let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
540
                        let ctx = Arc::new(Controller::new(self.context_id.clone()));
541
542
543
544
545
546
547
548
                        Ok(dynamo_runtime::pipeline::ResponseStream::new(
                            Box::pin(stream),
                            ctx,
                        ))
                    } else {
                        // Subsequent calls - immediately send stream error (no successful responses)
                        tokio::spawn(async move {
                            // Send the stream error immediately
549
550
551
552
553
554
                            let error_response = Annotated::from_err(
                                DynamoError::builder()
                                    .error_type(ErrorType::Disconnected)
                                    .message("Stream ended before generation completed")
                                    .build(),
                            );
555
556
557
558
                            let _ = tx.send(error_response).await;
                        });

                        let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
559
                        let ctx = Arc::new(Controller::new(self.context_id.clone()));
560
561
562
563
564
565
566
567
                        Ok(dynamo_runtime::pipeline::ResponseStream::new(
                            Box::pin(stream),
                            ctx,
                        ))
                    }
                }
                MockBehavior::AlwaysFail => {
                    // Always fail with NoResponders error (same as FailThenSuccess first call)
568
569
570
571
572
573
                    Err(anyhow::anyhow!(
                        DynamoError::builder()
                            .error_type(ErrorType::CannotConnect)
                            .message("no responders")
                            .build()
                    ))
574
575
576
577
578
579
580
581
582
583
                }
            }
        }
    }

    impl MockEngine {
        async fn send_responses(
            &self,
            start: usize,
            end: usize,
584
        ) -> Result<ManyOut<Annotated<BackendOutput>>> {
585
586
587
588
589
590
591
592
593
594
595
596
597
            let (tx, rx) = mpsc::channel(1);
            let token_offset = self.token_offset;

            tokio::spawn(async move {
                for i in start..end {
                    let response = create_mock_output(token_offset + 1 + i as u32);
                    if tx.send(response).await.is_err() {
                        break;
                    }
                }
            });

            let stream = tokio_stream::wrappers::ReceiverStream::new(rx);
598
            let ctx = Arc::new(Controller::new(self.context_id.clone()));
599
600
601
602
603
604
605
606
607
608
609
610
611
            Ok(dynamo_runtime::pipeline::ResponseStream::new(
                Box::pin(stream),
                ctx,
            ))
        }
    }

    /// Test case 1: No migration needed
    /// Tests the normal case where the RetryManager successfully processes all responses
    /// from a single stream without any failures or need for retries/migration.
    /// Expected behavior: All 10 responses should be received successfully.
    #[tokio::test]
    async fn test_retry_manager_no_migration() {
612
        dynamo_runtime::logging::init();
613
        let context_id = uuid::Uuid::new_v4().to_string();
614
        let request = create_mock_request(10);
615
616
617
618
619
620
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::Success,
            10,
            100,
            context_id.clone(),
        ));
621
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
622
623
            mock_engine;

624
        let ctx = Arc::new(Controller::new(context_id.clone()));
625
626
627
628
629
630
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            0,
631
            None,
632
633
634
635
636
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");
637
638
639
640
641
642
643
644
645
646
647
648
649

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        assert_eq!(responses.len(), 10);
        for (i, response) in responses.iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103, ..., 110
            }
        }
650
651
652

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
653
654
655
656
657
658
659
660
661
662
    }

    /// Test case 2: New request migration
    /// Tests the scenario where a worker becomes unreachable for new requests initially,
    /// triggering the RetryManager to retry the request. The MockEngine with FailThenSuccess
    /// fails on the first call with a "No responders available" error, then succeeds
    /// on subsequent calls, simulating a worker becoming available after initial failure.
    /// Expected behavior: All 10 responses should be received successfully after retry.
    #[tokio::test]
    async fn test_retry_manager_new_request_migration() {
663
        dynamo_runtime::logging::init();
664
        let context_id = uuid::Uuid::new_v4().to_string();
665
        let request = create_mock_request(10);
666
667
668
669
670
671
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::FailThenSuccess,
            10,
            100,
            context_id.clone(),
        ));
672
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
673
674
            mock_engine;

675
        let ctx = Arc::new(Controller::new(context_id.clone()));
676
677
678
679
680
681
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
682
            None,
683
684
685
686
687
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");
688
689
690
691
692
693
694
695
696
697
698
699
700

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        assert_eq!(responses.len(), 10);
        for (i, response) in responses.iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103, ..., 110
            }
        }
701
702
703

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 1);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
704
705
706
707
708
709
710
711
712
713
    }

    /// Test case 3: Ongoing request migration
    /// Tests the scenario where a worker fails mid-stream during an ongoing request.
    /// This simulates a connection being lost after partial response delivery, requiring
    /// the RetryManager to detect the failure (via "Stream ended before generation completed" error),
    /// create a new stream, and continue from where it left off.
    /// Expected behavior: 5 responses from first stream + 5 responses from retry stream = 10 total.
    #[tokio::test]
    async fn test_retry_manager_ongoing_request_migration() {
714
715
        dynamo_runtime::logging::init();

716
        let context_id = uuid::Uuid::new_v4().to_string();
717
        let request = create_mock_request(10);
718
719
720
721
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 5 },
            10,
            100,
722
            context_id.clone(),
723
        ));
724
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
725
726
            mock_engine;

727
        let ctx = Arc::new(Controller::new(context_id.clone()));
728
729
730
731
732
733
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
734
            None,
735
736
737
738
739
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Should have received all 10 responses (5 from first stream + 5 from second stream)
        assert_eq!(responses.len(), 10);

        // Check that we received responses from both streams
        for (i, response) in responses.iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103, ..., 110
            }
        }
756
757
758

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);
759
760
761
762
763
764
765
766
    }

    /// Test case 4: New request migration - indefinite failure
    /// Tests the scenario where a worker becomes unreachable for new requests indefinitely.
    /// The RetryManager should exhaust all retries and return the original error from the first attempt.
    /// Expected behavior: Should receive an error after all retries are exhausted, with the original error.
    #[tokio::test]
    async fn test_retry_manager_new_request_migration_indefinite_failure() {
767
        dynamo_runtime::logging::init();
768
        let context_id = uuid::Uuid::new_v4().to_string();
769
        let request = create_mock_request(0);
770
771
772
773
774
775
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::AlwaysFail,
            0,
            100,
            context_id.clone(),
        ));
776
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
777
778
779
            mock_engine;

        // Should fail to build due to initial stream creation failure after exhausting all 3 retries
780
        let ctx = Arc::new(Controller::new(context_id.clone()));
781
782
783
784
785
786
        let metrics = Arc::new(Metrics::new());
        let retry_manager_result = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
787
            None,
788
789
790
791
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await;
792
793
794
795
796

        assert!(retry_manager_result.is_err());
        if let Err(error) = retry_manager_result {
            assert!(error.to_string().contains("no responders"));
        }
797
798
799

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 4); // 3 retries + 1 final failure
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
800
801
802
803
804
805
806
807
    }

    /// Test case 5: Ongoing request migration - indefinite failure
    /// Tests the scenario where a worker fails mid-stream indefinitely during ongoing requests.
    /// The RetryManager should exhaust all retries and return the original stream disconnection error.
    /// Expected behavior: Should receive some responses from first stream, then error after retries exhausted.
    #[tokio::test]
    async fn test_retry_manager_ongoing_request_migration_indefinite_failure() {
808
        dynamo_runtime::logging::init();
809
        let context_id = uuid::Uuid::new_v4().to_string();
810
        let request = create_mock_request(10);
811
812
813
814
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFailAlways { fail_after: 3 },
            10,
            100,
815
            context_id.clone(),
816
        ));
817
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
818
819
            mock_engine;

820
        let ctx = Arc::new(Controller::new(context_id.clone()));
821
822
823
824
825
826
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
827
            None,
828
829
830
831
832
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        ) // 3 retries
        .await
        .expect("Failed to build RetryManager");
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851

        let mut responses = Vec::new();

        // Collect all responses (both successful and error responses)
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Should have received 4 total responses: 3 successful + 1 error
        assert_eq!(responses.len(), 4);

        // First 3 responses should be successful with tokens 101, 102, 103
        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103
            }
        }

852
        // 4th response should be a Disconnected error after retries are exhausted
853
        let error_response = &responses[3];
854
855
        let err = error_response.err().expect("expected error response");
        assert_eq!(err.error_type(), ErrorType::Disconnected);
856
857
858

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 3); // 2 retries + 1 final failure
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1); // initial ongoing failure retry
859
860
861
862
863
864
865
866
    }

    /// Test case 6: Ongoing request migration - indefinite failure with stream errors
    /// Tests the scenario where a worker fails mid-stream indefinitely during ongoing requests,
    /// and all retry attempts also fail with stream errors instead of NATS errors.
    /// Expected behavior: Should receive some responses from first stream, then error after retries exhausted.
    #[tokio::test]
    async fn test_retry_manager_ongoing_request_migration_indefinite_failure_stream_error() {
867
        dynamo_runtime::logging::init();
868
        let context_id = uuid::Uuid::new_v4().to_string();
869
        let request = create_mock_request(10);
870
871
872
873
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFailAlwaysStreamError { fail_after: 3 },
            10,
            100,
874
            context_id.clone(),
875
        ));
876
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
877
878
            mock_engine;

879
        let ctx = Arc::new(Controller::new(context_id.clone()));
880
881
882
883
884
885
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
886
            None,
887
888
889
890
891
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        ) // 3 retries
        .await
        .expect("Failed to build RetryManager");
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910

        let mut responses = Vec::new();

        // Collect all responses (both successful and error responses)
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Should have received 4 total responses: 3 successful + 1 error
        assert_eq!(responses.len(), 4);

        // First 3 responses should be successful with tokens 101, 102, 103
        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none());
            if let Some(output) = &response.data {
                assert_eq!(output.token_ids, vec![101 + i as u32]); // 101, 102, 103
            }
        }

911
        // 4th response should be a Disconnected error after retries are exhausted
912
        let error_response = &responses[3];
913
914
        let err = error_response.err().expect("expected error response");
        assert_eq!(err.error_type(), ErrorType::Disconnected);
915
916
917

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 4); // 3 retries + 1 final failure
918
    }
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934

    /// Test case 7: Request cancelled when creating new stream
    /// Tests the scenario where context.stop_generating() is called when creating a new stream.
    /// The RetryManager should detect that the context is stopped and abort creating new streams.
    /// Expected behavior: Should fail to build RetryManager with "Context is stopped or killed" error.
    #[tokio::test]
    async fn test_retry_manager_context_stopped_before_stream() {
        dynamo_runtime::logging::init();
        let context_id = uuid::Uuid::new_v4().to_string();
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::Success,
            10,
            100,
            context_id.clone(),
        ));
935
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
936
937
938
939
940
941
942
943
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));

        // Stop the context before building RetryManager
        ctx.stop_generating();

        // Should fail to build due to stopped context
944
945
946
947
948
949
        let metrics = Arc::new(Metrics::new());
        let retry_manager_result = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
950
            None,
951
952
953
954
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await;
955
956
957
958
959
960
961
962
963

        assert!(retry_manager_result.is_err());
        if let Err(error) = retry_manager_result {
            assert!(
                error
                    .to_string()
                    .contains(&format!("Context id {} is stopped or killed", context_id))
            );
        }
964
965
966

        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
967
    }
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018

    /// Test case 8: No migration for guided-decoding (structured-output) requests
    ///
    /// Bug (#7634): When a worker crashes mid-stream during a structured-output
    /// (json_schema) request, migration appends already-generated token IDs back onto
    /// token_ids and replays the request to a new worker. However, backends initialize
    /// the guided-decoding FSM fresh for every new request and only advance it on newly-
    /// generated tokens — not on context/prompt tokens. This causes the FSM to restart
    /// from the schema root while treating already-generated tokens as context, producing
    /// duplicated or nested JSON in the final response.
    ///
    /// Fix: Disable migration for structured-output requests by zeroing retries_left in
    /// RetryManager::build() when guided_decoding is set, propagating the error cleanly.
    ///
    /// Expected behavior BEFORE fix: All 10 responses received (migration happened — wrong)
    /// Expected behavior AFTER fix: 3 successful + 1 error (migration blocked — correct)
    #[tokio::test]
    async fn test_retry_manager_no_migration_for_guided_decoding() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        let mut request = create_mock_request(10);
        // Set guided decoding (json_schema structured output) on the request
        request.sampling_options.guided_decoding = Some(GuidedDecodingOptions::new(
            Some(serde_json::json!({"type": "object", "properties": {"name": {"type": "string"}}})),
            None,
            None,
            None,
            None,
            None,
        ));

        // MidStreamFail after 3 tokens: without the fix, migration would succeed and
        // deliver all 10 responses; with the fix, migration is blocked and an error
        // is returned after the 3 partial responses.
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 3 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3, // migration_limit=3 — should be ignored for guided-decoding requests
1019
            None,
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Must receive 3 successful tokens + 1 Disconnected error, NOT all 10.
        // Before the fix this assertion fails because migration proceeds and returns 10.
        assert_eq!(
            responses.len(),
            4,
            "Expected 3 successful + 1 error response (migration must be blocked for \
             guided-decoding), but got {} responses",
            responses.len()
        );

        // First 3 responses should be successful
        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(
                response.err().is_none(),
                "Response {} should be successful",
                i
            );
        }

        // Last response must be the stream-disconnection error
        let last = responses.last().unwrap();
        let err = last
            .err()
            .expect("Last response should be a Disconnected error");
        assert_eq!(
            err.error_type(),
            ErrorType::Disconnected,
            "Error type should be Disconnected"
        );
    }
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267

    /// Test case 9: max_seq_len exceeded limit + 1 disables migration
    ///
    /// Boundary test: prompt has 3 tokens, max_seq_len = 5. After 2 generated tokens the
    /// total is 5 (== max_seq_len) — still migratable. The 3rd generated token would push
    /// the total to 6 (> max_seq_len), which disables migration and stops caching.
    /// The failure is placed right at that point (fail_after: 3) so we see the error
    /// propagated instead of retried.
    #[tokio::test]
    async fn test_retry_manager_max_seq_len_exceeded() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        // Prompt = [1, 2, 3] (len 3). max_seq_len = 5.
        // Token 101 → total 4 ≤ 5: tracked.
        // Token 102 → total 5 ≤ 5: tracked.
        // Token 103 → would-be 6 > 5: NOT tracked, migration disabled.
        // Error follows immediately (fail_after: 3) → not retried.
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 3 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
            Some(5), // prompt(3) + 3 generated = 6 > 5 → disables migration
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // 3 successful tokens + 1 Disconnected error (migration disabled at token 103).
        assert_eq!(
            responses.len(),
            4,
            "Expected 3 successful + 1 error (migration disabled by max_seq_len)"
        );

        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none(), "Response {} should be OK", i);
        }

        let err = responses[3]
            .err()
            .expect("Last response should be Disconnected error");
        assert_eq!(err.error_type(), ErrorType::Disconnected);

        // Migration was attempted but blocked because max_seq_len set retries_left to 0.
        // The ongoing metric is still incremented (it counts attempts, not successes).
        assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);
        // max_seq_len limit was triggered once (at token 103).
        assert_eq!(
            metrics.get_migration_max_seq_len_exceeded_count(TEST_MODEL),
            1
        );
    }

    /// Test case 10: Migration succeeds when sequence length is at max_seq_len
    ///
    /// Boundary test: prompt has 3 tokens, max_seq_len = 5. After 2 generated tokens
    /// the total is exactly 5 (== max_seq_len). The failure occurs at that point
    /// (fail_after: 2). Because we use strict inequality (> not >=), the request is
    /// still migratable and the retry succeeds.
    #[tokio::test]
    async fn test_retry_manager_max_seq_len_at_limit() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        // Prompt = [1, 2, 3] (len 3). max_seq_len = 5.
        // Token 101 → total 4 ≤ 5: tracked.
        // Token 102 → total 5 == 5: tracked (still migratable — strict >).
        // Error (fail_after: 2) → migration succeeds, retry delivers remaining tokens.
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 2 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
            Some(5), // prompt(3) + 2 generated = 5 == max_seq_len → still migratable
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // Migration succeeds — all 10 responses delivered
        assert_eq!(responses.len(), 10);
        for response in &responses {
            assert!(response.err().is_none());
        }

        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);

        // Tracked token_ids must equal exactly max_seq_len (5).
        // The 2 tokens from the first stream were tracked (prompt 3 + gen 2 = 5).
        // After migration the retry stream delivers remaining tokens, but the first
        // new token would push to 6 > 5, so tracking stops and no more are appended.
        assert_eq!(
            retry_manager.request.token_ids.len(),
            5,
            "tracked token_ids should be exactly max_seq_len"
        );

        // The limit was triggered once (first token of the retry stream exceeded 5).
        assert_eq!(
            metrics.get_migration_max_seq_len_exceeded_count(TEST_MODEL),
            1
        );
    }

    /// Test case 11: Prompt length alone exceeds max_seq_len
    ///
    /// When the prompt tokens already exceed max_seq_len, migration is disabled
    /// in RetryManager::build before any tokens are generated. A mid-stream
    /// failure should propagate the error without attempting migration.
    #[tokio::test]
    async fn test_retry_manager_max_seq_len_exceeded_by_prompt() {
        dynamo_runtime::logging::init();

        let context_id = uuid::Uuid::new_v4().to_string();
        // Prompt = [1, 2, 3] (len 3). max_seq_len = 2, so prompt alone exceeds the limit.
        let request = create_mock_request(10);
        let mock_engine = Arc::new(MockEngine::new(
            MockBehavior::MidStreamFail { fail_after: 3 },
            10,
            100,
            context_id.clone(),
        ));
        let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
            mock_engine;

        let ctx = Arc::new(Controller::new(context_id.clone()));
        let metrics = Arc::new(Metrics::new());
        let mut retry_manager = RetryManager::build(
            ctx,
            request,
            next_generate,
            3,
            Some(2), // prompt(3) > max_seq_len(2) → migration disabled at build time
            Arc::new(TEST_MODEL.to_string()),
            metrics.clone(),
        )
        .await
        .expect("Failed to build RetryManager");

        let mut responses = Vec::new();
        while let Some(response) = retry_manager.next().await {
            responses.push(response);
        }

        // 3 successful tokens + 1 Disconnected error (migration disabled from the start).
        assert_eq!(
            responses.len(),
            4,
            "Expected 3 successful + 1 error (migration disabled by prompt exceeding max_seq_len)"
        );

        for (i, response) in responses[0..3].iter().enumerate() {
            assert!(response.err().is_none(), "Response {} should be OK", i);
        }

        let err = responses[3]
            .err()
            .expect("Last response should be Disconnected error");
        assert_eq!(err.error_type(), ErrorType::Disconnected);

        // max_seq_len was exceeded at build time (prompt len 3 > 2).
        assert_eq!(
            metrics.get_migration_max_seq_len_exceeded_count(TEST_MODEL),
            1
        );
        // Migration was attempted but blocked (retries_left was already 0).
        assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 1);
    }
1268
}