Unverified Commit c53ef736 authored by MatejKosec's avatar MatejKosec Committed by GitHub
Browse files

fix(migration): disable migration for guided-decoding requests (#7930)


Signed-off-by: default avatarMatej Kosec <mkosec@nvidia.com>
parent 88966f02
...@@ -132,6 +132,18 @@ These metrics can be used to: ...@@ -132,6 +132,18 @@ These metrics can be used to:
For more information on Dynamo metrics, see the [Metrics documentation](../observability/metrics.md). For more information on Dynamo metrics, see the [Metrics documentation](../observability/metrics.md).
## Known Limitations
### Guided Decoding (Structured Output)
Request migration is **not supported** for requests that use guided decoding (structured output / JSON schema). When a worker fails mid-stream during a guided-decoding request, the error is propagated to the client instead of attempting migration.
**Why:** Inference backends initialize the guided-decoding finite state machine (FSM) fresh for every new request and only advance it on newly-generated tokens, not on context/prompt tokens. When a partially-completed request is migrated to a new worker, the new worker replays the already-generated tokens as context but starts the FSM from the schema root. This mismatch between the token state and FSM state produces corrupted output — typically duplicated or nested JSON.
This limitation applies equally to all backends (vLLM, SGLang, TRT-LLM).
**Future path:** Supporting migration for guided-decoding requests would require serializing and restoring the FSM state across workers, or replaying prior output tokens through the FSM on the new worker. This is tracked as a future enhancement.
## Operational Impact ## Operational Impact
Request migration fundamentally changes how the system handles failures, moving from a "fail-fast" approach to a "graceful degradation" model. This architectural shift enables higher availability and better resource utilization while maintaining the same external API contract for clients. Request migration fundamentally changes how the system handles failures, moving from a "fail-fast" approach to a "graceful degradation" model. This architectural shift enables higher availability and better resource utilization while maintaining the same external API contract for clients.
...@@ -108,10 +108,29 @@ impl RetryManager { ...@@ -108,10 +108,29 @@ impl RetryManager {
context: Arc<dyn AsyncEngineContext>, context: Arc<dyn AsyncEngineContext>,
preprocessed_request: PreprocessedRequest, preprocessed_request: PreprocessedRequest,
next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>, next: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>>,
retries_left: u32, mut retries_left: u32,
model_name: Arc<String>, model_name: Arc<String>,
metrics: Arc<Metrics>, metrics: Arc<Metrics>,
) -> Result<Self> { ) -> Result<Self> {
// Disable migration for structured-output (guided-decoding) requests.
// Inference backends initialize the guided-decoding FSM (finite state machine) fresh
// for every new request and only advance it on newly-generated tokens, not on
// context/prompt tokens. Migrating a partial structured-output response would replay
// already-generated tokens as context, causing the FSM to restart from the schema
// root and producing duplicated or nested JSON. This applies to all backends
// (vLLM, SGLang, TRT-LLM) equally. Propagate the error cleanly instead.
if preprocessed_request
.sampling_options
.guided_decoding
.is_some()
{
if retries_left > 0 {
tracing::warn!(
"Guided-decoding request: migration disabled — FSM state is not transferable (applies to all backends)"
);
}
retries_left = 0;
}
let mut slf = Self { let mut slf = Self {
context, context,
request: preprocessed_request, request: preprocessed_request,
...@@ -211,7 +230,9 @@ impl RetryManager { ...@@ -211,7 +230,9 @@ impl RetryManager {
mod tests { mod tests {
use super::*; use super::*;
use crate::http::service::metrics::Metrics; use crate::http::service::metrics::Metrics;
use crate::protocols::common::{OutputOptions, SamplingOptions, StopConditions}; use crate::protocols::common::{
GuidedDecodingOptions, OutputOptions, SamplingOptions, StopConditions,
};
use dynamo_runtime::error::{DynamoError, ErrorType}; use dynamo_runtime::error::{DynamoError, ErrorType};
use dynamo_runtime::pipeline::AsyncEngine; use dynamo_runtime::pipeline::AsyncEngine;
use dynamo_runtime::pipeline::context::Controller; use dynamo_runtime::pipeline::context::Controller;
...@@ -890,4 +911,96 @@ mod tests { ...@@ -890,4 +911,96 @@ mod tests {
assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0); assert_eq!(metrics.get_migration_new_request_count(TEST_MODEL), 0);
assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0); assert_eq!(metrics.get_migration_ongoing_request_count(TEST_MODEL), 0);
} }
/// Test case 8: No migration for guided-decoding (structured-output) requests
///
/// Bug (#7634): When a worker crashes mid-stream during a structured-output
/// (json_schema) request, migration appends already-generated token IDs back onto
/// token_ids and replays the request to a new worker. However, backends initialize
/// the guided-decoding FSM fresh for every new request and only advance it on newly-
/// generated tokens — not on context/prompt tokens. This causes the FSM to restart
/// from the schema root while treating already-generated tokens as context, producing
/// duplicated or nested JSON in the final response.
///
/// Fix: Disable migration for structured-output requests by zeroing retries_left in
/// RetryManager::build() when guided_decoding is set, propagating the error cleanly.
///
/// Expected behavior BEFORE fix: All 10 responses received (migration happened — wrong)
/// Expected behavior AFTER fix: 3 successful + 1 error (migration blocked — correct)
#[tokio::test]
async fn test_retry_manager_no_migration_for_guided_decoding() {
dynamo_runtime::logging::init();
let context_id = uuid::Uuid::new_v4().to_string();
let mut request = create_mock_request(10);
// Set guided decoding (json_schema structured output) on the request
request.sampling_options.guided_decoding = Some(GuidedDecodingOptions::new(
Some(serde_json::json!({"type": "object", "properties": {"name": {"type": "string"}}})),
None,
None,
None,
None,
None,
));
// MidStreamFail after 3 tokens: without the fix, migration would succeed and
// deliver all 10 responses; with the fix, migration is blocked and an error
// is returned after the 3 partial responses.
let mock_engine = Arc::new(MockEngine::new(
MockBehavior::MidStreamFail { fail_after: 3 },
10,
100,
context_id.clone(),
));
let next_generate: ServerStreamingEngine<PreprocessedRequest, Annotated<BackendOutput>> =
mock_engine;
let ctx = Arc::new(Controller::new(context_id.clone()));
let metrics = Arc::new(Metrics::new());
let mut retry_manager = RetryManager::build(
ctx,
request,
next_generate,
3, // migration_limit=3 — should be ignored for guided-decoding requests
Arc::new(TEST_MODEL.to_string()),
metrics.clone(),
)
.await
.expect("Failed to build RetryManager");
let mut responses = Vec::new();
while let Some(response) = retry_manager.next().await {
responses.push(response);
}
// Must receive 3 successful tokens + 1 Disconnected error, NOT all 10.
// Before the fix this assertion fails because migration proceeds and returns 10.
assert_eq!(
responses.len(),
4,
"Expected 3 successful + 1 error response (migration must be blocked for \
guided-decoding), but got {} responses",
responses.len()
);
// First 3 responses should be successful
for (i, response) in responses[0..3].iter().enumerate() {
assert!(
response.err().is_none(),
"Response {} should be successful",
i
);
}
// Last response must be the stream-disconnection error
let last = responses.last().unwrap();
let err = last
.err()
.expect("Last response should be a Disconnected error");
assert_eq!(
err.error_type(),
ErrorType::Disconnected,
"Error type should be Disconnected"
);
}
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment