feat: unified internal request representation for lossless API conversion (#7202)

Signed-off-by: Matej Kosec <mkosec@nvidia.com>
Signed-off-by: Marko Kosec <mkosec@nvidia.com>

feat: unified internal request representation for lossless API conversion (#7202)
Signed-off-by: Matej Kosec <mkosec@nvidia.com>
Signed-off-by: Marko Kosec <mkosec@nvidia.com>
3bfee568 · MatejKosec · GitHub · 8fe2082c · 3bfee568 · 3bfee568
Unverified Commit 3bfee568 authored Apr 01, 2026 by MatejKosec Committed by GitHub Apr 01, 2026
9 changed files
--- a/lib/llm/src/http/service/anthropic.rs
+++ b/lib/llm/src/http/service/anthropic.rs
@@ -40,9 +40,10 @@ use crate::protocols::anthropic::types::{
    chat_completion_to_anthropic_response,
 };
 use crate::protocols::openai::chat_completions::{
-    NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
-    NvCreateChatCompletionStreamResponse, aggregator::ChatCompletionAggregator,
+    NvCreateChatCompletionResponse, NvCreateChatCompletionStreamResponse,
+    aggregator::ChatCompletionAggregator,
 };
+use crate::protocols::unified::UnifiedRequest;
 use crate::request_template::RequestTemplate;
 use crate::types::Annotated;

@@ -213,20 +214,25 @@ async fn anthropic_messages(
        .as_ref()
        .is_some_and(|t| t.thinking_type == "disabled");

-    // Convert Anthropic request -> Chat Completion request
-    let mut chat_request: NvCreateChatCompletionRequest =
-        orig_request.try_into().map_err(|e: anyhow::Error| {
-            tracing::error!(
-                request_id,
-                error = %e,
-                "Failed to convert AnthropicCreateMessageRequest to NvCreateChatCompletionRequest",
-            );
-            anthropic_error(
-                StatusCode::BAD_REQUEST,
-                "invalid_request_error",
-                &format!("Failed to convert request: {}", e),
-            )
-        })?;
+    // Convert Anthropic request -> UnifiedRequest -> Chat Completion request
+    let unified_request: UnifiedRequest = orig_request.try_into().map_err(|e: anyhow::Error| {
+        tracing::error!(
+            request_id,
+            error = %e,
+            "Failed to convert AnthropicCreateMessageRequest to UnifiedRequest",
+        );
+        anthropic_error(
+            StatusCode::BAD_REQUEST,
+            "invalid_request_error",
+            &format!("Failed to convert request: {}", e),
+        )
+    })?;
+
+    // Extract the API context before consuming the UnifiedRequest — it carries
+    // Anthropic-specific fields (thinking config, cache breakpoints, etc.) that
+    // both the stream converter and the aggregated-response conversion receive.
+    let anthropic_ctx = unified_request.anthropic_context().cloned();
+    let mut chat_request = unified_request.into_inner();

    // When a reasoning parser is configured and the client hasn't explicitly
    // disabled thinking, assume the model's chat template will inject `<think>`.
@@ -309,7 +315,10 @@ async fn anthropic_messages(

        use std::sync::atomic::{AtomicBool, Ordering};

-        let mut converter = AnthropicStreamConverter::new(model_for_resp);
+        let mut converter = match anthropic_ctx {
+            Some(ctx) => AnthropicStreamConverter::with_context(model_for_resp, ctx),
+            None => AnthropicStreamConverter::new(model_for_resp),
+        };
        let start_events = converter.emit_start_events();

        let converter = std::sync::Arc::new(std::sync::Mutex::new(converter));
@@ -406,7 +415,11 @@ async fn anthropic_messages(
                    )
                })?;

-        let response = chat_completion_to_anthropic_response(chat_response, &model_for_resp);
+        let response = chat_completion_to_anthropic_response(
+            chat_response,
+            &model_for_resp,
+            anthropic_ctx.as_ref(),
+        );

        inflight_guard.mark_ok();


--- a/lib/llm/src/http/service/openai.rs
+++ b/lib/llm/src/http/service/openai.rs
@@ -57,6 +57,7 @@ use crate::protocols::openai::{
    responses::{NvCreateResponse, NvResponse, ResponseParams, chat_completion_to_response},
    videos::{NvCreateVideoRequest, NvVideosResponse},
 };
+use crate::protocols::unified::UnifiedRequest;
 use crate::request_template::RequestTemplate;
 use crate::types::Annotated;
 use dynamo_runtime::logging::get_distributed_tracing_context;
@@ -1513,21 +1514,25 @@ async fn responses(
    let request_id = request.id().to_string();
    let (orig_request, context) = request.into_parts();

-    let mut chat_request: NvCreateChatCompletionRequest =
-        orig_request.try_into().map_err(|e: anyhow::Error| {
-            tracing::error!(
-                request_id,
-                error = %e,
-                "Failed to convert NvCreateResponse to NvCreateChatCompletionRequest",
-            );
-            let err_response = ErrorMessage::not_implemented_error(
-                VALIDATION_PREFIX.to_string()
-                    + "Failed to convert responses request: "
-                    + &e.to_string(),
-            );
-            inflight_guard.mark_error(extract_error_type_from_response(&err_response));
-            err_response
-        })?;
+    let unified_request: UnifiedRequest = orig_request.try_into().map_err(|e: anyhow::Error| {
+        tracing::error!(
+            request_id,
+            error = %e,
+            "Failed to convert NvCreateResponse to UnifiedRequest",
+        );
+        let err_response = ErrorMessage::not_implemented_error(
+            VALIDATION_PREFIX.to_string()
+                + "Failed to convert responses request: "
+                + &e.to_string(),
+        );
+        inflight_guard.mark_error(extract_error_type_from_response(&err_response));
+        err_response
+    })?;
+    // Extract the API context before consuming the UnifiedRequest — it carries
+    // Responses-specific fields (previous_response_id, store, etc.) that both
+    // the stream converter and the aggregated-response conversion echo back.
+    let responses_ctx = unified_request.responses_context().cloned();
+    let mut chat_request = unified_request.into_inner();

    // Always use internal streaming for aggregation.
    // Set stream_options.include_usage so the backend sends token counts in the final chunk.
@@ -1577,7 +1582,10 @@ async fn responses(
        use crate::protocols::openai::responses::stream_converter::ResponseStreamConverter;
        use std::sync::atomic::{AtomicBool, Ordering};

-        let mut converter = ResponseStreamConverter::new(model.clone(), response_params);
+        let mut converter = match responses_ctx {
+            Some(ctx) => ResponseStreamConverter::with_context(model.clone(), response_params, ctx),
+            None => ResponseStreamConverter::new(model.clone(), response_params),
+        };
        let start_events = converter.emit_start_events();

        // Use std::sync::Mutex (not tokio) since process_chunk/emit_end_events are
@@ -1685,18 +1693,19 @@ async fn responses(
                })?;

        // Convert NvCreateChatCompletionResponse --> NvResponse
-        let response: NvResponse = chat_completion_to_response(response, &response_params)
-            .map_err(|e| {
-                tracing::error!(
-                    request_id,
-                    "Failed to convert NvCreateChatCompletionResponse to NvResponse: {:?}",
-                    e
-                );
-                let err_response =
-                    ErrorMessage::internal_server_error("Failed to convert internal response");
-                inflight_guard.mark_error(extract_error_type_from_response(&err_response));
-                err_response
-            })?;
+        let response: NvResponse =
+            chat_completion_to_response(response, &response_params, responses_ctx.as_ref())
+                .map_err(|e| {
+                    tracing::error!(
+                        request_id,
+                        "Failed to convert NvCreateChatCompletionResponse to NvResponse: {:?}",
+                        e
+                    );
+                    let err_response =
+                        ErrorMessage::internal_server_error("Failed to convert internal response");
+                    inflight_guard.mark_error(extract_error_type_from_response(&err_response));
+                    err_response
+                })?;

        inflight_guard.mark_ok();
        // If the engine context was killed (client disconnect), the response was

--- a/lib/llm/src/protocols.rs
+++ b/lib/llm/src/protocols.rs
@@ -15,6 +15,7 @@ pub mod codec;
 pub mod common;
 pub mod openai;
 pub mod tensor;
+pub(crate) mod unified;

 /// The token ID type
 pub type TokenIdType = u32;

--- a/lib/llm/src/protocols/anthropic/stream_converter.rs
+++ b/lib/llm/src/protocols/anthropic/stream_converter.rs
@@ -18,11 +18,14 @@ use super::types::{
    AnthropicResponseContentBlock, AnthropicStopReason, AnthropicStreamEvent, AnthropicUsage,
 };
 use crate::protocols::openai::chat_completions::NvCreateChatCompletionStreamResponse;
+use crate::protocols::unified::AnthropicContext;

 /// State machine that converts a chat completion stream into Anthropic SSE events.
 pub struct AnthropicStreamConverter {
    model: String,
    message_id: String,
+    /// Preserved Anthropic-specific request context for faithful response reconstruction.
+    api_context: Option<AnthropicContext>,
    // Thinking/reasoning tracking
    thinking_block_started: bool,
    thinking_block_closed: bool,
@@ -60,6 +63,7 @@ impl AnthropicStreamConverter {
        Self {
            model,
            message_id: format!("msg_{}", Uuid::new_v4().simple()),
+            api_context: None,
            thinking_block_started: false,
            thinking_block_closed: false,
            thinking_block_index: 0,
@@ -76,8 +80,19 @@ impl AnthropicStreamConverter {
        }
    }

+    /// Create a converter seeded with the original Anthropic request context.
+    ///
+    /// Builds the converter with [`Self::new`] and then stores `context` in
+    /// `api_context`, so every other field starts out exactly as it does for a
+    /// context-free converter. The context carries request metadata that was
+    /// lost during the Anthropic-to-OpenAI request conversion.
+    pub fn with_context(model: String, context: AnthropicContext) -> Self {
+        let mut converter = Self::new(model);
+        converter.api_context = Some(context);
+        converter
+    }
+
    /// Emit the initial `message_start` event.
    pub fn emit_start_events(&mut self) -> Vec<Result<Event, anyhow::Error>> {
+        // TODO: When AnthropicMessageResponse gains a `service_tier` field,
+        // populate it from `self.api_context` (if the original request specified one).
        let message = AnthropicMessageResponse {
            id: self.message_id.clone(),
            object_type: "message".to_string(),
@@ -182,6 +197,11 @@ impl AnthropicStreamConverter {
                    // Emit signature delta to close the thinking block.
                    // The engine doesn't produce Anthropic-style cryptographic signatures,
                    // so we use "erased" (the standard placeholder per the Anthropic spec).
+                    // This placeholder is emitted regardless of `api_context`: even when
+                    // the original request had `thinking.thinking_type == "enabled"`, the
+                    // backend simply doesn't generate real signatures. If/when the backend
+                    // starts returning real signatures, the context can be used to validate
+                    // or pass them through instead of hardcoding "erased".
                    let sig_delta = AnthropicStreamEvent::ContentBlockDelta {
                        index: self.thinking_block_index,
                        delta: AnthropicDelta::SignatureDelta {
@@ -1071,4 +1091,35 @@ mod tests {
            "no block stops in end events"
        );
    }
+
+    /// Verify that `with_context` stores the context and that a converter built
+    /// this way still emits the usual event sequence for a plain text chunk —
+    /// the context is carried for future enrichment.
+    #[test]
+    fn test_with_context_preserves_context() {
+        use crate::protocols::unified::AnthropicContext;
+
+        // Only `service_tier` is set; every other field keeps its `Default` value.
+        let ctx = AnthropicContext {
+            service_tier: Some("priority".to_string()),
+            ..Default::default()
+        };
+        let mut conv = AnthropicStreamConverter::with_context("test-model".into(), ctx);
+        assert!(conv.api_context.is_some());
+        assert_eq!(
+            conv.api_context.as_ref().unwrap().service_tier.as_deref(),
+            Some("priority")
+        );
+
+        // Should produce the same events as a regular converter. The expected
+        // event-type lists are hard-coded here rather than compared against a
+        // second converter built with `new`.
+        let ev = conv.process_chunk_tagged(&text_chunk("Hello"));
+        assert_eq!(
+            event_types(&ev),
+            vec!["content_block_start", "content_block_delta"]
+        );
+
+        let end = conv.emit_end_events_tagged();
+        assert_eq!(
+            event_types(&end),
+            vec!["content_block_stop", "message_delta", "message_stop"]
+        );
+    }
 }
--- a/lib/llm/src/protocols/anthropic/types.rs
+++ b/lib/llm/src/protocols/anthropic/types.rs
@@ -120,7 +120,10 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
                ..Default::default()
            },
            nvext: {
-                // Collect per-block cache_control: use the last one found
+                // Lossy: collapse all per-block cache_control into a single
+                // last-one-wins value. Sufficient for backends with a single
+                // prefix cache boundary. Full per-block breakpoints are
+                // preserved in AnthropicContext::cache_breakpoints via UnifiedRequest.
                let mut last_block_cc: Option<CacheControl> = None;
                for msg in &req.messages {
                    if let AnthropicMessageContent::Blocks { content } = &msg.content {
@@ -472,7 +475,9 @@ fn convert_anthropic_tool_choice(tc: &AnthropicToolChoice) -> ChatCompletionTool
 pub fn chat_completion_to_anthropic_response(
    chat_resp: NvCreateChatCompletionResponse,
    model: &str,
+    api_context: Option<&crate::protocols::unified::AnthropicContext>,
 ) -> AnthropicMessageResponse {
+    let _ = api_context; // Available for future enrichment (service_tier, etc.)
    let msg_id = format!("msg_{}", Uuid::new_v4().simple());

    let choice = chat_resp.inner.choices.into_iter().next();
@@ -853,7 +858,7 @@ mod tests {
            nvext: None,
        };

-        let response = chat_completion_to_anthropic_response(chat_resp, "test-model");
+        let response = chat_completion_to_anthropic_response(chat_resp, "test-model", None);
        assert!(response.id.starts_with("msg_"));
        assert_eq!(response.object_type, "message");
        assert_eq!(response.role, "assistant");

--- a/lib/llm/src/protocols/openai.rs
+++ b/lib/llm/src/protocols/openai.rs
@@ -37,7 +37,7 @@ pub struct AnnotatedDelta<R> {
    pub comment: Option<String>,
 }

-trait OpenAISamplingOptionsProvider {
+pub(crate) trait OpenAISamplingOptionsProvider {
    fn get_temperature(&self) -> Option<f32>;

    fn get_top_p(&self) -> Option<f32>;
@@ -55,7 +55,7 @@ trait OpenAISamplingOptionsProvider {
    fn nvext(&self) -> Option<&nvext::NvExt>;
 }

-trait OpenAIStopConditionsProvider {
+pub(crate) trait OpenAIStopConditionsProvider {
    fn get_max_tokens(&self) -> Option<u32>;

    fn get_min_tokens(&self) -> Option<u32>;
@@ -82,7 +82,7 @@ trait OpenAIStopConditionsProvider {
    }
 }

-trait OpenAIOutputOptionsProvider {
+pub(crate) trait OpenAIOutputOptionsProvider {
    fn get_logprobs(&self) -> Option<u32>;

    fn get_prompt_logprobs(&self) -> Option<u32>;

--- a/lib/llm/src/protocols/openai/responses/mod.rs
+++ b/lib/llm/src/protocols/openai/responses/mod.rs
@@ -695,6 +695,7 @@ fn make_function_call(name: String, arguments: String) -> OutputItem {
 pub fn chat_completion_to_response(
    nv_resp: NvCreateChatCompletionResponse,
    params: &ResponseParams,
+    api_context: Option<&crate::protocols::unified::ResponsesContext>,
 ) -> Result<NvResponse, anyhow::Error> {
    let nvext = nv_resp.nvext.clone();
    let chat_resp = nv_resp.inner;
@@ -814,7 +815,10 @@ pub fn chat_completion_to_response(
        presence_penalty: Some(0.0),
        // Echo actual request values, falling back to spec defaults.
        // store: false because this branch does not persist responses.
-        store: params.store.or(Some(false)),
+        store: api_context
+            .map(|ctx| ctx.store)
+            .or(params.store)
+            .or(Some(false)),
        temperature: params.temperature.or(Some(1.0)),
        text: Some(params.text.clone().unwrap_or(ResponseTextParam {
            format: TextResponseFormatConfiguration::Text,
@@ -841,7 +845,7 @@ pub fn chat_completion_to_response(
        instructions: params.instructions.clone().map(Instructions::Text),
        max_output_tokens: params.max_output_tokens,
        max_tool_calls: None,
-        previous_response_id: None,
+        previous_response_id: api_context.and_then(|ctx| ctx.previous_response_id.clone()),
        prompt: None,
        prompt_cache_key: None,
        prompt_cache_retention: None,
@@ -1194,7 +1198,8 @@ mod tests {
            nvext: None,
        };

-        let wrapped = chat_completion_to_response(chat_resp, &ResponseParams::default()).unwrap();
+        let wrapped =
+            chat_completion_to_response(chat_resp, &ResponseParams::default(), None).unwrap();

        assert_eq!(wrapped.inner.model, "llama-3.1-8b-instruct");
        assert_eq!(wrapped.inner.status, Status::Completed);
@@ -1254,7 +1259,8 @@ mod tests {
            nvext: None,
        };

-        let wrapped = chat_completion_to_response(chat_resp, &ResponseParams::default()).unwrap();
+        let wrapped =
+            chat_completion_to_response(chat_resp, &ResponseParams::default(), None).unwrap();
        assert_eq!(wrapped.inner.output.len(), 1);
        match &wrapped.inner.output[0] {
            OutputItem::FunctionCall(fc) => {
@@ -1449,7 +1455,7 @@ thinking
            nvext: None,
        };

-        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+        let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
        let reasoning = resp.inner.reasoning.unwrap();
        assert_eq!(reasoning.effort, Some(ReasoningEffort::High));
    }
@@ -1482,7 +1488,7 @@ thinking
            nvext: None,
        };

-        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+        let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
        let text = resp.inner.text.unwrap();
        assert_eq!(text.format, TextResponseFormatConfiguration::JsonObject);
    }
@@ -1510,7 +1516,7 @@ thinking
            nvext: None,
        };

-        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+        let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
        assert_eq!(resp.inner.service_tier, Some(ServiceTier::Flex));
    }

@@ -1598,7 +1604,7 @@ thinking
    fn test_include_logprobs_stripped_by_default() {
        let chat_resp = make_chat_resp_with_text("hello");
        let params = ResponseParams::default();
-        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+        let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();

        for item in &resp.inner.output {
            if let OutputItem::Message(msg) = item {
@@ -1623,7 +1629,7 @@ thinking
            include: Some(vec![IncludeEnum::MessageOutputTextLogprobs]),
            ..Default::default()
        };
-        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+        let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();

        let mut found_text = false;
        for item in &resp.inner.output {
@@ -1651,7 +1657,7 @@ thinking
            truncation: Some(Truncation::Auto),
            ..Default::default()
        };
-        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+        let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
        assert_eq!(resp.inner.truncation, Some(Truncation::Auto));
    }

@@ -1659,7 +1665,7 @@ thinking
    fn test_truncation_defaults_to_disabled() {
        let chat_resp = make_chat_resp_with_text("hello");
        let params = ResponseParams::default();
-        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+        let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
        assert_eq!(resp.inner.truncation, Some(Truncation::Disabled));
    }
 }
--- a/lib/llm/src/protocols/openai/responses/stream_converter.rs
+++ b/lib/llm/src/protocols/openai/responses/stream_converter.rs
@@ -28,12 +28,15 @@ use dynamo_async_openai::types::ChatCompletionMessageContent;

 use super::ResponseParams;
 use crate::protocols::openai::chat_completions::NvCreateChatCompletionStreamResponse;
+use crate::protocols::unified::ResponsesContext;

 /// State machine that converts a chat completion stream into Responses API events.
 pub struct ResponseStreamConverter {
    response_id: String,
    model: String,
    params: ResponseParams,
+    /// Preserved Responses API-specific request context for faithful response reconstruction.
+    api_context: Option<ResponsesContext>,
    created_at: u64,
    sequence_number: u64,
    // Text message tracking
@@ -72,6 +75,7 @@ impl ResponseStreamConverter {
            response_id: format!("resp_{}", Uuid::new_v4().simple()),
            model,
            params,
+            api_context: None,
            created_at,
            sequence_number: 0,
            message_item_id: format!("msg_{}", Uuid::new_v4().simple()),
@@ -84,6 +88,12 @@ impl ResponseStreamConverter {
        }
    }

+    /// Create a converter seeded with the original Responses API request context.
+    ///
+    /// Builds the converter with [`Self::new`] and then stores `context` in
+    /// `api_context`. When a context is present, the Response objects this
+    /// converter builds echo the context's `store` and `previous_response_id`
+    /// instead of the context-free fallbacks.
+    pub fn with_context(model: String, params: ResponseParams, context: ResponsesContext) -> Self {
+        let mut converter = Self::new(model, params);
+        converter.api_context = Some(context);
+        converter
+    }
+
    fn next_seq(&mut self) -> u64 {
        let seq = self.sequence_number;
        self.sequence_number += 1;
@@ -116,7 +126,12 @@ impl ResponseStreamConverter {
            parallel_tool_calls: Some(true),
            presence_penalty: Some(0.0),
            // store: false because this branch does not persist responses.
-            store: self.params.store.or(Some(false)),
+            store: self
+                .api_context
+                .as_ref()
+                .map(|ctx| ctx.store)
+                .or(self.params.store)
+                .or(Some(false)),
            temperature: self.params.temperature.or(Some(1.0)),
            text: Some(self.params.text.clone().unwrap_or(ResponseTextParam {
                format: TextResponseFormatConfiguration::Text,
@@ -144,7 +159,10 @@ impl ResponseStreamConverter {
            instructions: self.params.instructions.clone().map(Instructions::Text),
            max_output_tokens: self.params.max_output_tokens,
            max_tool_calls: None,
-            previous_response_id: None,
+            previous_response_id: self
+                .api_context
+                .as_ref()
+                .and_then(|ctx| ctx.previous_response_id.clone()),
            prompt: None,
            prompt_cache_key: None,
            prompt_cache_retention: None,
@@ -654,6 +672,7 @@ fn get_event_type(event: &ResponseStreamEvent) -> &'static str {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::protocols::unified::ResponsesContext;
    use dynamo_async_openai::types::{
        ChatChoiceStream, ChatCompletionMessageContent, ChatCompletionMessageToolCallChunk,
        ChatCompletionStreamResponseDelta, ChatCompletionToolType, FunctionCallStream,
@@ -912,4 +931,41 @@ mod tests {
            "output_item.done inline after text: {tool_types:?}"
        );
    }
+
+    /// Verify that `with_context` populates `previous_response_id` and `store`
+    /// in the generated Response objects.
+    #[test]
+    fn test_with_context_enriches_response() {
+        let ctx = ResponsesContext {
+            previous_response_id: Some("resp_prev_123".to_string()),
+            store: true,
+            ..Default::default()
+        };
+        let params = ResponseParams::default();
+        let mut conv = ResponseStreamConverter::with_context("test-model".into(), params, ctx);
+
+        // Process one text chunk so there's output
+        // NOTE(review): the emitted start/chunk/end events are all discarded, so
+        // this test does not check the Response objects embedded in the stream.
+        let _ = conv.emit_start_events();
+        let _ = conv.process_chunk(&text_chunk("Hello"));
+        let _end_events = conv.emit_end_events();
+
+        // Verify the Response object carries the context values through.
+        // `make_response` is called directly with an empty output list.
+        let response = conv.make_response(Status::Completed, vec![]);
+        assert_eq!(
+            response.previous_response_id.as_deref(),
+            Some("resp_prev_123")
+        );
+        assert_eq!(response.store, Some(true));
+    }
+
+    /// Without context, previous_response_id is None and store defaults to false.
+    #[test]
+    fn test_without_context_defaults() {
+        // Default params and no context: `store` is expected to reach the
+        // converter's final `Some(false)` fallback.
+        let params = ResponseParams::default();
+        let conv = ResponseStreamConverter::new("test-model".into(), params);
+
+        let response = conv.make_response(Status::Completed, vec![]);
+        assert_eq!(response.previous_response_id, None);
+        assert_eq!(response.store, Some(false));
+    }
 }
--- a/lib/llm/src/protocols/unified.rs
+++ b/lib/llm/src/protocols/unified.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Unified internal request representation.
+//!
+//! `UnifiedRequest` is an API-agnostic wrapper that carries a fully-converted
+//! `NvCreateChatCompletionRequest` alongside the API-specific context that
+//! would otherwise be lost during the fan-in conversion.
+//!
+//! # Motivation
+//!
+//! Dynamo's HTTP frontend uses an hourglass architecture: multiple API surfaces
+//! (Chat Completions, Anthropic Messages, Responses) fan in through `TryFrom`
+//! to `NvCreateChatCompletionRequest`. Non-OpenAI features are lossy-compressed
+//! or silently dropped during this conversion. `UnifiedRequest` preserves that
+//! context so it can flow through the preprocessor and be used on the response
+//! path for faithful reconstruction.
+//!
+//! # Architecture
+//!
+//! ```text
+//! Anthropic Messages ──┐
+//! OpenAI Responses ────┼──→ UnifiedRequest { inner: NvCreateChatCompletionRequest, api_context }
+//! OpenAI Chat ─────────┘            │
+//!                                   ↓
+//!                          PreprocessedRequest ──→ Backend
+//! ```
+//!
+//! The existing preprocessor pipeline is unchanged — `UnifiedRequest` implements
+//! all the same traits (`OAIChatLikeRequest`, `SamplingOptionsProvider`, etc.)
+//! by delegating to the inner `NvCreateChatCompletionRequest`. The additional
+//! context fields are carried through for response-path use.
+
+use std::collections::HashMap;
+
+use dynamo_runtime::protocols::annotated::AnnotationsProvider;
+use serde::{Deserialize, Serialize};
+
+use crate::preprocessor::media::MediaDecoder;
+use crate::preprocessor::prompt::{OAIChatLikeRequest, TextInput};
+
+use crate::protocols::openai::chat_completions::NvCreateChatCompletionRequest;
+use crate::protocols::openai::common_ext::{CommonExt, CommonExtProvider};
+use crate::protocols::openai::nvext::{CacheControl, NvExt, NvExtProvider};
+use crate::protocols::openai::{
+    OpenAIOutputOptionsProvider, OpenAISamplingOptionsProvider, OpenAIStopConditionsProvider,
+};
+
+use dynamo_async_openai::types::responses::{IncludeEnum, Reasoning, Truncation};
+
+use super::anthropic::types::{AnthropicCreateMessageRequest, ThinkingConfig};
+use super::openai::responses::NvCreateResponse;
+
+/// Identifies which API surface originated the request and carries
+/// fields specific to that API that cannot be represented in the
+/// OpenAI Chat Completions format.
+///
+/// The variant is chosen by the conversion that builds the `UnifiedRequest`:
+/// `From<NvCreateChatCompletionRequest>` produces `ChatCompletions` and
+/// `TryFrom<AnthropicCreateMessageRequest>` produces `Anthropic`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum ApiContext {
+    /// Request came from the OpenAI Chat Completions API.
+    /// All fields are natively represented in `NvCreateChatCompletionRequest`.
+    ChatCompletions,
+
+    /// Request came from the Anthropic Messages API.
+    Anthropic(AnthropicContext),
+
+    /// Request came from the OpenAI Responses API.
+    Responses(ResponsesContext),
+}
+
+/// Anthropic-specific fields preserved from `AnthropicCreateMessageRequest`.
+///
+/// The derived `Default` leaves every field empty (`None`, an empty `Vec`, or
+/// `false`). When serialized, `None` options and an empty `cache_breakpoints`
+/// list are omitted; `disable_parallel_tool_use` is always written.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct AnthropicContext {
+    /// Extended thinking configuration (`type` + `budget_tokens`).
+    /// Dropped during conversion because `NvCreateChatCompletionRequest` has
+    /// no equivalent — only `reasoning_effort` (a string) exists.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub thinking: Option<ThinkingConfig>,
+
+    /// Per-block cache control breakpoints with their position in the
+    /// message array. The existing Anthropic→Chat Completions conversion
+    /// collapses all per-block `cache_control` annotations into a single
+    /// last-one-wins `nvext.cache_control` field. This preserves the full
+    /// per-block granularity for future use (e.g., multi-breakpoint prefix
+    /// caching, or faithfully reporting per-breakpoint `cache_creation_input_tokens`
+    /// / `cache_read_input_tokens` in the response).
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub cache_breakpoints: Vec<CacheBreakpoint>,
+
+    /// When true, the model should not issue parallel tool calls.
+    /// The Anthropic API supports `disable_parallel_tool_use` on the tool_choice
+    /// object but there is no OpenAI equivalent field.
+    #[serde(default)]
+    pub disable_parallel_tool_use: bool,
+
+    /// Anthropic request metadata (e.g. `user_id`), kept as untyped JSON.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<serde_json::Value>,
+
+    /// Service tier selection from the Anthropic request.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub service_tier: Option<String>,
+
+    /// Container identifier for stateful sandbox sessions.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub container: Option<String>,
+
+    /// Output configuration (effort level, JSON schema format), kept as
+    /// untyped JSON.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub output_config: Option<serde_json::Value>,
+}
+
+/// Responses API-specific fields preserved from `NvCreateResponse`.
+///
+/// The derived `Default` leaves every option `None` and `store` set to `false`.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct ResponsesContext {
+    /// Conversation continuation identifier.
+    /// Dropped during conversion — no OpenAI Chat equivalent.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub previous_response_id: Option<String>,
+
+    /// Context truncation strategy.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub truncation: Option<Truncation>,
+
+    /// Reasoning configuration (effort + optional summary generation).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub reasoning: Option<Reasoning>,
+
+    /// Output items to include in the response.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub include: Option<Vec<IncludeEnum>>,
+
+    /// Whether responses should be stored server-side.
+    ///
+    /// NOTE(review): a plain `bool` cannot distinguish an omitted `store` from
+    /// an explicit `false`, and the response builders take this value in
+    /// preference to `ResponseParams::store` whenever a context is present.
+    /// Confirm the `NvCreateResponse` conversion applies the intended default.
+    #[serde(default)]
+    pub store: bool,
+}
+
+/// A cache breakpoint records the position (message index, block index)
+/// and the cache control directive from the original Anthropic request.
+///
+/// Unlike the context structs this type has no `Default`, and all three
+/// fields are required when deserializing.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CacheBreakpoint {
+    /// Index of the message in the original messages array.
+    pub message_index: usize,
+    /// Index of the content block within the message (0 for plain-text messages).
+    pub block_index: usize,
+    /// The cache control directive.
+    pub cache_control: CacheControl,
+}
+
+/// API-agnostic request wrapper that preserves the full context from any
+/// API surface while remaining compatible with the existing preprocessor.
+///
+/// Unlike the context types it wraps, this struct derives only `Debug` and
+/// `Clone`; it is not `Serialize`/`Deserialize`.
+#[derive(Debug, Clone)]
+pub struct UnifiedRequest {
+    /// The core request in OpenAI Chat Completions format.
+    /// This is what the preprocessor already knows how to handle.
+    pub inner: NvCreateChatCompletionRequest,
+
+    /// Which API surface originated this request, plus API-specific fields
+    /// that were dropped during conversion to `NvCreateChatCompletionRequest`.
+    pub api_context: ApiContext,
+}
+
+impl From<NvCreateChatCompletionRequest> for UnifiedRequest {
+    /// Wraps a native Chat Completions request. Nothing is lost on this
+    /// path, so the context is the payload-free `ChatCompletions` marker.
+    fn from(req: NvCreateChatCompletionRequest) -> Self {
+        let api_context = ApiContext::ChatCompletions;
+        Self {
+            inner: req,
+            api_context,
+        }
+    }
+}
+
+impl TryFrom<AnthropicCreateMessageRequest> for UnifiedRequest {
+    type Error = anyhow::Error;
+
+    /// Converts an Anthropic Messages request, keeping the Anthropic-only
+    /// fields in an `AnthropicContext` sidecar.
+    ///
+    /// # Errors
+    /// Propagates any failure of the underlying conversion to
+    /// `NvCreateChatCompletionRequest`.
+    fn try_from(req: AnthropicCreateMessageRequest) -> Result<Self, Self::Error> {
+        // Snapshot everything first: the conversion further down consumes `req`.
+        let cache_breakpoints = extract_cache_breakpoints(&req);
+        let disable_parallel_tool_use = extract_disable_parallel_tool_use(&req);
+        let sidecar = AnthropicContext {
+            thinking: req.thinking.clone(),
+            cache_breakpoints,
+            disable_parallel_tool_use,
+            metadata: req.metadata.clone(),
+            service_tier: req.service_tier.clone(),
+            container: req.container.clone(),
+            output_config: req.output_config.clone(),
+        };
+        let api_context = ApiContext::Anthropic(sidecar);
+
+        // Existing (lossy) Anthropic -> Chat Completions conversion.
+        let inner: NvCreateChatCompletionRequest = req.try_into()?;
+
+        Ok(Self { inner, api_context })
+    }
+}
+
+impl TryFrom<NvCreateResponse> for UnifiedRequest {
+    type Error = anyhow::Error;
+
+    /// Converts a Responses API request, keeping the Responses-only fields
+    /// in a `ResponsesContext` sidecar.
+    ///
+    /// # Errors
+    /// Propagates any failure of the underlying conversion to
+    /// `NvCreateChatCompletionRequest`.
+    fn try_from(req: NvCreateResponse) -> Result<Self, Self::Error> {
+        // Snapshot everything first: the conversion further down consumes `req`.
+        let source = &req.inner;
+        let sidecar = ResponsesContext {
+            previous_response_id: source.previous_response_id.clone(),
+            truncation: source.truncation,
+            reasoning: source.reasoning.clone(),
+            include: source.include.clone(),
+            // An unset `store` is recorded as `false`.
+            store: source.store.unwrap_or(false),
+        };
+        let api_context = ApiContext::Responses(sidecar);
+
+        // Existing (lossy) Responses -> Chat Completions conversion.
+        let inner: NvCreateChatCompletionRequest = req.try_into()?;
+
+        Ok(Self { inner, api_context })
+    }
+}
+
+/// Collects every `cache_control` annotation of an Anthropic request together
+/// with its (message_index, block_index) position.
+///
+/// A system-level directive is reported at position (0, 0). Whenever a system
+/// prompt is present — with or without a directive — message positions are
+/// shifted up by one. Only block-style message content is inspected, and only
+/// text, tool-use, tool-result and thinking blocks contribute entries.
+fn extract_cache_breakpoints(req: &AnthropicCreateMessageRequest) -> Vec<CacheBreakpoint> {
+    use super::anthropic::types::{AnthropicContentBlock, AnthropicMessageContent};
+
+    // The system prompt is logically position 0.
+    let system_directive = req
+        .system
+        .as_ref()
+        .and_then(|system| system.cache_control.as_ref());
+    let mut found: Vec<CacheBreakpoint> = system_directive
+        .map(|directive| CacheBreakpoint {
+            message_index: 0,
+            block_index: 0,
+            cache_control: directive.clone(),
+        })
+        .into_iter()
+        .collect();
+
+    // 1 when a system prompt exists, 0 otherwise.
+    let shift = usize::from(req.system.is_some());
+
+    for (position, message) in req.messages.iter().enumerate() {
+        let AnthropicMessageContent::Blocks { content } = &message.content else {
+            continue;
+        };
+        let message_index = position + shift;
+
+        found.extend(content.iter().enumerate().filter_map(|(block_index, block)| {
+            let directive = match block {
+                AnthropicContentBlock::Text { cache_control, .. }
+                | AnthropicContentBlock::ToolUse { cache_control, .. }
+                | AnthropicContentBlock::ToolResult { cache_control, .. }
+                | AnthropicContentBlock::Thinking { cache_control, .. } => cache_control.as_ref(),
+                // Any other block kind contributes no breakpoint.
+                _ => None,
+            };
+            directive.map(|cache_control| CacheBreakpoint {
+                message_index,
+                block_index,
+                cache_control: cache_control.clone(),
+            })
+        }));
+    }
+
+    found
+}
+
+/// Reads `disable_parallel_tool_use` off the Anthropic `tool_choice`.
+///
+/// Anthropic accepts e.g. `{"type": "auto", "disable_parallel_tool_use": true}`,
+/// which has no OpenAI Chat counterpart. Returns `false` when no tool choice
+/// was given or when the flag is unset.
+fn extract_disable_parallel_tool_use(req: &AnthropicCreateMessageRequest) -> bool {
+    use super::anthropic::types::AnthropicToolChoice;
+
+    let Some(choice) = &req.tool_choice else {
+        return false;
+    };
+
+    let flag = match choice {
+        AnthropicToolChoice::Simple(simple) => simple.disable_parallel_tool_use,
+        AnthropicToolChoice::Named(named) => named.disable_parallel_tool_use,
+    };
+    flag.unwrap_or(false)
+}
+
+// Trait implementations — delegate to inner NvCreateChatCompletionRequest
+
+impl NvExtProvider for UnifiedRequest {
+    /// NVIDIA extension block of the wrapped chat request, if one was sent.
+    fn nvext(&self) -> Option<&NvExt> {
+        let wrapped = &self.inner;
+        wrapped.nvext.as_ref()
+    }
+
+    /// A unified request never carries a raw prompt.
+    fn raw_prompt(&self) -> Option<String> {
+        None
+    }
+
+    /// Returns the single collapsed cache control stored in `nvext`: the
+    /// last-one-wins value produced by the Anthropic→Chat Completions
+    /// conversion, which is enough for backends with a single prefix cache
+    /// boundary (SGLang, vLLM). Per-block granularity lives in
+    /// `AnthropicContext::cache_breakpoints` on the `ApiContext` sidecar.
+    fn effective_cache_control(&self) -> Option<&CacheControl> {
+        let ext = self.inner.nvext.as_ref()?;
+        ext.cache_control.as_ref()
+    }
+}
+
+impl AnnotationsProvider for UnifiedRequest {
+    /// Returns an owned copy of the annotations listed in the wrapped
+    /// request's `nvext`, or `None` when there is no `nvext` or no list.
+    fn annotations(&self) -> Option<Vec<String>> {
+        self.inner
+            .nvext
+            .as_ref()
+            .and_then(|nvext| nvext.annotations.clone())
+    }
+
+    /// Reports whether `annotation` appears in the wrapped request's
+    /// annotation list. Returns `false` when there is no `nvext` or no list.
+    fn has_annotation(&self, annotation: &str) -> bool {
+        self.inner
+            .nvext
+            .as_ref()
+            .and_then(|nvext| nvext.annotations.as_ref())
+            // Compare borrowed strings instead of allocating a `String` per call.
+            .is_some_and(|annotations| annotations.iter().any(|entry| entry == annotation))
+    }
+}
+
+// Sampling options are read from the wrapped chat request; `get_best_of`
+// reuses the wrapped request's own trait implementation.
+impl OpenAISamplingOptionsProvider for UnifiedRequest {
+    /// `temperature` of the wrapped chat request.
+    fn get_temperature(&self) -> Option<f32> {
+        self.inner.inner.temperature
+    }
+
+    /// `top_p` of the wrapped chat request.
+    fn get_top_p(&self) -> Option<f32> {
+        self.inner.inner.top_p
+    }
+
+    /// `frequency_penalty` of the wrapped chat request.
+    fn get_frequency_penalty(&self) -> Option<f32> {
+        self.inner.inner.frequency_penalty
+    }
+
+    /// `presence_penalty` of the wrapped chat request.
+    fn get_presence_penalty(&self) -> Option<f32> {
+        self.inner.inner.presence_penalty
+    }
+
+    /// NVIDIA extension block of the wrapped chat request, if any.
+    fn nvext(&self) -> Option<&NvExt> {
+        self.inner.nvext.as_ref()
+    }
+
+    /// `seed` of the wrapped chat request.
+    fn get_seed(&self) -> Option<i64> {
+        self.inner.inner.seed
+    }
+
+    /// `n` of the wrapped chat request.
+    fn get_n(&self) -> Option<u8> {
+        self.inner.inner.n
+    }
+
+    /// Delegates to the wrapped request's `get_best_of` implementation.
+    fn get_best_of(&self) -> Option<u8> {
+        OpenAISamplingOptionsProvider::get_best_of(&self.inner)
+    }
+}
+
+// Common extension fields are served from `self.inner.common`; only
+// `get_guided_json` goes through the wrapped request's trait implementation.
+impl CommonExtProvider for UnifiedRequest {
+    /// Always `Some`: the wrapped request owns a `CommonExt` value.
+    fn common_ext(&self) -> Option<&CommonExt> {
+        Some(&self.inner.common)
+    }
+
+    /// Guided-JSON schema as derived by the wrapped request.
+    fn get_guided_json(&self) -> Option<serde_json::Value> {
+        // Delegate to the inner impl which handles tool_choice → guided_json
+        // and response_format → guided_json derivation.
+        CommonExtProvider::get_guided_json(&self.inner)
+    }
+
+    /// Owned copy of `common.guided_regex`.
+    fn get_guided_regex(&self) -> Option<String> {
+        self.inner.common.guided_regex.clone()
+    }
+
+    /// Owned copy of `common.guided_grammar`.
+    fn get_guided_grammar(&self) -> Option<String> {
+        self.inner.common.guided_grammar.clone()
+    }
+
+    /// Owned copy of `common.guided_choice`.
+    fn get_guided_choice(&self) -> Option<Vec<String>> {
+        self.inner.common.guided_choice.clone()
+    }
+
+    /// Owned copy of `common.guided_decoding_backend`.
+    fn get_guided_decoding_backend(&self) -> Option<String> {
+        self.inner.common.guided_decoding_backend.clone()
+    }
+
+    /// Owned copy of `common.guided_whitespace_pattern`.
+    fn get_guided_whitespace_pattern(&self) -> Option<String> {
+        self.inner.common.guided_whitespace_pattern.clone()
+    }
+
+    /// `common.top_k` of the wrapped request.
+    fn get_top_k(&self) -> Option<i32> {
+        self.inner.common.top_k
+    }
+
+    /// `common.min_p` of the wrapped request.
+    fn get_min_p(&self) -> Option<f32> {
+        self.inner.common.min_p
+    }
+
+    /// `common.repetition_penalty` of the wrapped request.
+    fn get_repetition_penalty(&self) -> Option<f32> {
+        self.inner.common.repetition_penalty
+    }
+
+    /// `common.include_stop_str_in_output` of the wrapped request.
+    fn get_include_stop_str_in_output(&self) -> Option<bool> {
+        self.inner.common.include_stop_str_in_output
+    }
+
+    /// `common.skip_special_tokens` of the wrapped request.
+    fn get_skip_special_tokens(&self) -> Option<bool> {
+        self.inner.common.skip_special_tokens
+    }
+}
+
+impl OpenAIStopConditionsProvider for UnifiedRequest {
+    /// Token budget of the wrapped chat request: `max_completion_tokens`
+    /// when set, otherwise the deprecated `max_tokens`.
+    #[allow(deprecated)]
+    fn get_max_tokens(&self) -> Option<u32> {
+        let chat = &self.inner.inner;
+        chat.max_completion_tokens.or(chat.max_tokens)
+    }
+
+    /// `common.min_tokens` of the wrapped request.
+    fn get_min_tokens(&self) -> Option<u32> {
+        let common = &self.inner.common;
+        common.min_tokens
+    }
+
+    /// Stop sequences as a list; a single stop string becomes a
+    /// one-element vector.
+    fn get_stop(&self) -> Option<Vec<String>> {
+        use dynamo_async_openai::types::Stop;
+
+        let stop = self.inner.inner.stop.as_ref()?;
+        let sequences = match stop {
+            Stop::String(single) => vec![single.clone()],
+            Stop::StringArray(many) => many.clone(),
+        };
+        Some(sequences)
+    }
+
+    /// NVIDIA extension block of the wrapped chat request, if any.
+    fn nvext(&self) -> Option<&NvExt> {
+        let wrapped = &self.inner;
+        wrapped.nvext.as_ref()
+    }
+
+    /// `common.ignore_eos` of the wrapped request.
+    fn get_common_ignore_eos(&self) -> Option<bool> {
+        let common = &self.inner.common;
+        common.ignore_eos
+    }
+}
+
+impl OpenAIOutputOptionsProvider for UnifiedRequest {
+    /// Number of log-probabilities to return. `None` unless `logprobs` is
+    /// explicitly `true`; then `top_logprobs` when set, otherwise 1.
+    fn get_logprobs(&self) -> Option<u32> {
+        let chat = &self.inner.inner;
+        if !matches!(chat.logprobs, Some(true)) {
+            return None;
+        }
+        let count = chat.top_logprobs.map_or(1_u32, |top| top as u32);
+        Some(count)
+    }
+
+    /// Delegates to the wrapped request's `get_prompt_logprobs`.
+    fn get_prompt_logprobs(&self) -> Option<u32> {
+        let wrapped = &self.inner;
+        OpenAIOutputOptionsProvider::get_prompt_logprobs(wrapped)
+    }
+
+    /// Delegates to the wrapped request's `get_skip_special_tokens`.
+    fn get_skip_special_tokens(&self) -> Option<bool> {
+        let wrapped = &self.inner;
+        OpenAIOutputOptionsProvider::get_skip_special_tokens(wrapped)
+    }
+
+    /// Delegates to the wrapped request's `get_formatted_prompt`.
+    fn get_formatted_prompt(&self) -> Option<bool> {
+        let wrapped = &self.inner;
+        OpenAIOutputOptionsProvider::get_formatted_prompt(wrapped)
+    }
+}
+
+// Chat-template view of the request, backed by the wrapped chat request.
+impl OAIChatLikeRequest for UnifiedRequest {
+    /// Owned copy of the model name from the wrapped chat request.
+    fn model(&self) -> String {
+        self.inner.inner.model.clone()
+    }
+
+    /// Messages of the wrapped request as a template value.
+    ///
+    /// # Panics
+    /// Panics if the messages cannot be serialized to JSON (`unwrap`).
+    fn messages(&self) -> minijinja::value::Value {
+        // Go through `serde_json::Value` first, then hand that to minijinja.
+        let messages_json = serde_json::to_value(&self.inner.inner.messages).unwrap();
+        minijinja::value::Value::from_serialize(&messages_json)
+    }
+
+    /// Borrowed, typed view of the same messages; always `Some`.
+    fn typed_messages(
+        &self,
+    ) -> Option<&[dynamo_async_openai::types::ChatCompletionRequestMessage]> {
+        Some(self.inner.inner.messages.as_slice())
+    }
+
+    /// Delegates to the wrapped request's `tools`.
+    fn tools(&self) -> Option<minijinja::value::Value> {
+        OAIChatLikeRequest::tools(&self.inner)
+    }
+
+    /// Delegates to the wrapped request's `tool_choice`.
+    fn tool_choice(&self) -> Option<minijinja::value::Value> {
+        OAIChatLikeRequest::tool_choice(&self.inner)
+    }
+
+    /// Delegates to the wrapped request's `response_format`.
+    fn response_format(&self) -> Option<minijinja::value::Value> {
+        OAIChatLikeRequest::response_format(&self.inner)
+    }
+
+    /// Delegates to the wrapped request's `should_add_generation_prompt`.
+    fn should_add_generation_prompt(&self) -> bool {
+        OAIChatLikeRequest::should_add_generation_prompt(&self.inner)
+    }
+
+    /// Delegates to the wrapped request's `extract_text`.
+    fn extract_text(&self) -> Option<TextInput> {
+        OAIChatLikeRequest::extract_text(&self.inner)
+    }
+
+    /// Chat template arguments stored on the wrapped request, if any.
+    fn chat_template_args(&self) -> Option<&HashMap<String, serde_json::Value>> {
+        self.inner.chat_template_args.as_ref()
+    }
+
+    /// Media decoding options stored on the wrapped request, if any.
+    fn media_io_kwargs(&self) -> Option<&MediaDecoder> {
+        self.inner.media_io_kwargs.as_ref()
+    }
+}
+
+impl UnifiedRequest {
+    /// Anthropic sidecar, available only when the request came in through
+    /// the Anthropic Messages API; `None` for every other origin.
+    pub fn anthropic_context(&self) -> Option<&AnthropicContext> {
+        if let ApiContext::Anthropic(ctx) = &self.api_context {
+            Some(ctx)
+        } else {
+            None
+        }
+    }
+
+    /// Responses sidecar, available only when the request came in through
+    /// the OpenAI Responses API; `None` for every other origin.
+    pub fn responses_context(&self) -> Option<&ResponsesContext> {
+        if let ApiContext::Responses(ctx) = &self.api_context {
+            Some(ctx)
+        } else {
+            None
+        }
+    }
+
+    /// Consumes the wrapper and hands back the inner
+    /// `NvCreateChatCompletionRequest`, discarding the API context.
+    /// Handy during gradual migration: callers that have no use for the
+    /// extra context can unwrap and keep using the existing code paths.
+    pub fn into_inner(self) -> NvCreateChatCompletionRequest {
+        let Self { inner, .. } = self;
+        inner
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Wrapping a chat request tags it as `ChatCompletions`, and
+    /// `into_inner` hands the same request back.
+    #[test]
+    fn test_chat_completions_roundtrip() {
+        let req = NvCreateChatCompletionRequest {
+            inner: dynamo_async_openai::types::CreateChatCompletionRequest {
+                model: "test-model".to_string(),
+                messages: vec![],
+                ..Default::default()
+            },
+            common: CommonExt::default(),
+            nvext: None,
+            chat_template_args: None,
+            media_io_kwargs: None,
+            unsupported_fields: Default::default(),
+        };
+
+        // `req` is not needed afterwards, so it is moved rather than cloned.
+        let unified = UnifiedRequest::from(req);
+        assert!(matches!(unified.api_context, ApiContext::ChatCompletions));
+        assert!(unified.anthropic_context().is_none());
+        assert!(unified.responses_context().is_none());
+        assert_eq!(unified.model(), "test-model");
+
+        // Complete the round trip back to the plain chat request.
+        let restored = unified.into_inner();
+        assert_eq!(restored.inner.model, "test-model");
+        assert!(restored.inner.messages.is_empty());
+        assert!(restored.nvext.is_none());
+    }
+
+    /// Anthropic-only fields survive the conversion in the sidecar.
+    #[test]
+    fn test_anthropic_context_preserved() {
+        use super::super::anthropic::types::*;
+
+        let req = AnthropicCreateMessageRequest {
+            model: "claude-sonnet-4-20250514".to_string(),
+            max_tokens: 1024,
+            messages: vec![AnthropicMessage {
+                role: AnthropicRole::User,
+                content: AnthropicMessageContent::Text {
+                    content: "Hello".to_string(),
+                },
+            }],
+            system: None,
+            temperature: Some(0.7),
+            top_p: None,
+            top_k: None,
+            stop_sequences: None,
+            stream: true,
+            metadata: Some(serde_json::json!({"user_id": "test"})),
+            tools: None,
+            tool_choice: None,
+            cache_control: None,
+            thinking: Some(ThinkingConfig {
+                thinking_type: "enabled".to_string(),
+                budget_tokens: Some(4096),
+            }),
+            service_tier: None,
+            container: None,
+            output_config: None,
+        };
+
+        let unified = UnifiedRequest::try_from(req).unwrap();
+
+        // Verify the context was preserved
+        let ctx = unified.anthropic_context().unwrap();
+        assert!(unified.responses_context().is_none());
+        assert!(ctx.thinking.is_some());
+        assert_eq!(ctx.thinking.as_ref().unwrap().thinking_type, "enabled");
+        assert_eq!(ctx.thinking.as_ref().unwrap().budget_tokens, Some(4096));
+        assert_eq!(ctx.metadata, Some(serde_json::json!({"user_id": "test"})));
+
+        // Plain-text content without a system prompt yields no breakpoints,
+        // and a missing tool_choice leaves parallel tool use enabled.
+        assert!(ctx.cache_breakpoints.is_empty());
+        assert!(!ctx.disable_parallel_tool_use);
+
+        // Verify it still works as a preprocessor input
+        assert_eq!(unified.model(), "claude-sonnet-4-20250514");
+        assert!(unified.extract_text().is_some());
+    }
+
+    /// Responses-only fields survive the conversion in the sidecar.
+    #[test]
+    fn test_responses_context_preserved() {
+        // Construct an NvCreateResponse via JSON to satisfy all required fields
+        let json = serde_json::json!({
+            "model": "gpt-4o",
+            "input": "What is the capital of France?",
+            "previous_response_id": "resp_abc123",
+            "store": true,
+            "truncation": "auto",
+            "reasoning": {
+                "effort": "medium"
+            },
+            "include": ["message.output_text.logprobs"]
+        });
+        let req: NvCreateResponse = serde_json::from_value(json).unwrap();
+
+        let unified = UnifiedRequest::try_from(req).unwrap();
+
+        let ctx = unified.responses_context().unwrap();
+        assert!(unified.anthropic_context().is_none());
+        assert_eq!(ctx.previous_response_id.as_deref(), Some("resp_abc123"));
+        assert!(ctx.store);
+        assert!(ctx.truncation.is_some());
+        assert!(ctx.reasoning.is_some());
+        assert!(ctx.include.is_some());
+        assert_eq!(ctx.include.as_ref().unwrap().len(), 1);
+
+        // Verify it still works as a preprocessor input
+        assert_eq!(unified.model(), "gpt-4o");
+    }
+}