refactor(1/3): move `nvext` to `dynamo-llm` and move `anthropic` to `dynamo-async-openai` (#7564)

2887cd1c · ishandhanani · GitHub · d6136f4a · 2887cd1c · 2887cd1c
Unverified commit 2887cd1c, authored Mar 30, 2026 by ishandhanani and committed by GitHub on Mar 30, 2026
12 changed files
--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -214,50 +214,9 @@ pub struct AgentHints {
    pub latency_sensitivity: Option<f64>,
 }
-/// Anthropic-style cache control hint for prefix pinning with TTL.
+// Re-export CacheControl types from dynamo-async-openai where they are canonically defined
-#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
+// alongside the Anthropic protocol types they originate from.
-pub struct CacheControl {
+pub use dynamo_async_openai::types::anthropic::{CacheControl, CacheControlType};
-    #[serde(rename = "type")]
-    pub control_type: CacheControlType,
-    /// TTL as seconds (integer) or shorthand ("5m" = 300s, "1h" = 3600s). Clamped to [300, 3600].
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub ttl: Option<String>,
-}
-#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
-#[serde(rename_all = "lowercase")]
-pub enum CacheControlType {
-    #[default]
-    Ephemeral,
-    #[serde(other)]
-    Unknown,
-}
-const MIN_TTL_SECONDS: u64 = 300;
-const MAX_TTL_SECONDS: u64 = 3600;
-impl CacheControl {
-    /// Parse TTL string to seconds, clamped to [300, 3600].
-    ///
-    /// Accepts integer seconds ("120", "600") or shorthand ("5m", "1h").
-    /// Values below 300 are clamped to 300; values above 3600 are clamped to 3600.
-    /// Unrecognized strings default to 300s.
-    pub fn ttl_seconds(&self) -> u64 {
-        let raw = match self.ttl.as_deref() {
-            None => return MIN_TTL_SECONDS,
-            Some("5m") => 300,
-            Some("1h") => 3600,
-            Some(other) => match other.parse::<u64>() {
-                Ok(secs) => secs,
-                Err(_) => {
-                    tracing::warn!("Unrecognized TTL '{}', defaulting to 300s", other);
-                    return MIN_TTL_SECONDS;
-                }
-            },
-        };
-        raw.clamp(MIN_TTL_SECONDS, MAX_TTL_SECONDS)
-    }
-}
 impl Default for NvExt {
    fn default() -> Self {

--- a/lib/llm/src/protocols/openai/responses/mod.rs
+++ b/lib/llm/src/protocols/openai/responses/mod.rs
@@ -696,8 +696,8 @@ pub fn chat_completion_to_response(
    nv_resp: NvCreateChatCompletionResponse,
    params: &ResponseParams,
 ) -> Result<NvResponse, anyhow::Error> {
-    let chat_resp = nv_resp;
+    let nvext = nv_resp.nvext.clone();
-    let nvext = chat_resp.nvext.clone();
+    let chat_resp = nv_resp.inner;
    let message_id = format!("msg_{}", Uuid::new_v4().simple());
    let response_id = format!("resp_{}", Uuid::new_v4().simple());
@@ -1163,32 +1163,34 @@ mod tests {
    fn test_into_nvresponse_from_chat_response() {
        let now = 1_726_000_000;
        let chat_resp = NvCreateChatCompletionResponse {
-            id: "chatcmpl-xyz".into(),
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
-            choices: vec![dynamo_async_openai::types::ChatChoice {
+                id: "chatcmpl-xyz".into(),
-                index: 0,
+                choices: vec![dynamo_async_openai::types::ChatChoice {
-                message: dynamo_async_openai::types::ChatCompletionResponseMessage {
+                    index: 0,
-                    content: Some(
+                    message: dynamo_async_openai::types::ChatCompletionResponseMessage {
-                        dynamo_async_openai::types::ChatCompletionMessageContent::Text(
+                        content: Some(
-                            "This is a reply".to_string(),
+                            dynamo_async_openai::types::ChatCompletionMessageContent::Text(
+                                "This is a reply".to_string(),
+                            ),
                        ),
-                    ),
+                        refusal: None,
-                    refusal: None,
+                        tool_calls: None,
-                    tool_calls: None,
+                        role: dynamo_async_openai::types::Role::Assistant,
-                    role: dynamo_async_openai::types::Role::Assistant,
+                        function_call: None,
-                    function_call: None,
+                        audio: None,
-                    audio: None,
+                        reasoning_content: None,
-                    reasoning_content: None,
+                    },
-                },
+                    finish_reason: None,
-                finish_reason: None,
+                    stop_reason: None,
-                stop_reason: None,
+                    logprobs: None,
-                logprobs: None,
+                }],
-            }],
+                created: now,
-            created: now,
+                model: "llama-3.1-8b-instruct".into(),
-            model: "llama-3.1-8b-instruct".into(),
+                service_tier: None,
-            service_tier: None,
+                system_fingerprint: None,
-            system_fingerprint: None,
+                object: "chat.completion".to_string(),
-            object: "chat.completion".to_string(),
+                usage: None,
-            usage: None,
+            },
            nvext: None,
        };
@@ -1218,35 +1220,37 @@ mod tests {
    fn test_response_with_tool_calls() {
        let now = 1_726_000_000;
        let chat_resp = NvCreateChatCompletionResponse {
-            id: "chatcmpl-xyz".into(),
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
-            choices: vec![dynamo_async_openai::types::ChatChoice {
+                id: "chatcmpl-xyz".into(),
-                index: 0,
+                choices: vec![dynamo_async_openai::types::ChatChoice {
-                message: dynamo_async_openai::types::ChatCompletionResponseMessage {
+                    index: 0,
-                    content: None,
+                    message: dynamo_async_openai::types::ChatCompletionResponseMessage {
-                    refusal: None,
+                        content: None,
-                    tool_calls: Some(vec![ChatCompletionMessageToolCall {
+                        refusal: None,
-                        id: "call_abc".into(),
+                        tool_calls: Some(vec![ChatCompletionMessageToolCall {
-                        r#type: ChatCompletionToolType::Function,
+                            id: "call_abc".into(),
-                        function: dynamo_async_openai::types::FunctionCall {
+                            r#type: ChatCompletionToolType::Function,
-                            name: "get_weather".into(),
+                            function: dynamo_async_openai::types::FunctionCall {
-                            arguments: r#"{"location":"SF"}"#.into(),
+                                name: "get_weather".into(),
-                        },
+                                arguments: r#"{"location":"SF"}"#.into(),
-                    }]),
+                            },
-                    role: dynamo_async_openai::types::Role::Assistant,
+                        }]),
-                    function_call: None,
+                        role: dynamo_async_openai::types::Role::Assistant,
-                    audio: None,
+                        function_call: None,
-                    reasoning_content: None,
+                        audio: None,
-                },
+                        reasoning_content: None,
-                finish_reason: None,
+                    },
-                stop_reason: None,
+                    finish_reason: None,
-                logprobs: None,
+                    stop_reason: None,
-            }],
+                    logprobs: None,
-            created: now,
+                }],
-            model: "test-model".into(),
+                created: now,
-            service_tier: None,
+                model: "test-model".into(),
-            system_fingerprint: None,
+                service_tier: None,
-            object: "chat.completion".to_string(),
+                system_fingerprint: None,
-            usage: None,
+                object: "chat.completion".to_string(),
+                usage: None,
+            },
            nvext: None,
        };
@@ -1432,14 +1436,16 @@ thinking
        };
        let chat_resp = NvCreateChatCompletionResponse {
-            choices: vec![],
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
-            created: 0,
+                choices: vec![],
-            id: "test".into(),
+                created: 0,
-            model: "m".into(),
+                id: "test".into(),
-            service_tier: None,
+                model: "m".into(),
-            system_fingerprint: None,
+                service_tier: None,
-            object: "chat.completion".into(),
+                system_fingerprint: None,
-            usage: None,
+                object: "chat.completion".into(),
+                usage: None,
+            },
            nvext: None,
        };
@@ -1463,14 +1469,16 @@ thinking
        };
        let chat_resp = NvCreateChatCompletionResponse {
-            choices: vec![],
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
-            created: 0,
+                choices: vec![],
-            id: "test".into(),
+                created: 0,
-            model: "m".into(),
+                id: "test".into(),
-            service_tier: None,
+                model: "m".into(),
-            system_fingerprint: None,
+                service_tier: None,
-            object: "chat.completion".into(),
+                system_fingerprint: None,
-            usage: None,
+                object: "chat.completion".into(),
+                usage: None,
+            },
            nvext: None,
        };
@@ -1489,14 +1497,16 @@ thinking
        };
        let chat_resp = NvCreateChatCompletionResponse {
-            choices: vec![],
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
-            created: 0,
+                choices: vec![],
-            id: "test".into(),
+                created: 0,
-            model: "m".into(),
+                id: "test".into(),
-            service_tier: None,
+                model: "m".into(),
-            system_fingerprint: None,
+                service_tier: None,
-            object: "chat.completion".into(),
+                system_fingerprint: None,
-            usage: None,
+                object: "chat.completion".into(),
+                usage: None,
+            },
            nvext: None,
        };
@@ -1555,29 +1565,31 @@ thinking
            ChatChoice, ChatCompletionMessageContent, ChatCompletionResponseMessage, FinishReason,
        };
        NvCreateChatCompletionResponse {
-            choices: vec![ChatChoice {
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
-                index: 0,
+                choices: vec![ChatChoice {
-                #[allow(deprecated)]
+                    index: 0,
-                message: ChatCompletionResponseMessage {
+                    #[allow(deprecated)]
-                    content: Some(ChatCompletionMessageContent::Text(text.into())),
+                    message: ChatCompletionResponseMessage {
-                    role: dynamo_async_openai::types::Role::Assistant,
+                        content: Some(ChatCompletionMessageContent::Text(text.into())),
-                    tool_calls: None,
+                        role: dynamo_async_openai::types::Role::Assistant,
-                    refusal: None,
+                        tool_calls: None,
-                    reasoning_content: None,
+                        refusal: None,
-                    function_call: None,
+                        reasoning_content: None,
-                    audio: None,
+                        function_call: None,
-                },
+                        audio: None,
-                finish_reason: Some(FinishReason::Stop),
+                    },
-                stop_reason: None,
+                    finish_reason: Some(FinishReason::Stop),
-                logprobs: None,
+                    stop_reason: None,
-            }],
+                    logprobs: None,
-            created: 0,
+                }],
-            id: "test".into(),
+                created: 0,
-            model: "m".into(),
+                id: "test".into(),
-            service_tier: None,
+                model: "m".into(),
-            system_fingerprint: None,
+                service_tier: None,
-            object: "chat.completion".into(),
+                system_fingerprint: None,
-            usage: None,
+                object: "chat.completion".into(),
+                usage: None,
+            },
            nvext: None,
        }
    }

--- a/lib/llm/src/protocols/openai/responses/stream_converter.rs
+++ b/lib/llm/src/protocols/openai/responses/stream_converter.rs
@@ -183,7 +183,7 @@ impl ResponseStreamConverter {
        let mut events = Vec::new();
        // Capture usage stats from the final chunk (sent when stream_options.include_usage=true)
-        if let Some(ref u) = chunk.usage {
+        if let Some(ref u) = chunk.inner.usage {
            self.usage = Some(ResponseUsage {
                input_tokens: u.prompt_tokens,
                input_tokens_details: InputTokenDetails {
@@ -205,7 +205,7 @@ impl ResponseStreamConverter {
            });
        }
-        for choice in &chunk.choices {
+        for choice in &chunk.inner.choices {
            let delta = &choice.delta;
            // Handle text content deltas — extract text from the enum
@@ -685,35 +685,37 @@ mod tests {
    ) -> NvCreateChatCompletionStreamResponse {
        #[allow(deprecated)]
        NvCreateChatCompletionStreamResponse {
-            id: "chat-1".into(),
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-            choices: vec![ChatChoiceStream {
+                id: "chat-1".into(),
-                index: 0,
+                choices: vec![ChatChoiceStream {
-                delta: ChatCompletionStreamResponseDelta {
+                    index: 0,
-                    content: None,
+                    delta: ChatCompletionStreamResponseDelta {
-                    function_call: None,
+                        content: None,
-                    tool_calls: Some(vec![ChatCompletionMessageToolCallChunk {
+                        function_call: None,
-                        index: tc_index,
+                        tool_calls: Some(vec![ChatCompletionMessageToolCallChunk {
-                        id: id.map(String::from),
+                            index: tc_index,
-                        r#type: Some(ChatCompletionToolType::Function),
+                            id: id.map(String::from),
-                        function: Some(FunctionCallStream {
+                            r#type: Some(ChatCompletionToolType::Function),
-                            name: name.map(String::from),
+                            function: Some(FunctionCallStream {
-                            arguments: args.map(String::from),
+                                name: name.map(String::from),
-                        }),
+                                arguments: args.map(String::from),
-                    }]),
+                            }),
-                    role: None,
+                        }]),
-                    refusal: None,
+                        role: None,
-                    reasoning_content: None,
+                        refusal: None,
-                },
+                        reasoning_content: None,
-                finish_reason: None,
+                    },
-                stop_reason: None,
+                    finish_reason: None,
-                logprobs: None,
+                    stop_reason: None,
-            }],
+                    logprobs: None,
-            created: 0,
+                }],
-            model: "test".into(),
+                created: 0,
-            service_tier: None,
+                model: "test".into(),
-            system_fingerprint: None,
+                service_tier: None,
-            object: "chat.completion.chunk".into(),
+                system_fingerprint: None,
-            usage: None,
+                object: "chat.completion.chunk".into(),
+                usage: None,
+            },
            nvext: None,
        }
    }
@@ -721,27 +723,29 @@ mod tests {
    fn text_chunk(text: &str) -> NvCreateChatCompletionStreamResponse {
        #[allow(deprecated)]
        NvCreateChatCompletionStreamResponse {
-            id: "chat-1".into(),
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-            choices: vec![ChatChoiceStream {
+                id: "chat-1".into(),
-                index: 0,
+                choices: vec![ChatChoiceStream {
-                delta: ChatCompletionStreamResponseDelta {
+                    index: 0,
-                    content: Some(ChatCompletionMessageContent::Text(text.into())),
+                    delta: ChatCompletionStreamResponseDelta {
-                    function_call: None,
+                        content: Some(ChatCompletionMessageContent::Text(text.into())),
-                    tool_calls: None,
+                        function_call: None,
-                    role: None,
+                        tool_calls: None,
-                    refusal: None,
+                        role: None,
-                    reasoning_content: None,
+                        refusal: None,
-                },
+                        reasoning_content: None,
-                finish_reason: None,
+                    },
-                stop_reason: None,
+                    finish_reason: None,
-                logprobs: None,
+                    stop_reason: None,
-            }],
+                    logprobs: None,
-            created: 0,
+                }],
-            model: "test".into(),
+                created: 0,
-            service_tier: None,
+                model: "test".into(),
-            system_fingerprint: None,
+                service_tier: None,
-            object: "chat.completion.chunk".into(),
+                system_fingerprint: None,
-            usage: None,
+                object: "chat.completion.chunk".into(),
+                usage: None,
+            },
            nvext: None,
        }
    }

--- a/lib/llm/tests/aggregators.rs
+++ b/lib/llm/tests/aggregators.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-use dynamo_async_openai::types::ChatCompletionMessageContent;
+use dynamo_async_openai::types::{
+    ChatChoiceStream, ChatCompletionMessageContent, ChatCompletionStreamResponseDelta,
+    CreateChatCompletionStreamResponse, Role,
+};
 use dynamo_llm::protocols::{
-    ContentProvider, DataStream,
+    Annotated, ContentProvider, DataStream,
    codec::{Message, SseCodecError, create_message_stream},
    openai::{
        ParsingOptions,
-        chat_completions::{NvCreateChatCompletionResponse, aggregator::ChatCompletionAggregator},
+        chat_completions::{
+            NvCreateChatCompletionResponse, NvCreateChatCompletionStreamResponse,
+            aggregator::ChatCompletionAggregator,
+        },
        completions::NvCreateCompletionResponse,
    },
 };
@@ -45,6 +51,7 @@ async fn test_openai_chat_stream() {
    assert_eq!(
        get_text(
            result
+                .inner
                .choices
                .first()
                .unwrap()
@@ -70,6 +77,7 @@ async fn test_openai_chat_edge_case_multi_line_data() {
    assert_eq!(
        get_text(
            result
+                .inner
                .choices
                .first()
                .unwrap()
@@ -95,6 +103,7 @@ async fn test_openai_chat_edge_case_comments_per_response() {
    assert_eq!(
        get_text(
            result
+                .inner
                .choices
                .first()
                .unwrap()
@@ -138,3 +147,113 @@ async fn test_openai_cmpl_stream() {
        " This is a question that is often asked by those outside of AI research and development"
    );
 }
+// ===================================
+// nvext aggregation regression tests
+// ===================================
+#[allow(deprecated)] // NOTE(review): presumably required because the delta literal below sets `function_call` — confirm it is the deprecated field
+fn make_stream_delta( // Test helper: builds one stream chunk and wraps it with `Annotated::from_data`
+    content: Option<&str>, // `Some(text)` yields a single choice at index 0 carrying `text`; `None` yields no choices
+    nvext: Option<serde_json::Value>, // stored unchanged in the chunk's `nvext` field
+) -> Annotated<NvCreateChatCompletionStreamResponse> {
+    Annotated::from_data(NvCreateChatCompletionStreamResponse {
+        inner: CreateChatCompletionStreamResponse {
+            id: "test-id".to_string(),
+            choices: if let Some(text) = content {
+                vec![ChatChoiceStream {
+                    index: 0,
+                    delta: ChatCompletionStreamResponseDelta {
+                        content: Some(ChatCompletionMessageContent::Text(text.to_string())),
+                        function_call: None,
+                        tool_calls: None,
+                        role: Some(Role::Assistant),
+                        refusal: None,
+                        reasoning_content: None,
+                    },
+                    finish_reason: None,
+                    stop_reason: None,
+                    logprobs: None,
+                }]
+            } else {
+                vec![] // no content requested: emit a chunk without any choices
+            },
+            created: 1234567890,
+            model: "test-model".to_string(),
+            service_tier: None,
+            system_fingerprint: None,
+            object: "chat.completion.chunk".to_string(),
+            usage: None,
+        },
+        nvext,
+    })
+}
+/// Verify that an nvext value carried only by the middle of three stream deltas survives aggregation into the final response, and that the three text deltas aggregate to "Hello world!".
+#[tokio::test]
+async fn test_nvext_passthrough_aggregation() {
+    let nvext_value = serde_json::json!({"custom_field": "test_value"});
+    let deltas = vec![
+        make_stream_delta(Some("Hello"), None),
+        make_stream_delta(Some(" world"), Some(nvext_value.clone())), // the only delta that carries nvext
+        make_stream_delta(Some("!"), None),
+    ];
+    let stream = futures::stream::iter(deltas);
+    let result =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, ParsingOptions::default())
+            .await
+            .unwrap();
+    assert_eq!(result.nvext, Some(nvext_value)); // nvext must not be dropped by the surrounding `None` deltas
+    assert_eq!(
+        get_text(
+            result
+                .inner
+                .choices
+                .first()
+                .unwrap()
+                .message
+                .content
+                .as_ref()
+                .unwrap()
+        ),
+        "Hello world!"
+    );
+}
+/// Verify that the last non-None nvext wins when multiple deltas carry nvext: the first and third deltas carry different values with a `None` delta between them, and the result must hold the third delta's value.
+#[tokio::test]
+async fn test_nvext_last_value_wins() {
+    let first_nvext = serde_json::json!({"version": 1});
+    let last_nvext = serde_json::json!({"version": 2});
+    let deltas = vec![
+        make_stream_delta(Some("a"), Some(first_nvext)),
+        make_stream_delta(Some("b"), None), // a `None` in between must not reset the aggregated value
+        make_stream_delta(Some("c"), Some(last_nvext.clone())),
+    ];
+    let stream = futures::stream::iter(deltas);
+    let result =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, ParsingOptions::default())
+            .await
+            .unwrap();
+    assert_eq!(result.nvext, Some(last_nvext));
+}
+/// Verify that the aggregated response's nvext is `None` when the only delta in the stream carries no nvext.
+#[tokio::test]
+async fn test_nvext_none_when_absent() {
+    let deltas = vec![make_stream_delta(Some("hello"), None)]; // single-chunk stream with no nvext
+    let stream = futures::stream::iter(deltas);
+    let result =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, ParsingOptions::default())
+            .await
+            .unwrap();
+    assert_eq!(result.nvext, None);
+}
--- a/lib/llm/tests/logprob_analysis_integration.rs
+++ b/lib/llm/tests/logprob_analysis_integration.rs
@@ -397,14 +397,16 @@ fn create_response_with_linear_probs(
    };
    NvCreateChatCompletionStreamResponse {
-        id: "test_id".to_string(),
+        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-        choices: vec![choice],
+            id: "test_id".to_string(),
-        created: 1234567890,
+            choices: vec![choice],
-        model: "test-model".to_string(),
+            created: 1234567890,
-        service_tier: None,
+            model: "test-model".to_string(),
-        system_fingerprint: None,
+            service_tier: None,
-        object: "chat.completion.chunk".to_string(),
+            system_fingerprint: None,
-        usage: None,
+            object: "chat.completion.chunk".to_string(),
+            usage: None,
+        },
        nvext: None,
    }
 }
@@ -479,14 +481,16 @@ fn create_multi_choice_response(
        .collect();
    NvCreateChatCompletionStreamResponse {
-        id: "test_id".to_string(),
+        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-        choices,
+            id: "test_id".to_string(),
-        created: 1234567890,
+            choices,
-        model: "test-model".to_string(),
+            created: 1234567890,
-        service_tier: None,
+            model: "test-model".to_string(),
-        system_fingerprint: None,
+            service_tier: None,
-        object: "chat.completion.chunk".to_string(),
+            system_fingerprint: None,
-        usage: None,
+            object: "chat.completion.chunk".to_string(),
+            usage: None,
+        },
        nvext: None,
    }
 }
--- a/lib/llm/tests/postprocessor_parsing_stream.rs
+++ b/lib/llm/tests/postprocessor_parsing_stream.rs
@@ -192,7 +192,7 @@ async fn postprocessor_parsing_stream_replays_interval_20_fixture() {
            continue;
        };
-        for choice in &output_data.choices {
+        for choice in &output_data.inner.choices {
            if let Some(reasoning_content) = &choice.delta.reasoning_content {
                reasoning.push_str(reasoning_content);
            }

--- a/lib/llm/tests/test_jail.rs
+++ b/lib/llm/tests/test_jail.rs
--- a/lib/llm/tests/test_reasoning_parser.rs
+++ b/lib/llm/tests/test_reasoning_parser.rs
@@ -39,14 +39,16 @@ fn create_mock_response_chunk(
    };
    let response = NvCreateChatCompletionStreamResponse {
-        id: "test-id".to_string(),
+        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-        choices: vec![choice],
+            id: "test-id".to_string(),
-        created: 1234567890,
+            choices: vec![choice],
-        model: "test-model".to_string(),
+            created: 1234567890,
-        system_fingerprint: Some("test-fingerprint".to_string()),
+            model: "test-model".to_string(),
-        object: "chat.completion.chunk".to_string(),
+            system_fingerprint: Some("test-fingerprint".to_string()),
-        usage: None,
+            object: "chat.completion.chunk".to_string(),
-        service_tier: None,
+            usage: None,
+            service_tier: None,
+        },
        nvext: None,
    };
@@ -125,7 +127,7 @@ mod tests {
        let mut all_content = String::new();
        while let Some(item) = output_stream.next().await {
            if let Some(ref data) = item.data {
-                for choice in &data.choices {
+                for choice in &data.inner.choices {
                    if let Some(ref r) = choice.delta.reasoning_content {
                        all_reasoning.push_str(r);
                    }
@@ -177,15 +179,15 @@ mod tests {
        assert_eq!(output_chunks.len(), 3);
        // Chunk 0: "<think>This"
-        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_0, None, Some("This"));
        // Chunk 1: " is reasoning content"
-        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().choices[0];
+        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_1, None, Some(" is reasoning content"));
        // Chunk 2: "</think> Here's my answer."
-        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().choices[0];
+        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_2, Some(" Here's my answer."), None);
    }
@@ -223,15 +225,15 @@ mod tests {
        assert_eq!(output_chunks.len(), 3);
        // Chunk 0: "<think>Only"
-        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_0, None, Some("Only"));
        // Chunk 1: " reasoning"
-        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().choices[0];
+        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_1, None, Some(" reasoning"));
        // Chunk 2: " here</think>"
-        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().choices[0];
+        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_2, None, Some(" here"));
    }
@@ -266,7 +268,7 @@ mod tests {
        // Verify that only normal content is present
        assert_eq!(output_chunks.len(), 1);
-        let output_choice = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];
        assert_choice(
            output_choice,
            Some("Just normal text without reasoning tags."),
@@ -304,8 +306,8 @@ mod tests {
        assert_eq!(output_chunks.len(), input_chunks.len());
        for (input, output) in input_chunks.iter().zip(output_chunks.iter()) {
-            let input_choice = &input.data.as_ref().unwrap().choices[0];
+            let input_choice = &input.data.as_ref().unwrap().inner.choices[0];
-            let output_choice = &output.data.as_ref().unwrap().choices[0];
+            let output_choice = &output.data.as_ref().unwrap().inner.choices[0];
            assert_choice(
                output_choice,
                input_choice.delta.content.as_ref().map(get_text),
@@ -345,7 +347,7 @@ mod tests {
        // Verify that Mistral-style reasoning is parsed correctly
        assert_eq!(output_chunks.len(), 1);
-        let output_choice = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];
        assert!(
            output_choice.delta.reasoning_content.is_some(),
@@ -422,7 +424,7 @@ mod tests {
        for chunk in output_chunks.iter() {
            if let Some(ref response_data) = chunk.data {
-                for choice in &response_data.choices {
+                for choice in &response_data.inner.choices {
                    // Collect reasoning content
                    if let Some(ref reasoning) = choice.delta.reasoning_content {
                        all_reasoning.push_str(reasoning);
@@ -574,7 +576,7 @@ mod tests {
        for chunk in output_chunks.iter() {
            if let Some(ref response_data) = chunk.data {
-                for choice in &response_data.choices {
+                for choice in &response_data.inner.choices {
                    // Collect reasoning content
                    if let Some(ref reasoning) = choice.delta.reasoning_content {
                        all_reasoning.push_str(reasoning);
@@ -685,7 +687,7 @@ mod tests {
        for chunk in output_chunks.iter() {
            if let Some(ref data) = chunk.data {
-                for choice in &data.choices {
+                for choice in &data.inner.choices {
                    if let Some(ref r) = choice.delta.reasoning_content {
                        all_reasoning.push_str(r);
                    }
@@ -782,7 +784,7 @@ mod tests {
        for chunk in output_chunks.iter() {
            if let Some(ref response_data) = chunk.data {
-                for choice in &response_data.choices {
+                for choice in &response_data.inner.choices {
                    if let Some(ref reasoning) = choice.delta.reasoning_content {
                        all_reasoning.push_str(reasoning);
                    }

--- a/lib/llm/tests/test_streaming_tool_parsers.rs
+++ b/lib/llm/tests/test_streaming_tool_parsers.rs
@@ -107,14 +107,16 @@ fn load_test_data(file_path: &str) -> TestData {
            .expect("Failed to parse choices");
            let response = NvCreateChatCompletionStreamResponse {
-                id: id.clone(),
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-                choices,
+                    id: id.clone(),
-                created: 1234567890,
+                    choices,
-                model: "test-model".to_string(),
+                    created: 1234567890,
-                system_fingerprint: None,
+                    model: "test-model".to_string(),
-                object: "chat.completion.chunk".to_string(),
+                    system_fingerprint: None,
-                usage: None,
+                    object: "chat.completion.chunk".to_string(),
-                service_tier: None,
+                    usage: None,
+                    service_tier: None,
+                },
                nvext: None,
            };
@@ -231,7 +233,7 @@ fn aggregate_content_from_chunks(
    for chunk in chunks.iter() {
        if let Some(ref response_data) = chunk.data {
-            for choice in &response_data.choices {
+            for choice in &response_data.inner.choices {
                // Collect reasoning content
                if let Some(ref reasoning) = choice.delta.reasoning_content {
                    reasoning_content.push_str(reasoning);
@@ -279,7 +281,7 @@ fn validate_finish_reason(
    // Count finish_reason occurrences and track position
    for (idx, chunk) in chunks.iter().enumerate() {
        if let Some(ref response_data) = chunk.data {
-            for choice in &response_data.choices {
+            for choice in &response_data.inner.choices {
                if let Some(reason) = choice.finish_reason {
                    finish_reason_count += 1;
                    last_chunk_index = Some(idx);

--- a/lib/llm/tests/test_streaming_usage.rs
+++ b/lib/llm/tests/test_streaming_usage.rs
@@ -241,12 +241,12 @@ async fn test_streaming_without_usage() {
    for (i, chunk) in content_chunks.iter().enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_none(),
+                response.inner.usage.is_none(),
                "Chunk {} should have usage: None when stream_options not set",
                i
            );
            assert!(
-                !response.choices.is_empty(),
+                !response.inner.choices.is_empty(),
                "Chunk {} should have choices",
                i
            );
@@ -286,12 +286,12 @@ async fn test_streaming_with_usage_compliance() {
    for (i, chunk) in chunks.iter().take(3).enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_none(),
+                response.inner.usage.is_none(),
                "Content chunk {} should have usage: None",
                i
            );
            assert!(
-                !response.choices.is_empty(),
+                !response.inner.choices.is_empty(),
                "Content chunk {} should have choices",
                i
            );
@@ -301,15 +301,15 @@ async fn test_streaming_with_usage_compliance() {
    // Verify the final chunk is the usage-only chunk
    if let Some(final_response) = &chunks[3].data {
        assert!(
-            final_response.choices.is_empty(),
+            final_response.inner.choices.is_empty(),
            "Final usage chunk should have empty choices array"
        );
        assert!(
-            final_response.usage.is_some(),
+            final_response.inner.usage.is_some(),
            "Final usage chunk should have usage statistics"
        );
-        let usage = final_response.usage.as_ref().unwrap();
+        let usage = final_response.inner.usage.as_ref().unwrap();
        assert_eq!(
            usage.completion_tokens, 3,
            "Should have 3 completion tokens"
@@ -359,18 +359,18 @@ async fn test_streaming_with_continuous_usage() {
    for (i, chunk) in chunks.iter().take(3).enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_some(),
+                response.inner.usage.is_some(),
                "Content chunk {} should have usage: Some",
                i
            );
            assert!(
-                !response.choices.is_empty(),
+                !response.inner.choices.is_empty(),
                "Content chunk {} should have choices",
                i
            );
            // Verify usage counts are properly accumulated for each chunk
-            let usage = response.usage.as_ref().unwrap();
+            let usage = response.inner.usage.as_ref().unwrap();
            assert_eq!(
                usage.completion_tokens,
                i as u32 + 1,
@@ -392,15 +392,15 @@ async fn test_streaming_with_continuous_usage() {
    // Verify the final chunk is the usage-only chunk
    if let Some(final_response) = &chunks[3].data {
        assert!(
-            final_response.choices.is_empty(),
+            final_response.inner.choices.is_empty(),
            "Final usage chunk should have empty choices array"
        );
        assert!(
-            final_response.usage.is_some(),
+            final_response.inner.usage.is_some(),
            "Final usage chunk should have usage statistics"
        );
-        let usage = final_response.usage.as_ref().unwrap();
+        let usage = final_response.inner.usage.as_ref().unwrap();
        assert_eq!(
            usage.completion_tokens, 3,
            "Should have 3 completion tokens"
@@ -464,7 +464,7 @@ async fn test_streaming_with_usage_false() {
    for (i, chunk) in content_chunks.iter().enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_none(),
+                response.inner.usage.is_none(),
                "Chunk {} should have usage: None when include_usage is false",
                i
            );
@@ -560,7 +560,7 @@ async fn test_nonstreaming_has_usage_field() {
    // Aggregate the streaming chunks into a single non-streaming response
    // This simulates what the HTTP service does for non-streaming requests
-    let result = dynamo_async_openai::types::CreateChatCompletionResponse::from_annotated_stream(
+    let result = dynamo_llm::protocols::openai::chat_completions::NvCreateChatCompletionResponse::from_annotated_stream(
        transformed_stream,
        ParsingOptions::default(),
    )
@@ -570,12 +570,12 @@ async fn test_nonstreaming_has_usage_field() {
    let response = result.unwrap();
    assert!(
-        response.usage.is_some(),
+        response.inner.usage.is_some(),
        "Non-streaming chat completion response MUST have a usage field populated. \
         This is required for OpenAI API compliance."
    );
-    let usage = response.usage.unwrap();
+    let usage = response.inner.usage.unwrap();
    // Verify usage contains valid token counts
    // In our mock, we generated 3 tokens (from the 3 backend outputs)
@@ -725,7 +725,11 @@ async fn test_chat_streaming_with_cached_tokens_propagation() {
    assert_eq!(chunks.len(), 4, "Should have 3 content + 1 usage chunk");
    if let Some(final_resp) = &chunks[3].data {
-        let usage = final_resp.usage.as_ref().expect("Usage must be present");
+        let usage = final_resp
+            .inner
+            .usage
+            .as_ref()
+            .expect("Usage must be present");
        let cached = usage
            .prompt_tokens_details
            .as_ref()

--- a/lib/llm/tests/tool_choice.rs
+++ b/lib/llm/tests/tool_choice.rs
@@ -157,7 +157,7 @@ async fn test_named_tool_choice_parses_json() {
        .expect("choice generation");
    let response = apply_jail_transformation(raw_response, tool_choice).await;
-    let choice = &response.choices[0];
+    let choice = &response.inner.choices[0];
    assert_eq!(
        choice.finish_reason,
@@ -199,7 +199,7 @@ async fn test_required_tool_choice_parses_json_array() {
        .expect("choice generation");
    let response = apply_jail_transformation(raw_response, tool_choice).await;
-    let choice = &response.choices[0];
+    let choice = &response.inner.choices[0];
    assert_eq!(
        choice.finish_reason,
@@ -259,7 +259,7 @@ async fn test_tool_choice_parse_failure_returns_as_content() {
        .expect("choice generation");
    let response = apply_jail_transformation(raw_response, tool_choice).await;
-    let delta = &response.choices[0].delta;
+    let delta = &response.inner.choices[0].delta;
    // Jail stream behavior: if parsing fails, return accumulated content as-is
    // This matches marker-based FC behavior
@@ -317,11 +317,11 @@ async fn test_streaming_named_tool_buffers_until_finish() {
    let response = &all_responses[0];
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Stop)
    );
-    let tool_calls = response.choices[0].delta.tool_calls.as_ref().unwrap();
+    let tool_calls = response.inner.choices[0].delta.tool_calls.as_ref().unwrap();
    assert_eq!(tool_calls.len(), 1);
    assert_eq!(
        tool_calls[0].function.as_ref().unwrap().name.as_deref(),
@@ -384,11 +384,11 @@ async fn test_streaming_required_tool_parallel() {
    let response = &all_responses[0];
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ToolCalls)
    );
-    let tool_calls = response.choices[0].delta.tool_calls.as_ref().unwrap();
+    let tool_calls = response.inner.choices[0].delta.tool_calls.as_ref().unwrap();
    assert_eq!(tool_calls.len(), 2);
    assert_eq!(
@@ -445,8 +445,12 @@ fn test_no_tool_choice_outputs_normal_text() {
        .expect("normal text");
    assert_eq!(
-        response.choices[0].delta.content.as_ref().map(get_text),
+        response.inner.choices[0]
+            .delta
+            .content
+            .as_ref()
+            .map(get_text),
        Some("Hello world")
    );
-    assert!(response.choices[0].delta.tool_calls.is_none());
+    assert!(response.inner.choices[0].delta.tool_calls.is_none());
 }
--- a/lib/llm/tests/tool_choice_finish_reasons.rs
+++ b/lib/llm/tests/tool_choice_finish_reasons.rs
@@ -116,7 +116,7 @@ async fn test_named_tool_choice_preserves_length_finish_reason() {
    // Critical: Length finish reason should be preserved, NOT replaced with Stop
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Length),
        "Length finish reason must be preserved for tool_choice=named"
    );
@@ -139,7 +139,7 @@ fn test_required_tool_choice_preserves_length_finish_reason() {
    // Critical: Length finish reason should be preserved, NOT replaced with ToolCalls
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Length),
        "Length finish reason must be preserved for tool_choice=required"
    );
@@ -169,7 +169,7 @@ fn test_named_tool_choice_preserves_content_filter() {
    // Critical: ContentFilter finish reason should be preserved
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ContentFilter),
        "ContentFilter finish reason must be preserved for tool_choice=named"
    );
@@ -192,7 +192,7 @@ fn test_required_tool_choice_preserves_content_filter() {
    // Critical: ContentFilter finish reason should be preserved
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ContentFilter),
        "ContentFilter finish reason must be preserved for tool_choice=required"
    );
@@ -222,7 +222,7 @@ fn test_named_tool_choice_normal_stop_becomes_stop() {
    // Normal completion: Stop should remain Stop for named tool choice
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Stop),
    );
 }
@@ -247,7 +247,7 @@ async fn test_required_tool_choice_normal_stop_becomes_tool_calls() {
    // Normal completion: Stop should become ToolCalls for required tool choice
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ToolCalls),
    );
 }