refactor(1/3): move `nvext` to `dynamo-llm` and move `anthropic` to `dynamo-async-openai` (#7564)

2887cd1c · ishandhanani · GitHub · d6136f4a · 2887cd1c · 2887cd1c
Unverified commit 2887cd1c · authored Mar 30, 2026 by ishandhanani · committed by GitHub on Mar 30, 2026
12 changed files
--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -214,50 +214,9 @@ pub struct AgentHints {
    pub latency_sensitivity: Option<f64>,
 }

-/// Anthropic-style cache control hint for prefix pinning with TTL.
-#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
-pub struct CacheControl {
-    #[serde(rename = "type")]
-    pub control_type: CacheControlType,
-    /// TTL as seconds (integer) or shorthand ("5m" = 300s, "1h" = 3600s). Clamped to [300, 3600].
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub ttl: Option<String>,
-}
-
-#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
-#[serde(rename_all = "lowercase")]
-pub enum CacheControlType {
-    #[default]
-    Ephemeral,
-    #[serde(other)]
-    Unknown,
-}
-
-const MIN_TTL_SECONDS: u64 = 300;
-const MAX_TTL_SECONDS: u64 = 3600;
-
-impl CacheControl {
-    /// Parse TTL string to seconds, clamped to [300, 3600].
-    ///
-    /// Accepts integer seconds ("120", "600") or shorthand ("5m", "1h").
-    /// Values below 300 are clamped to 300; values above 3600 are clamped to 3600.
-    /// Unrecognized strings default to 300s.
-    pub fn ttl_seconds(&self) -> u64 {
-        let raw = match self.ttl.as_deref() {
-            None => return MIN_TTL_SECONDS,
-            Some("5m") => 300,
-            Some("1h") => 3600,
-            Some(other) => match other.parse::<u64>() {
-                Ok(secs) => secs,
-                Err(_) => {
-                    tracing::warn!("Unrecognized TTL '{}', defaulting to 300s", other);
-                    return MIN_TTL_SECONDS;
-                }
-            },
-        };
-        raw.clamp(MIN_TTL_SECONDS, MAX_TTL_SECONDS)
-    }
-}
+// Re-export CacheControl types from dynamo-async-openai where they are canonically defined
+// alongside the Anthropic protocol types they originate from.
+pub use dynamo_async_openai::types::anthropic::{CacheControl, CacheControlType};

 impl Default for NvExt {
    fn default() -> Self {

--- a/lib/llm/src/protocols/openai/responses/mod.rs
+++ b/lib/llm/src/protocols/openai/responses/mod.rs
@@ -696,8 +696,8 @@ pub fn chat_completion_to_response(
    nv_resp: NvCreateChatCompletionResponse,
    params: &ResponseParams,
 ) -> Result<NvResponse, anyhow::Error> {
-    let chat_resp = nv_resp;
-    let nvext = chat_resp.nvext.clone();
+    let nvext = nv_resp.nvext.clone();
+    let chat_resp = nv_resp.inner;
    let message_id = format!("msg_{}", Uuid::new_v4().simple());
    let response_id = format!("resp_{}", Uuid::new_v4().simple());

@@ -1163,6 +1163,7 @@ mod tests {
    fn test_into_nvresponse_from_chat_response() {
        let now = 1_726_000_000;
        let chat_resp = NvCreateChatCompletionResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
                id: "chatcmpl-xyz".into(),
                choices: vec![dynamo_async_openai::types::ChatChoice {
                    index: 0,
@@ -1189,6 +1190,7 @@ mod tests {
                system_fingerprint: None,
                object: "chat.completion".to_string(),
                usage: None,
+            },
            nvext: None,
        };

@@ -1218,6 +1220,7 @@ mod tests {
    fn test_response_with_tool_calls() {
        let now = 1_726_000_000;
        let chat_resp = NvCreateChatCompletionResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
                id: "chatcmpl-xyz".into(),
                choices: vec![dynamo_async_openai::types::ChatChoice {
                    index: 0,
@@ -1247,6 +1250,7 @@ mod tests {
                system_fingerprint: None,
                object: "chat.completion".to_string(),
                usage: None,
+            },
            nvext: None,
        };

@@ -1432,6 +1436,7 @@ thinking
        };

        let chat_resp = NvCreateChatCompletionResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
                choices: vec![],
                created: 0,
                id: "test".into(),
@@ -1440,6 +1445,7 @@ thinking
                system_fingerprint: None,
                object: "chat.completion".into(),
                usage: None,
+            },
            nvext: None,
        };

@@ -1463,6 +1469,7 @@ thinking
        };

        let chat_resp = NvCreateChatCompletionResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
                choices: vec![],
                created: 0,
                id: "test".into(),
@@ -1471,6 +1478,7 @@ thinking
                system_fingerprint: None,
                object: "chat.completion".into(),
                usage: None,
+            },
            nvext: None,
        };

@@ -1489,6 +1497,7 @@ thinking
        };

        let chat_resp = NvCreateChatCompletionResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
                choices: vec![],
                created: 0,
                id: "test".into(),
@@ -1497,6 +1506,7 @@ thinking
                system_fingerprint: None,
                object: "chat.completion".into(),
                usage: None,
+            },
            nvext: None,
        };

@@ -1555,6 +1565,7 @@ thinking
            ChatChoice, ChatCompletionMessageContent, ChatCompletionResponseMessage, FinishReason,
        };
        NvCreateChatCompletionResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
                choices: vec![ChatChoice {
                    index: 0,
                    #[allow(deprecated)]
@@ -1578,6 +1589,7 @@ thinking
                system_fingerprint: None,
                object: "chat.completion".into(),
                usage: None,
+            },
            nvext: None,
        }
    }

--- a/lib/llm/src/protocols/openai/responses/stream_converter.rs
+++ b/lib/llm/src/protocols/openai/responses/stream_converter.rs
@@ -183,7 +183,7 @@ impl ResponseStreamConverter {
        let mut events = Vec::new();

        // Capture usage stats from the final chunk (sent when stream_options.include_usage=true)
-        if let Some(ref u) = chunk.usage {
+        if let Some(ref u) = chunk.inner.usage {
            self.usage = Some(ResponseUsage {
                input_tokens: u.prompt_tokens,
                input_tokens_details: InputTokenDetails {
@@ -205,7 +205,7 @@ impl ResponseStreamConverter {
            });
        }

-        for choice in &chunk.choices {
+        for choice in &chunk.inner.choices {
            let delta = &choice.delta;

            // Handle text content deltas — extract text from the enum
@@ -685,6 +685,7 @@ mod tests {
    ) -> NvCreateChatCompletionStreamResponse {
        #[allow(deprecated)]
        NvCreateChatCompletionStreamResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                id: "chat-1".into(),
                choices: vec![ChatChoiceStream {
                    index: 0,
@@ -714,6 +715,7 @@ mod tests {
                system_fingerprint: None,
                object: "chat.completion.chunk".into(),
                usage: None,
+            },
            nvext: None,
        }
    }
@@ -721,6 +723,7 @@ mod tests {
    fn text_chunk(text: &str) -> NvCreateChatCompletionStreamResponse {
        #[allow(deprecated)]
        NvCreateChatCompletionStreamResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                id: "chat-1".into(),
                choices: vec![ChatChoiceStream {
                    index: 0,
@@ -742,6 +745,7 @@ mod tests {
                system_fingerprint: None,
                object: "chat.completion.chunk".into(),
                usage: None,
+            },
            nvext: None,
        }
    }

--- a/lib/llm/tests/aggregators.rs
+++ b/lib/llm/tests/aggregators.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-use dynamo_async_openai::types::ChatCompletionMessageContent;
+use dynamo_async_openai::types::{
+    ChatChoiceStream, ChatCompletionMessageContent, ChatCompletionStreamResponseDelta,
+    CreateChatCompletionStreamResponse, Role,
+};
 use dynamo_llm::protocols::{
-    ContentProvider, DataStream,
+    Annotated, ContentProvider, DataStream,
    codec::{Message, SseCodecError, create_message_stream},
    openai::{
        ParsingOptions,
-        chat_completions::{NvCreateChatCompletionResponse, aggregator::ChatCompletionAggregator},
+        chat_completions::{
+            NvCreateChatCompletionResponse, NvCreateChatCompletionStreamResponse,
+            aggregator::ChatCompletionAggregator,
+        },
        completions::NvCreateCompletionResponse,
    },
 };
@@ -45,6 +51,7 @@ async fn test_openai_chat_stream() {
    assert_eq!(
        get_text(
            result
+                .inner
                .choices
                .first()
                .unwrap()
@@ -70,6 +77,7 @@ async fn test_openai_chat_edge_case_multi_line_data() {
    assert_eq!(
        get_text(
            result
+                .inner
                .choices
                .first()
                .unwrap()
@@ -95,6 +103,7 @@ async fn test_openai_chat_edge_case_comments_per_response() {
    assert_eq!(
        get_text(
            result
+                .inner
                .choices
                .first()
                .unwrap()
@@ -138,3 +147,113 @@ async fn test_openai_cmpl_stream() {
        " This is a question that is often asked by those outside of AI research and development"
    );
 }
+
+// ===================================
+// nvext aggregation regression tests
+// ===================================
+
+#[allow(deprecated)]
+fn make_stream_delta(
+    content: Option<&str>,
+    nvext: Option<serde_json::Value>,
+) -> Annotated<NvCreateChatCompletionStreamResponse> {
+    Annotated::from_data(NvCreateChatCompletionStreamResponse {
+        inner: CreateChatCompletionStreamResponse {
+            id: "test-id".to_string(),
+            choices: if let Some(text) = content {
+                vec![ChatChoiceStream {
+                    index: 0,
+                    delta: ChatCompletionStreamResponseDelta {
+                        content: Some(ChatCompletionMessageContent::Text(text.to_string())),
+                        function_call: None,
+                        tool_calls: None,
+                        role: Some(Role::Assistant),
+                        refusal: None,
+                        reasoning_content: None,
+                    },
+                    finish_reason: None,
+                    stop_reason: None,
+                    logprobs: None,
+                }]
+            } else {
+                vec![]
+            },
+            created: 1234567890,
+            model: "test-model".to_string(),
+            service_tier: None,
+            system_fingerprint: None,
+            object: "chat.completion.chunk".to_string(),
+            usage: None,
+        },
+        nvext,
+    })
+}
+
+/// Verify that nvext set on a stream delta survives aggregation into the final response.
+#[tokio::test]
+async fn test_nvext_passthrough_aggregation() {
+    let nvext_value = serde_json::json!({"custom_field": "test_value"});
+
+    let deltas = vec![
+        make_stream_delta(Some("Hello"), None),
+        make_stream_delta(Some(" world"), Some(nvext_value.clone())),
+        make_stream_delta(Some("!"), None),
+    ];
+
+    let stream = futures::stream::iter(deltas);
+    let result =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, ParsingOptions::default())
+            .await
+            .unwrap();
+
+    assert_eq!(result.nvext, Some(nvext_value));
+    assert_eq!(
+        get_text(
+            result
+                .inner
+                .choices
+                .first()
+                .unwrap()
+                .message
+                .content
+                .as_ref()
+                .unwrap()
+        ),
+        "Hello world!"
+    );
+}
+
+/// Verify that the last non-None nvext wins when multiple deltas carry nvext.
+#[tokio::test]
+async fn test_nvext_last_value_wins() {
+    let first_nvext = serde_json::json!({"version": 1});
+    let last_nvext = serde_json::json!({"version": 2});
+
+    let deltas = vec![
+        make_stream_delta(Some("a"), Some(first_nvext)),
+        make_stream_delta(Some("b"), None),
+        make_stream_delta(Some("c"), Some(last_nvext.clone())),
+    ];
+
+    let stream = futures::stream::iter(deltas);
+    let result =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, ParsingOptions::default())
+            .await
+            .unwrap();
+
+    assert_eq!(result.nvext, Some(last_nvext));
+}
+
+/// Verify that nvext remains None when no delta carries it.
+#[tokio::test]
+async fn test_nvext_none_when_absent() {
+    let deltas = vec![make_stream_delta(Some("hello"), None)];
+
+    let stream = futures::stream::iter(deltas);
+    let result =
+        NvCreateChatCompletionResponse::from_annotated_stream(stream, ParsingOptions::default())
+            .await
+            .unwrap();
+
+    assert_eq!(result.nvext, None);
+}
--- a/lib/llm/tests/logprob_analysis_integration.rs
+++ b/lib/llm/tests/logprob_analysis_integration.rs
@@ -397,6 +397,7 @@ fn create_response_with_linear_probs(
    };

    NvCreateChatCompletionStreamResponse {
+        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            choices: vec![choice],
            created: 1234567890,
@@ -405,6 +406,7 @@ fn create_response_with_linear_probs(
            system_fingerprint: None,
            object: "chat.completion.chunk".to_string(),
            usage: None,
+        },
        nvext: None,
    }
 }
@@ -479,6 +481,7 @@ fn create_multi_choice_response(
        .collect();

    NvCreateChatCompletionStreamResponse {
+        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            choices,
            created: 1234567890,
@@ -487,6 +490,7 @@ fn create_multi_choice_response(
            system_fingerprint: None,
            object: "chat.completion.chunk".to_string(),
            usage: None,
+        },
        nvext: None,
    }
 }
--- a/lib/llm/tests/postprocessor_parsing_stream.rs
+++ b/lib/llm/tests/postprocessor_parsing_stream.rs
@@ -192,7 +192,7 @@ async fn postprocessor_parsing_stream_replays_interval_20_fixture() {
            continue;
        };

-        for choice in &output_data.choices {
+        for choice in &output_data.inner.choices {
            if let Some(reasoning_content) = &choice.delta.reasoning_content {
                reasoning.push_str(reasoning_content);
            }

--- a/lib/llm/tests/test_jail.rs
+++ b/lib/llm/tests/test_jail.rs
@@ -48,6 +48,7 @@ mod tests {
            };

            let response = NvCreateChatCompletionStreamResponse {
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                    id: "test-id".to_string(),
                    choices: vec![choice],
                    created: 1234567890,
@@ -56,6 +57,7 @@ mod tests {
                    object: "chat.completion.chunk".to_string(),
                    usage: None,
                    service_tier: None,
+                },
                nvext: None,
            };

@@ -89,6 +91,7 @@ mod tests {
            };

            let response = NvCreateChatCompletionStreamResponse {
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                    id: "test-id".to_string(),
                    choices: vec![choice],
                    created: 1234567890,
@@ -97,6 +100,7 @@ mod tests {
                    object: "chat.completion.chunk".to_string(),
                    usage: None,
                    service_tier: None,
+                },
                nvext: None,
            };

@@ -134,6 +138,7 @@ mod tests {
            };

            let response = NvCreateChatCompletionStreamResponse {
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                    id: "test-id".to_string(),
                    choices: vec![choice],
                    created: 1234567890,
@@ -142,6 +147,7 @@ mod tests {
                    object: "chat.completion.chunk".to_string(),
                    usage: None,
                    service_tier: None,
+                },
                nvext: None,
            };

@@ -180,6 +186,7 @@ mod tests {
                .collect();

            let response = NvCreateChatCompletionStreamResponse {
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                    id: "test-id".to_string(),
                    choices,
                    created: 1234567890,
@@ -188,6 +195,7 @@ mod tests {
                    object: "chat.completion.chunk".to_string(),
                    usage: None,
                    service_tier: None,
+                },
                nvext: None,
            };

@@ -226,6 +234,7 @@ mod tests {
                .collect();

            let response = NvCreateChatCompletionStreamResponse {
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                    id: "test-id".to_string(),
                    choices,
                    created: 1234567890,
@@ -234,6 +243,7 @@ mod tests {
                    object: "chat.completion.chunk".to_string(),
                    usage: None,
                    service_tier: None,
+                },
                nvext: None,
            };

@@ -254,7 +264,7 @@ mod tests {
            let content = result
                .data
                .as_ref()
-                .and_then(|d| d.choices.first())
+                .and_then(|d| d.inner.choices.first())
                .and_then(|c| c.delta.content.as_ref())
                .expect("Expected content in result");

@@ -276,7 +286,7 @@ mod tests {
            let tool_calls = result
                .data
                .as_ref()
-                .and_then(|d| d.choices.first())
+                .and_then(|d| d.inner.choices.first())
                .and_then(|c| c.delta.tool_calls.as_ref())
                .expect("Expected tool calls in result");

@@ -313,7 +323,7 @@ mod tests {
        #[allow(dead_code)]
        pub fn assert_empty_emission(result: &Annotated<NvCreateChatCompletionStreamResponse>) {
            if let Some(data) = &result.data
-                && let Some(choice) = data.choices.first()
+                && let Some(choice) = data.inner.choices.first()
            {
                assert!(
                    choice.delta.content.is_none()
@@ -343,7 +353,7 @@ mod tests {
                .filter_map(|r| {
                    r.data
                        .as_ref()
-                        .and_then(|d| d.choices.first())
+                        .and_then(|d| d.inner.choices.first())
                        .and_then(|c| c.delta.content.as_ref())
                })
                .map(extract_text)
@@ -356,7 +366,7 @@ mod tests {
            result
                .data
                .as_ref()
-                .and_then(|d| d.choices.first())
+                .and_then(|d| d.inner.choices.first())
                .and_then(|c| c.delta.content.as_ref())
                .and_then(|content| match content {
                    ChatCompletionMessageContent::Text(text) => Some(text.clone()),
@@ -370,7 +380,7 @@ mod tests {
            result
                .data
                .as_ref()
-                .and_then(|d| d.choices.first())
+                .and_then(|d| d.inner.choices.first())
                .and_then(|c| c.delta.tool_calls.as_ref())
                .map(|tc| !tc.is_empty())
                .unwrap_or(false)
@@ -382,7 +392,7 @@ mod tests {
            result
                .data
                .as_ref()
-                .and_then(|d| d.choices.first())
+                .and_then(|d| d.inner.choices.first())
                .and_then(|c| c.delta.content.as_ref())
                .map(|content| !extract_text(content).is_empty())
                .unwrap_or(false)
@@ -422,7 +432,7 @@ mod tests {

        // First chunk should pass through
        assert_eq!(
-            results[0].data.as_ref().unwrap().choices[0]
+            results[0].data.as_ref().unwrap().inner.choices[0]
                .delta
                .content
                .as_ref()
@@ -431,7 +441,9 @@ mod tests {
        );

        // When jail ends, accumulated content should be released
-        let unjailed_content = &results[1].data.as_ref().unwrap().choices[0].delta.content;
+        let unjailed_content = &results[1].data.as_ref().unwrap().inner.choices[0]
+            .delta
+            .content;
        assert!(unjailed_content.is_some());
        assert!(
            extract_text(unjailed_content.as_ref().unwrap())
@@ -440,7 +452,7 @@ mod tests {

        // Last chunk should pass through normally
        assert_eq!(
-            results[2].data.as_ref().unwrap().choices[0]
+            results[2].data.as_ref().unwrap().inner.choices[0]
                .delta
                .content
                .as_ref()
@@ -476,7 +488,7 @@ mod tests {
        // Check if tool calls were parsed
        if let Some(last_result) = results.last()
            && let Some(ref response_data) = last_result.data
-            && let Some(ref tool_calls) = response_data.choices[0].delta.tool_calls
+            && let Some(ref tool_calls) = response_data.inner.choices[0].delta.tool_calls
        {
            assert!(!tool_calls.as_slice().is_empty());
            assert_eq!(
@@ -514,7 +526,7 @@ mod tests {

        // First chunk should pass through
        assert_eq!(
-            results[0].data.as_ref().unwrap().choices[0]
+            results[0].data.as_ref().unwrap().inner.choices[0]
                .delta
                .content
                .as_ref()
@@ -523,7 +535,7 @@ mod tests {
        );

        // Second chunk should contain the accumulated jailed content
-        let jailed = results[1].data.as_ref().unwrap().choices[0]
+        let jailed = results[1].data.as_ref().unwrap().inner.choices[0]
            .delta
            .content
            .as_ref()
@@ -1226,7 +1238,7 @@ mod tests {
            .find(|r| {
                r.data
                    .as_ref()
-                    .and_then(|d| d.choices.first())
+                    .and_then(|d| d.inner.choices.first())
                    .map(|c| c.delta.tool_calls.is_some())
                    .unwrap_or(false)
            })
@@ -1247,7 +1259,7 @@ mod tests {
        );

        // Verify tool call was parsed correctly
-        let tool_calls = &tool_call_chunk.data.as_ref().unwrap().choices[0]
+        let tool_calls = &tool_call_chunk.data.as_ref().unwrap().inner.choices[0]
            .delta
            .tool_calls;
        assert!(tool_calls.is_some(), "Should have tool calls");
@@ -1318,20 +1330,20 @@ mod tests {
        // Verify inner response metadata carries forward real stream values (not placeholders)
        let inner = accumulated_chunk.data.as_ref().unwrap();
        assert_eq!(
-            inner.id, "test-id",
+            inner.inner.id, "test-id",
            "Inner response id should carry forward from real stream chunks, not be 'stream-end'"
        );
        assert_eq!(
-            inner.model, "test-model",
+            inner.inner.model, "test-model",
            "Inner response model should carry forward from real stream chunks, not be 'unknown'"
        );
        assert_eq!(
-            inner.created, 1234567890,
+            inner.inner.created, 1234567890,
            "Inner response created should carry forward from real stream chunks, not be 0"
        );

        // Verify accumulated content is returned
-        let content = &inner.choices[0].delta.content;
+        let content = &inner.inner.choices[0].delta.content;
        assert!(content.is_some(), "Should have accumulated content");
        let content = content.as_ref().unwrap();
        assert!(
@@ -1379,7 +1391,7 @@ mod tests {
            .find(|r| {
                r.data
                    .as_ref()
-                    .and_then(|d| d.choices.first())
+                    .and_then(|d| d.inner.choices.first())
                    .map(|c| c.delta.tool_calls.is_some())
                    .unwrap_or(false)
            })
@@ -1544,7 +1556,7 @@ mod tests {
        let choice_1_chunks: Vec<_> = results
            .iter()
            .filter_map(|r| r.data.as_ref())
-            .flat_map(|d| &d.choices)
+            .flat_map(|d| &d.inner.choices)
            .filter(|c| c.index == 1 && c.delta.content.is_some())
            .collect();

@@ -1558,7 +1570,7 @@ mod tests {
        let choice_0_tool_calls: Vec<_> = results
            .iter()
            .filter_map(|r| r.data.as_ref())
-            .flat_map(|d| &d.choices)
+            .flat_map(|d| &d.inner.choices)
            .filter(|c| c.index == 0 && c.finish_reason == Some(FinishReason::ToolCalls))
            .collect();

@@ -1571,7 +1583,7 @@ mod tests {
        let choice_2_tool_calls: Vec<_> = results
            .iter()
            .filter_map(|r| r.data.as_ref())
-            .flat_map(|d| &d.choices)
+            .flat_map(|d| &d.inner.choices)
            .filter(|c| c.index == 2 && c.finish_reason == Some(FinishReason::ToolCalls))
            .collect();

@@ -1614,7 +1626,7 @@ mod tests {
        let mut tool_call_responses: Vec<_> = results
            .iter()
            .filter_map(|r| r.data.as_ref())
-            .flat_map(|d| &d.choices)
+            .flat_map(|d| &d.inner.choices)
            .filter(|c| c.finish_reason == Some(FinishReason::ToolCalls))
            .collect();

@@ -1659,7 +1671,7 @@ mod tests {
            let run_responses: Vec<_> = run_results
                .iter()
                .filter_map(|r| r.data.as_ref())
-                .flat_map(|d| &d.choices)
+                .flat_map(|d| &d.inner.choices)
                .filter(|c| c.finish_reason == Some(FinishReason::ToolCalls))
                .collect();

@@ -1683,8 +1695,8 @@ mod tests {

        // Modify the inner data to be a usage-only chunk
        if let Some(ref mut data) = usage_chunk.data {
-            data.choices.clear();
-            data.usage = Some(CompletionUsage {
+            data.inner.choices.clear();
+            data.inner.usage = Some(CompletionUsage {
                prompt_tokens: 11,
                completion_tokens: 3,
                total_tokens: 14,
@@ -1703,7 +1715,7 @@ mod tests {
        assert_eq!(results.len(), 2, "Should have exactly 2 chunks");

        // First chunk should be content chunk
-        let content = results[0].data.as_ref().unwrap().choices[0]
+        let content = results[0].data.as_ref().unwrap().inner.choices[0]
            .delta
            .content
            .as_ref()
@@ -1716,10 +1728,17 @@ mod tests {

        // Second chunk should be usage-only chunk
        assert!(
-            results[1].data.as_ref().unwrap().choices.is_empty(),
+            results[1].data.as_ref().unwrap().inner.choices.is_empty(),
            "Usage chunk should have no choices"
        );
-        let usage = results[1].data.as_ref().unwrap().usage.as_ref().unwrap();
+        let usage = results[1]
+            .data
+            .as_ref()
+            .unwrap()
+            .inner
+            .usage
+            .as_ref()
+            .unwrap();
        assert_eq!(usage.prompt_tokens, 11);
        assert_eq!(usage.completion_tokens, 3);
        assert_eq!(usage.total_tokens, 14);
@@ -1896,7 +1915,7 @@ mod tests {
        let has_analysis_text = results.iter().any(|r| {
            r.data
                .as_ref()
-                .and_then(|d| d.choices.first())
+                .and_then(|d| d.inner.choices.first())
                .and_then(|c| c.delta.content.as_ref())
                .map(|content| {
                    test_utils::extract_text(content)
@@ -1950,7 +1969,7 @@ mod tests {
            let Some(data) = result.data else {
                continue;
            };
-            for choice in data.choices {
+            for choice in data.inner.choices {
                if let Some(content) = choice.delta.content {
                    assert!(
                        !test_utils::extract_text(&content).contains("<｜tool▁calls▁end｜>"),
@@ -2024,7 +2043,7 @@ mod tests {
            let Some(data) = result.data else {
                continue;
            };
-            for choice in data.choices {
+            for choice in data.inner.choices {
                if let Some(content) = choice.delta.content {
                    assert!(
                        !test_utils::extract_text(&content).contains("<｜tool▁calls▁end｜>"),
@@ -2221,7 +2240,7 @@ mod tests {
            .filter_map(|r| {
                r.data
                    .as_ref()
-                    .and_then(|d| d.choices.first())
+                    .and_then(|d| d.inner.choices.first())
                    .and_then(|c| c.delta.content.as_ref())
            })
            .filter(|content| {
@@ -2241,7 +2260,7 @@ mod tests {
            .filter_map(|r| {
                r.data
                    .as_ref()
-                    .and_then(|d| d.choices.first())
+                    .and_then(|d| d.inner.choices.first())
                    .and_then(|c| c.delta.content.as_ref())
            })
            .find(|content| {
@@ -2264,7 +2283,7 @@ mod tests {
            .filter(|r| {
                r.data
                    .as_ref()
-                    .and_then(|d| d.choices.first())
+                    .and_then(|d| d.inner.choices.first())
                    .and_then(|c| c.delta.tool_calls.as_ref())
                    .map(|tc| !tc.is_empty())
                    .unwrap_or(false)
@@ -2397,6 +2416,7 @@ mod parallel_jail_tests {
            .collect();

        let response = NvCreateChatCompletionStreamResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                id: "test-id".to_string(),
                choices,
                created: 1234567890,
@@ -2405,6 +2425,7 @@ mod parallel_jail_tests {
                object: "chat.completion.chunk".to_string(),
                usage: None,
                service_tier: None,
+            },
            nvext: None,
        };

@@ -2428,7 +2449,7 @@ mod parallel_jail_tests {
            .filter(|r| {
                r.data
                    .as_ref()
-                    .is_some_and(|d| d.choices.iter().any(|c| c.delta.tool_calls.is_some()))
+                    .is_some_and(|d| d.inner.choices.iter().any(|c| c.delta.tool_calls.is_some()))
            })
            .collect();

@@ -2441,7 +2462,7 @@ mod parallel_jail_tests {
        let mut all_tool_calls = Vec::new();
        for result in &tool_call_results {
            if let Some(ref data) = result.data {
-                for choice in &data.choices {
+                for choice in &data.inner.choices {
                    if let Some(ref tool_calls) = choice.delta.tool_calls {
                        all_tool_calls.extend(tool_calls.iter());
                    }
@@ -2635,7 +2656,7 @@ mod parallel_jail_tests {
        // Should have normal text before tool calls
        let normal_text_before = results.iter().find(|r| {
            r.data.as_ref().is_some_and(|d| {
-                d.choices.iter().any(|c| {
+                d.inner.choices.iter().any(|c| {
                    c.delta.content.as_ref().is_some_and(|content| {
                        test_utils::extract_text(content).contains("I'll check the weather")
                    })
@@ -2664,7 +2685,7 @@ mod parallel_jail_tests {
        // Should have normal text after tool calls
        let normal_text_after = results.iter().find(|r| {
            r.data.as_ref().is_some_and(|d| {
-                d.choices.iter().any(|c| {
+                d.inner.choices.iter().any(|c| {
                    c.delta.content.as_ref().is_some_and(|content| {
                        test_utils::extract_text(content).contains("Let me get that information")
                    })
@@ -2705,7 +2726,8 @@ mod parallel_jail_tests {
            .iter()
            .map(|r| {
                r.data.as_ref().map_or(0, |d| {
-                    d.choices
+                    d.inner
+                        .choices
                        .iter()
                        .map(|c| c.delta.tool_calls.as_ref().map_or(0, |tc| tc.len()))
                        .sum::<usize>()
@@ -2795,7 +2817,8 @@ mod parallel_jail_tests {
            .iter()
            .map(|r| {
                r.data.as_ref().map_or(0, |d| {
-                    d.choices
+                    d.inner
+                        .choices
                        .iter()
                        .map(|c| c.delta.tool_calls.as_ref().map_or(0, |tc| tc.len()))
                        .sum::<usize>()
@@ -2865,7 +2888,8 @@ mod parallel_jail_tests {
            .iter()
            .map(|r| {
                r.data.as_ref().map_or(0, |d| {
-                    d.choices
+                    d.inner
+                        .choices
                        .iter()
                        .map(|c| c.delta.tool_calls.as_ref().map_or(0, |tc| tc.len()))
                        .sum::<usize>()
@@ -2881,14 +2905,14 @@ mod parallel_jail_tests {
            .filter(|r| {
                r.data
                    .as_ref()
-                    .is_some_and(|d| d.choices.iter().any(|c| c.delta.tool_calls.is_some()))
+                    .is_some_and(|d| d.inner.choices.iter().any(|c| c.delta.tool_calls.is_some()))
            })
            .collect();

        if let Some(result) = tool_call_results.first()
            && let Some(ref data) = result.data
        {
-            for choice in &data.choices {
+            for choice in &data.inner.choices {
                if let Some(ref tool_calls) = choice.delta.tool_calls {
                    for tool_call in tool_calls {
                        if let Some(ref function) = tool_call.function
@@ -2943,7 +2967,8 @@ mod parallel_jail_tests {
            .iter()
            .map(|r| {
                r.data.as_ref().map_or(0, |d| {
-                    d.choices
+                    d.inner
+                        .choices
                        .iter()
                        .map(|c| c.delta.tool_calls.as_ref().map_or(0, |tc| tc.len()))
                        .sum::<usize>()
@@ -2990,7 +3015,8 @@ mod parallel_jail_tests {
        // Should try to parse whatever content was accumulated
        let has_some_content = results.iter().any(|r| {
            r.data.as_ref().is_some_and(|d| {
-                d.choices
+                d.inner
+                    .choices
                    .iter()
                    .any(|c| c.delta.content.is_some() || c.delta.tool_calls.is_some())
            })
@@ -3025,7 +3051,7 @@ mod parallel_jail_tests {
        // Should have normal text content but no tool calls
        let has_normal_text = results.iter().any(|r| {
            r.data.as_ref().is_some_and(|d| {
-                d.choices.iter().any(|c| {
+                d.inner.choices.iter().any(|c| {
                    c.delta.content.as_ref().is_some_and(|content| {
                        test_utils::extract_text(content).contains("I'll help you")
                            || test_utils::extract_text(content).contains("don't need any tools")
@@ -3040,7 +3066,8 @@ mod parallel_jail_tests {
            .iter()
            .map(|r| {
                r.data.as_ref().map_or(0, |d| {
-                    d.choices
+                    d.inner
+                        .choices
                        .iter()
                        .map(|c| c.delta.tool_calls.as_ref().map_or(0, |tc| tc.len()))
                        .sum::<usize>()

--- a/lib/llm/tests/test_reasoning_parser.rs
+++ b/lib/llm/tests/test_reasoning_parser.rs
@@ -39,6 +39,7 @@ fn create_mock_response_chunk(
    };

    let response = NvCreateChatCompletionStreamResponse {
+        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
            id: "test-id".to_string(),
            choices: vec![choice],
            created: 1234567890,
@@ -47,6 +48,7 @@ fn create_mock_response_chunk(
            object: "chat.completion.chunk".to_string(),
            usage: None,
            service_tier: None,
+        },
        nvext: None,
    };

@@ -125,7 +127,7 @@ mod tests {
        let mut all_content = String::new();
        while let Some(item) = output_stream.next().await {
            if let Some(ref data) = item.data {
-                for choice in &data.choices {
+                for choice in &data.inner.choices {
                    if let Some(ref r) = choice.delta.reasoning_content {
                        all_reasoning.push_str(r);
                    }
@@ -177,15 +179,15 @@ mod tests {
        assert_eq!(output_chunks.len(), 3);

        // Chunk 0: "<think>This"
-        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_0, None, Some("This"));

        // Chunk 1: " is reasoning content"
-        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().choices[0];
+        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_1, None, Some(" is reasoning content"));

        // Chunk 2: "</think> Here's my answer."
-        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().choices[0];
+        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_2, Some(" Here's my answer."), None);
    }

@@ -223,15 +225,15 @@ mod tests {
        assert_eq!(output_chunks.len(), 3);

        // Chunk 0: "<think>Only"
-        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice_0 = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_0, None, Some("Only"));

        // Chunk 1: " reasoning"
-        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().choices[0];
+        let output_choice_1 = &output_chunks[1].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_1, None, Some(" reasoning"));

        // Chunk 2: " here</think>"
-        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().choices[0];
+        let output_choice_2 = &output_chunks[2].data.as_ref().unwrap().inner.choices[0];
        assert_choice(output_choice_2, None, Some(" here"));
    }

@@ -266,7 +268,7 @@ mod tests {

        // Verify that only normal content is present
        assert_eq!(output_chunks.len(), 1);
-        let output_choice = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];
        assert_choice(
            output_choice,
            Some("Just normal text without reasoning tags."),
@@ -304,8 +306,8 @@ mod tests {
        assert_eq!(output_chunks.len(), input_chunks.len());

        for (input, output) in input_chunks.iter().zip(output_chunks.iter()) {
-            let input_choice = &input.data.as_ref().unwrap().choices[0];
-            let output_choice = &output.data.as_ref().unwrap().choices[0];
+            let input_choice = &input.data.as_ref().unwrap().inner.choices[0];
+            let output_choice = &output.data.as_ref().unwrap().inner.choices[0];
            assert_choice(
                output_choice,
                input_choice.delta.content.as_ref().map(get_text),
@@ -345,7 +347,7 @@ mod tests {

        // Verify that Mistral-style reasoning is parsed correctly
        assert_eq!(output_chunks.len(), 1);
-        let output_choice = &output_chunks[0].data.as_ref().unwrap().choices[0];
+        let output_choice = &output_chunks[0].data.as_ref().unwrap().inner.choices[0];

        assert!(
            output_choice.delta.reasoning_content.is_some(),
@@ -422,7 +424,7 @@ mod tests {

        for chunk in output_chunks.iter() {
            if let Some(ref response_data) = chunk.data {
-                for choice in &response_data.choices {
+                for choice in &response_data.inner.choices {
                    // Collect reasoning content
                    if let Some(ref reasoning) = choice.delta.reasoning_content {
                        all_reasoning.push_str(reasoning);
@@ -574,7 +576,7 @@ mod tests {

        for chunk in output_chunks.iter() {
            if let Some(ref response_data) = chunk.data {
-                for choice in &response_data.choices {
+                for choice in &response_data.inner.choices {
                    // Collect reasoning content
                    if let Some(ref reasoning) = choice.delta.reasoning_content {
                        all_reasoning.push_str(reasoning);
@@ -685,7 +687,7 @@ mod tests {

        for chunk in output_chunks.iter() {
            if let Some(ref data) = chunk.data {
-                for choice in &data.choices {
+                for choice in &data.inner.choices {
                    if let Some(ref r) = choice.delta.reasoning_content {
                        all_reasoning.push_str(r);
                    }
@@ -782,7 +784,7 @@ mod tests {

        for chunk in output_chunks.iter() {
            if let Some(ref response_data) = chunk.data {
-                for choice in &response_data.choices {
+                for choice in &response_data.inner.choices {
                    if let Some(ref reasoning) = choice.delta.reasoning_content {
                        all_reasoning.push_str(reasoning);
                    }

--- a/lib/llm/tests/test_streaming_tool_parsers.rs
+++ b/lib/llm/tests/test_streaming_tool_parsers.rs
@@ -107,6 +107,7 @@ fn load_test_data(file_path: &str) -> TestData {
            .expect("Failed to parse choices");

            let response = NvCreateChatCompletionStreamResponse {
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
                    id: id.clone(),
                    choices,
                    created: 1234567890,
@@ -115,6 +116,7 @@ fn load_test_data(file_path: &str) -> TestData {
                    object: "chat.completion.chunk".to_string(),
                    usage: None,
                    service_tier: None,
+                },
                nvext: None,
            };

@@ -231,7 +233,7 @@ fn aggregate_content_from_chunks(

    for chunk in chunks.iter() {
        if let Some(ref response_data) = chunk.data {
-            for choice in &response_data.choices {
+            for choice in &response_data.inner.choices {
                // Collect reasoning content
                if let Some(ref reasoning) = choice.delta.reasoning_content {
                    reasoning_content.push_str(reasoning);
@@ -279,7 +281,7 @@ fn validate_finish_reason(
    // Count finish_reason occurrences and track position
    for (idx, chunk) in chunks.iter().enumerate() {
        if let Some(ref response_data) = chunk.data {
-            for choice in &response_data.choices {
+            for choice in &response_data.inner.choices {
                if let Some(reason) = choice.finish_reason {
                    finish_reason_count += 1;
                    last_chunk_index = Some(idx);

--- a/lib/llm/tests/test_streaming_usage.rs
+++ b/lib/llm/tests/test_streaming_usage.rs
@@ -241,12 +241,12 @@ async fn test_streaming_without_usage() {
    for (i, chunk) in content_chunks.iter().enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_none(),
+                response.inner.usage.is_none(),
                "Chunk {} should have usage: None when stream_options not set",
                i
            );
            assert!(
-                !response.choices.is_empty(),
+                !response.inner.choices.is_empty(),
                "Chunk {} should have choices",
                i
            );
@@ -286,12 +286,12 @@ async fn test_streaming_with_usage_compliance() {
    for (i, chunk) in chunks.iter().take(3).enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_none(),
+                response.inner.usage.is_none(),
                "Content chunk {} should have usage: None",
                i
            );
            assert!(
-                !response.choices.is_empty(),
+                !response.inner.choices.is_empty(),
                "Content chunk {} should have choices",
                i
            );
@@ -301,15 +301,15 @@ async fn test_streaming_with_usage_compliance() {
    // Verify the final chunk is the usage-only chunk
    if let Some(final_response) = &chunks[3].data {
        assert!(
-            final_response.choices.is_empty(),
+            final_response.inner.choices.is_empty(),
            "Final usage chunk should have empty choices array"
        );
        assert!(
-            final_response.usage.is_some(),
+            final_response.inner.usage.is_some(),
            "Final usage chunk should have usage statistics"
        );

-        let usage = final_response.usage.as_ref().unwrap();
+        let usage = final_response.inner.usage.as_ref().unwrap();
        assert_eq!(
            usage.completion_tokens, 3,
            "Should have 3 completion tokens"
@@ -359,18 +359,18 @@ async fn test_streaming_with_continuous_usage() {
    for (i, chunk) in chunks.iter().take(3).enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_some(),
+                response.inner.usage.is_some(),
                "Content chunk {} should have usage: Some",
                i
            );
            assert!(
-                !response.choices.is_empty(),
+                !response.inner.choices.is_empty(),
                "Content chunk {} should have choices",
                i
            );

            // Verify usage counts are properly accumulated for each chunk
-            let usage = response.usage.as_ref().unwrap();
+            let usage = response.inner.usage.as_ref().unwrap();
            assert_eq!(
                usage.completion_tokens,
                i as u32 + 1,
@@ -392,15 +392,15 @@ async fn test_streaming_with_continuous_usage() {
    // Verify the final chunk is the usage-only chunk
    if let Some(final_response) = &chunks[3].data {
        assert!(
-            final_response.choices.is_empty(),
+            final_response.inner.choices.is_empty(),
            "Final usage chunk should have empty choices array"
        );
        assert!(
-            final_response.usage.is_some(),
+            final_response.inner.usage.is_some(),
            "Final usage chunk should have usage statistics"
        );

-        let usage = final_response.usage.as_ref().unwrap();
+        let usage = final_response.inner.usage.as_ref().unwrap();
        assert_eq!(
            usage.completion_tokens, 3,
            "Should have 3 completion tokens"
@@ -464,7 +464,7 @@ async fn test_streaming_with_usage_false() {
    for (i, chunk) in content_chunks.iter().enumerate() {
        if let Some(response) = &chunk.data {
            assert!(
-                response.usage.is_none(),
+                response.inner.usage.is_none(),
                "Chunk {} should have usage: None when include_usage is false",
                i
            );
@@ -560,7 +560,7 @@ async fn test_nonstreaming_has_usage_field() {

    // Aggregate the streaming chunks into a single non-streaming response
    // This simulates what the HTTP service does for non-streaming requests
-    let result = dynamo_async_openai::types::CreateChatCompletionResponse::from_annotated_stream(
+    let result = dynamo_llm::protocols::openai::chat_completions::NvCreateChatCompletionResponse::from_annotated_stream(
        transformed_stream,
        ParsingOptions::default(),
    )
@@ -570,12 +570,12 @@ async fn test_nonstreaming_has_usage_field() {
    let response = result.unwrap();

    assert!(
-        response.usage.is_some(),
+        response.inner.usage.is_some(),
        "Non-streaming chat completion response MUST have a usage field populated. \
         This is required for OpenAI API compliance."
    );

-    let usage = response.usage.unwrap();
+    let usage = response.inner.usage.unwrap();

    // Verify usage contains valid token counts
    // In our mock, we generated 3 tokens (from the 3 backend outputs)
@@ -725,7 +725,11 @@ async fn test_chat_streaming_with_cached_tokens_propagation() {

    assert_eq!(chunks.len(), 4, "Should have 3 content + 1 usage chunk");
    if let Some(final_resp) = &chunks[3].data {
-        let usage = final_resp.usage.as_ref().expect("Usage must be present");
+        let usage = final_resp
+            .inner
+            .usage
+            .as_ref()
+            .expect("Usage must be present");
        let cached = usage
            .prompt_tokens_details
            .as_ref()

--- a/lib/llm/tests/tool_choice.rs
+++ b/lib/llm/tests/tool_choice.rs
@@ -157,7 +157,7 @@ async fn test_named_tool_choice_parses_json() {
        .expect("choice generation");

    let response = apply_jail_transformation(raw_response, tool_choice).await;
-    let choice = &response.choices[0];
+    let choice = &response.inner.choices[0];

    assert_eq!(
        choice.finish_reason,
@@ -199,7 +199,7 @@ async fn test_required_tool_choice_parses_json_array() {
        .expect("choice generation");

    let response = apply_jail_transformation(raw_response, tool_choice).await;
-    let choice = &response.choices[0];
+    let choice = &response.inner.choices[0];

    assert_eq!(
        choice.finish_reason,
@@ -259,7 +259,7 @@ async fn test_tool_choice_parse_failure_returns_as_content() {
        .expect("choice generation");

    let response = apply_jail_transformation(raw_response, tool_choice).await;
-    let delta = &response.choices[0].delta;
+    let delta = &response.inner.choices[0].delta;

    // Jail stream behavior: if parsing fails, return accumulated content as-is
    // This matches marker-based FC behavior
@@ -317,11 +317,11 @@ async fn test_streaming_named_tool_buffers_until_finish() {

    let response = &all_responses[0];
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Stop)
    );

-    let tool_calls = response.choices[0].delta.tool_calls.as_ref().unwrap();
+    let tool_calls = response.inner.choices[0].delta.tool_calls.as_ref().unwrap();
    assert_eq!(tool_calls.len(), 1);
    assert_eq!(
        tool_calls[0].function.as_ref().unwrap().name.as_deref(),
@@ -384,11 +384,11 @@ async fn test_streaming_required_tool_parallel() {

    let response = &all_responses[0];
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ToolCalls)
    );

-    let tool_calls = response.choices[0].delta.tool_calls.as_ref().unwrap();
+    let tool_calls = response.inner.choices[0].delta.tool_calls.as_ref().unwrap();
    assert_eq!(tool_calls.len(), 2);

    assert_eq!(
@@ -445,8 +445,12 @@ fn test_no_tool_choice_outputs_normal_text() {
        .expect("normal text");

    assert_eq!(
-        response.choices[0].delta.content.as_ref().map(get_text),
+        response.inner.choices[0]
+            .delta
+            .content
+            .as_ref()
+            .map(get_text),
        Some("Hello world")
    );
-    assert!(response.choices[0].delta.tool_calls.is_none());
+    assert!(response.inner.choices[0].delta.tool_calls.is_none());
 }
--- a/lib/llm/tests/tool_choice_finish_reasons.rs
+++ b/lib/llm/tests/tool_choice_finish_reasons.rs
@@ -116,7 +116,7 @@ async fn test_named_tool_choice_preserves_length_finish_reason() {

    // Critical: Length finish reason should be preserved, NOT replaced with Stop
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Length),
        "Length finish reason must be preserved for tool_choice=named"
    );
@@ -139,7 +139,7 @@ fn test_required_tool_choice_preserves_length_finish_reason() {

    // Critical: Length finish reason should be preserved, NOT replaced with ToolCalls
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Length),
        "Length finish reason must be preserved for tool_choice=required"
    );
@@ -169,7 +169,7 @@ fn test_named_tool_choice_preserves_content_filter() {

    // Critical: ContentFilter finish reason should be preserved
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ContentFilter),
        "ContentFilter finish reason must be preserved for tool_choice=named"
    );
@@ -192,7 +192,7 @@ fn test_required_tool_choice_preserves_content_filter() {

    // Critical: ContentFilter finish reason should be preserved
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ContentFilter),
        "ContentFilter finish reason must be preserved for tool_choice=required"
    );
@@ -222,7 +222,7 @@ fn test_named_tool_choice_normal_stop_becomes_stop() {

    // Normal completion: Stop should remain Stop for named tool choice
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::Stop),
    );
 }
@@ -247,7 +247,7 @@ async fn test_required_tool_choice_normal_stop_becomes_tool_calls() {

    // Normal completion: Stop should become ToolCalls for required tool choice
    assert_eq!(
-        response.choices[0].finish_reason,
+        response.inner.choices[0].finish_reason,
        Some(dynamo_async_openai::types::FinishReason::ToolCalls),
    );
 }