chore: remove flatten for chat response types, add reasoning_content (#2543)

Change the chat completion response objects from wrapper structs to type aliases of the dynamo_async_openai types. Implement the aggregator trait for the chat completion response type. Add reasoning_content to the message and delta message types in lib/async-openai.

chore: remove flatten for chat response types, add reasoning_content (#2543)
Change the chat completion response objects from wrapper structs to type aliases of the dynamo_async_openai types. Implement the aggregator trait for the chat completion response type. Add reasoning_content to the message and delta message types in lib/async-openai.
c12fe501 · nachiketb-nvidia · GitHub · a0ddcbce · c12fe501 · c12fe501
Unverified Commit c12fe501 authored Aug 19, 2025 by nachiketb-nvidia Committed by GitHub Aug 20, 2025
15 changed files
--- a/.github/workflows/docs-link-check.yml
+++ b/.github/workflows/docs-link-check.yml
@@ -50,6 +50,7 @@ jobs:
          # Set GITHUB_TOKEN to avoid github rate limits on URL checks
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
+          cd docs
          set -euo pipefail
          # Run lychee against all files in repo
          lychee \

--- a/lib/async-openai/src/types/chat.rs
+++ b/lib/async-openai/src/types/chat.rs
@@ -449,6 +449,9 @@ pub struct ChatCompletionResponseMessage {
    /// If the audio output modality is requested, this object contains data about the audio response from the model. [Learn more](https://platform.openai.com/docs/guides/audio).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio: Option<ChatCompletionResponseMessageAudio>,
+
+    /// NVIDIA-specific extension: optional reasoning content for the chat completion response message.
+    pub reasoning_content: Option<String>,
 }

 #[derive(Clone, Serialize, Default, Debug, Deserialize, Builder, PartialEq)]
@@ -1021,6 +1024,9 @@ pub struct ChatCompletionStreamResponseDelta {
    pub role: Option<Role>,
    /// The refusal message generated by the model.
    pub refusal: Option<String>,
+
+    /// NVIDIA-specific extension: optional reasoning content for the streamed chat completion delta.
+    pub reasoning_content: Option<String>,
 }

 #[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]

--- a/lib/engines/mistralrs/src/lib.rs
+++ b/lib/engines/mistralrs/src/lib.rs
@@ -396,7 +396,7 @@ impl
                        //tracing::trace!("from_assistant: {from_assistant}");

                        #[allow(deprecated)]
-                        let inner = dynamo_async_openai::types::CreateChatCompletionStreamResponse{
+                        let delta = NvCreateChatCompletionStreamResponse {
                            id: c.id,
                            choices: vec![dynamo_async_openai::types::ChatChoiceStream{
                                index: 0,
@@ -407,6 +407,7 @@ impl
                                    tool_calls: None,
                                    refusal: None,
                                    function_call: None,
+                                    reasoning_content: None,
                                },
                                logprobs: None,
                                finish_reason,
@@ -418,7 +419,6 @@ impl
                            system_fingerprint: Some(c.system_fingerprint),
                            service_tier: None,
                        };
-                        let delta = NvCreateChatCompletionStreamResponse{inner};
                        let ann = Annotated{
                            id: None,
                            data: Some(delta),

--- a/lib/llm/src/engines.rs
+++ b/lib/llm/src/engines.rs
@@ -204,18 +204,12 @@ impl
            for c in prompt.chars() {
                // we are returning characters not tokens, so there will be some postprocessing overhead
                tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
-                let inner = deltas.create_choice(0, Some(c.to_string()), None, None);
-                let response = NvCreateChatCompletionStreamResponse {
-                    inner,
-                };
+                let response = deltas.create_choice(0, Some(c.to_string()), None, None);
                yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, comment: None };
                id += 1;
            }

-            let inner = deltas.create_choice(0, None, Some(dynamo_async_openai::types::FinishReason::Stop), None);
-            let response = NvCreateChatCompletionStreamResponse {
-                inner,
-            };
+            let response = deltas.create_choice(0, None, Some(dynamo_async_openai::types::FinishReason::Stop), None);
            yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };
        };


--- a/lib/llm/src/entrypoint/input/batch.rs
+++ b/lib/llm/src/entrypoint/input/batch.rs
@@ -233,7 +233,7 @@ async fn evaluate(
        match (item.data.as_ref(), item.event.as_deref()) {
            (Some(data), _) => {
                // Normal case
-                let choice = data.inner.choices.first();
+                let choice = data.choices.first();
                let chat_comp = choice.as_ref().unwrap();
                if let Some(c) = &chat_comp.delta.content {
                    output += c;

--- a/lib/llm/src/entrypoint/input/text.rs
+++ b/lib/llm/src/entrypoint/input/text.rs
@@ -143,7 +143,7 @@ async fn main_loop(
            match (item.data.as_ref(), item.event.as_deref()) {
                (Some(data), _) => {
                    // Normal case
-                    let entry = data.inner.choices.first();
+                    let entry = data.choices.first();
                    let chat_comp = entry.as_ref().unwrap();
                    if let Some(c) = &chat_comp.delta.content {
                        let _ = stdout.write(c.as_bytes());

--- a/lib/llm/src/http/service/openai.rs
+++ b/lib/llm/src/http/service/openai.rs
@@ -31,6 +31,7 @@ use super::{
    service_v2, RouteDoc,
 };
 use crate::preprocessor::LLMMetricAnnotation;
+use crate::protocols::openai::chat_completions::aggregator::ChatCompletionAggregator;
 use crate::protocols::openai::{
    chat_completions::{NvCreateChatCompletionRequest, NvCreateChatCompletionResponse},
    completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},

--- a/lib/llm/src/perf/logprobs.rs
+++ b/lib/llm/src/perf/logprobs.rs
@@ -128,7 +128,7 @@ impl LogprobExtractor for NvCreateChatCompletionStreamResponse {
    fn extract_logprobs_by_choice(&self) -> HashMap<u32, Vec<TokenLogProbs>> {
        let mut result = HashMap::new();

-        for choice in &self.inner.choices {
+        for choice in &self.choices {
            let choice_index = choice.index;

            let choice_logprobs = choice
@@ -574,8 +574,7 @@ mod tests {
    use approx::assert_abs_diff_eq;
    use dynamo_async_openai::types::{
        ChatChoiceLogprobs, ChatChoiceStream, ChatCompletionStreamResponseDelta,
-        ChatCompletionTokenLogprob, CreateChatCompletionStreamResponse, FinishReason, Role,
-        TopLogprobs,
+        ChatCompletionTokenLogprob, FinishReason, Role, TopLogprobs,
    };
    use futures::StreamExt;
    use std::sync::Arc;
@@ -949,7 +948,7 @@ mod tests {
        token_logprobs: Vec<ChatCompletionTokenLogprob>,
    ) -> NvCreateChatCompletionStreamResponse {
        #[expect(deprecated)]
-        let inner = CreateChatCompletionStreamResponse {
+        NvCreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            choices: vec![ChatChoiceStream {
                index: 0,
@@ -959,6 +958,7 @@ mod tests {
                    tool_calls: None,
                    role: Some(Role::Assistant),
                    refusal: None,
+                    reasoning_content: None,
                },
                finish_reason: Some(FinishReason::Stop),
                logprobs: Some(ChatChoiceLogprobs {
@@ -972,9 +972,7 @@ mod tests {
            system_fingerprint: None,
            object: "chat.completion.chunk".to_string(),
            usage: None,
-        };
-
-        NvCreateChatCompletionStreamResponse { inner }
+        }
    }

    fn create_mock_response_with_multiple_choices(
@@ -992,6 +990,7 @@ mod tests {
                    tool_calls: None,
                    role: Some(Role::Assistant),
                    refusal: None,
+                    reasoning_content: None,
                },
                finish_reason: Some(FinishReason::Stop),
                logprobs: Some(ChatChoiceLogprobs {
@@ -1001,7 +1000,7 @@ mod tests {
            })
            .collect();

-        let inner = CreateChatCompletionStreamResponse {
+        NvCreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            choices,
            created: 1234567890,
@@ -1010,9 +1009,7 @@ mod tests {
            system_fingerprint: None,
            object: "chat.completion.chunk".to_string(),
            usage: None,
-        };
-
-        NvCreateChatCompletionStreamResponse { inner }
+        }
    }

    #[test]
@@ -1331,7 +1328,7 @@ mod tests {
    fn test_logprob_extractor_with_missing_data() {
        // Test with choice that has no logprobs
        #[expect(deprecated)]
-        let inner = CreateChatCompletionStreamResponse {
+        let response = NvCreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            choices: vec![ChatChoiceStream {
                index: 0,
@@ -1341,6 +1338,7 @@ mod tests {
                    tool_calls: None,
                    role: Some(Role::Assistant),
                    refusal: None,
+                    reasoning_content: None,
                },
                finish_reason: Some(FinishReason::Stop),
                logprobs: None, // No logprobs
@@ -1353,7 +1351,6 @@ mod tests {
            usage: None,
        };

-        let response = NvCreateChatCompletionStreamResponse { inner };
        let logprobs = response.extract_logprobs_by_choice();
        assert_eq!(logprobs.len(), 1);
        assert!(logprobs.values().any(|v| v.is_empty()));
@@ -1556,9 +1553,8 @@ mod tests {
    fn create_mock_response() -> NvCreateChatCompletionStreamResponse {
        // Create a mock response for testing
        // In practice, this would have real logprobs data
-        use dynamo_async_openai::types::CreateChatCompletionStreamResponse;

-        let inner = CreateChatCompletionStreamResponse {
+        NvCreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            choices: vec![],
            created: 1234567890,
@@ -1567,9 +1563,7 @@ mod tests {
            system_fingerprint: None,
            object: "chat.completion.chunk".to_string(),
            usage: None,
-        };
-
-        NvCreateChatCompletionStreamResponse { inner }
+        }
    }

    // Mock context for testing

--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -27,7 +27,7 @@ use super::{
    OpenAIStopConditionsProvider,
 };

-mod aggregator;
+pub mod aggregator;
 mod delta;

 pub use aggregator::DeltaAggregator;
@@ -59,11 +59,7 @@ pub struct NvCreateChatCompletionRequest {
 /// # Fields
 /// - `inner`: The base OpenAI unary chat completion response, embedded
 ///   using `serde(flatten)`.
-#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
-pub struct NvCreateChatCompletionResponse {
-    #[serde(flatten)]
-    pub inner: dynamo_async_openai::types::CreateChatCompletionResponse,
-}
+pub type NvCreateChatCompletionResponse = dynamo_async_openai::types::CreateChatCompletionResponse;

 /// A response structure for streamed chat completions, embedding OpenAI's
 /// `CreateChatCompletionStreamResponse`.
@@ -71,11 +67,8 @@ pub struct NvCreateChatCompletionResponse {
 /// # Fields
 /// - `inner`: The base OpenAI streaming chat completion response, embedded
 ///   using `serde(flatten)`.
-#[derive(Serialize, Deserialize, Validate, Debug, Clone)]
-pub struct NvCreateChatCompletionStreamResponse {
-    #[serde(flatten)]
-    pub inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse,
-}
+pub type NvCreateChatCompletionStreamResponse =
+    dynamo_async_openai::types::CreateChatCompletionStreamResponse;

 /// Implements `NvExtProvider` for `NvCreateChatCompletionRequest`,
 /// providing access to NVIDIA-specific extensions.

--- a/lib/llm/src/protocols/openai/chat_completions/aggregator.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/aggregator.rs
@@ -110,21 +110,21 @@ impl DeltaAggregator {
                if aggregator.error.is_none() && delta.data.is_some() {
                    // Extract the data payload from the delta.
                    let delta = delta.data.unwrap();
-                    aggregator.id = delta.inner.id;
-                    aggregator.model = delta.inner.model;
-                    aggregator.created = delta.inner.created;
-                    aggregator.service_tier = delta.inner.service_tier;
+                    aggregator.id = delta.id;
+                    aggregator.model = delta.model;
+                    aggregator.created = delta.created;
+                    aggregator.service_tier = delta.service_tier;

                    // Aggregate usage statistics if available.
-                    if let Some(usage) = delta.inner.usage {
+                    if let Some(usage) = delta.usage {
                        aggregator.usage = Some(usage);
                    }
-                    if let Some(system_fingerprint) = delta.inner.system_fingerprint {
+                    if let Some(system_fingerprint) = delta.system_fingerprint {
                        aggregator.system_fingerprint = Some(system_fingerprint);
                    }

                    // Aggregate choices incrementally.
-                    for choice in delta.inner.choices {
+                    for choice in delta.choices {
                        let state_choice =
                            aggregator
                                .choices
@@ -198,7 +198,7 @@ impl DeltaAggregator {
        choices.sort_by(|a, b| a.index.cmp(&b.index));

        // Construct the final response object.
-        let inner = dynamo_async_openai::types::CreateChatCompletionResponse {
+        let response = NvCreateChatCompletionResponse {
            id: aggregator.id,
            created: aggregator.created,
            usage: aggregator.usage,
@@ -209,8 +209,6 @@ impl DeltaAggregator {
            service_tier: aggregator.service_tier,
        };

-        let response = NvCreateChatCompletionResponse { inner };
-
        Ok(response)
    }
 }
@@ -234,6 +232,7 @@ impl From<DeltaChoice> for dynamo_async_openai::types::ChatChoice {
                refusal: None,
                function_call: None,
                audio: None,
+                reasoning_content: None,
            },
            index: delta.index,
            finish_reason: delta.finish_reason,
@@ -242,35 +241,48 @@ impl From<DeltaChoice> for dynamo_async_openai::types::ChatChoice {
    }
 }

-impl NvCreateChatCompletionResponse {
-    /// Converts an SSE stream into a [`NvCreateChatCompletionResponse`].
+/// Trait for aggregating chat completion responses from streams.
+/// The `async_fn_in_trait` lint is allowed because these async functions are not used outside of the library.
+#[allow(async_fn_in_trait)]
+pub trait ChatCompletionAggregator {
+    /// Aggregates an annotated stream of chat completion responses into a final response.
    ///
    /// # Arguments
-    /// * `stream` - A stream of SSE messages containing chat completion responses.
+    /// * `stream` - A stream of annotated chat completion responses.
    ///
    /// # Returns
    /// * `Ok(NvCreateChatCompletionResponse)` if aggregation succeeds.
    /// * `Err(String)` if an error occurs.
-    pub async fn from_sse_stream(
-        stream: DataStream<Result<Message, SseCodecError>>,
-    ) -> Result<NvCreateChatCompletionResponse, String> {
-        let stream = convert_sse_stream::<NvCreateChatCompletionStreamResponse>(stream);
-        NvCreateChatCompletionResponse::from_annotated_stream(stream).await
-    }
+    async fn from_annotated_stream(
+        stream: impl Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>>,
+    ) -> Result<NvCreateChatCompletionResponse, String>;

-    /// Aggregates an annotated stream of chat completion responses into a final response.
+    /// Converts an SSE stream into a [`NvCreateChatCompletionResponse`].
    ///
    /// # Arguments
-    /// * `stream` - A stream of annotated chat completion responses.
+    /// * `stream` - A stream of SSE messages containing chat completion responses.
    ///
    /// # Returns
    /// * `Ok(NvCreateChatCompletionResponse)` if aggregation succeeds.
    /// * `Err(String)` if an error occurs.
-    pub async fn from_annotated_stream(
+    async fn from_sse_stream(
+        stream: DataStream<Result<Message, SseCodecError>>,
+    ) -> Result<NvCreateChatCompletionResponse, String>;
+}
+
+impl ChatCompletionAggregator for dynamo_async_openai::types::CreateChatCompletionResponse {
+    async fn from_annotated_stream(
        stream: impl Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>>,
    ) -> Result<NvCreateChatCompletionResponse, String> {
        DeltaAggregator::apply(stream).await
    }
+
+    async fn from_sse_stream(
+        stream: DataStream<Result<Message, SseCodecError>>,
+    ) -> Result<NvCreateChatCompletionResponse, String> {
+        let stream = convert_sse_stream::<NvCreateChatCompletionStreamResponse>(stream);
+        NvCreateChatCompletionResponse::from_annotated_stream(stream).await
+    }
 }

 #[cfg(test)]
@@ -293,6 +305,7 @@ mod tests {
            tool_calls: None,
            role,
            refusal: None,
+            reasoning_content: None,
        };
        let choice = dynamo_async_openai::types::ChatChoiceStream {
            index,
@@ -301,7 +314,7 @@ mod tests {
            logprobs: None,
        };

-        let inner = dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+        let data = NvCreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            model: "meta/llama-3.1-8b-instruct".to_string(),
            created: 1234567890,
@@ -312,8 +325,6 @@ mod tests {
            object: "chat.completion".to_string(),
        };

-        let data = NvCreateChatCompletionStreamResponse { inner };
-
        Annotated {
            data: Some(data),
            id: Some("test_id".to_string()),
@@ -336,13 +347,13 @@ mod tests {
        let response = result.unwrap();

        // Verify that the response is empty and has default values
-        assert_eq!(response.inner.id, "");
-        assert_eq!(response.inner.model, "");
-        assert_eq!(response.inner.created, 0);
-        assert!(response.inner.usage.is_none());
-        assert!(response.inner.system_fingerprint.is_none());
-        assert_eq!(response.inner.choices.len(), 0);
-        assert!(response.inner.service_tier.is_none());
+        assert_eq!(response.id, "");
+        assert_eq!(response.model, "");
+        assert_eq!(response.created, 0);
+        assert!(response.usage.is_none());
+        assert!(response.system_fingerprint.is_none());
+        assert_eq!(response.choices.len(), 0);
+        assert!(response.service_tier.is_none());
    }

    #[tokio::test]
@@ -366,18 +377,18 @@ mod tests {
        let response = result.unwrap();

        // Verify the response fields
-        assert_eq!(response.inner.id, "test_id");
-        assert_eq!(response.inner.model, "meta/llama-3.1-8b-instruct");
-        assert_eq!(response.inner.created, 1234567890);
-        assert!(response.inner.usage.is_none());
-        assert!(response.inner.system_fingerprint.is_none());
-        assert_eq!(response.inner.choices.len(), 1);
-        let choice = &response.inner.choices[0];
+        assert_eq!(response.id, "test_id");
+        assert_eq!(response.model, "meta/llama-3.1-8b-instruct");
+        assert_eq!(response.created, 1234567890);
+        assert!(response.usage.is_none());
+        assert!(response.system_fingerprint.is_none());
+        assert_eq!(response.choices.len(), 1);
+        let choice = &response.choices[0];
        assert_eq!(choice.index, 0);
        assert_eq!(choice.message.content.as_ref().unwrap(), "Hello,");
        assert!(choice.finish_reason.is_none());
        assert_eq!(choice.message.role, dynamo_async_openai::types::Role::User);
-        assert!(response.inner.service_tier.is_none());
+        assert!(response.service_tier.is_none());
    }

    #[tokio::test]
@@ -410,8 +421,8 @@ mod tests {
        let response = result.unwrap();

        // Verify the response fields
-        assert_eq!(response.inner.choices.len(), 1);
-        let choice = &response.inner.choices[0];
+        assert_eq!(response.choices.len(), 1);
+        let choice = &response.choices[0];
        assert_eq!(choice.index, 0);
        assert_eq!(choice.message.content.as_ref().unwrap(), "Hello, world!");
        assert_eq!(
@@ -426,7 +437,7 @@ mod tests {
    async fn test_multiple_choices() {
        // Create a delta with multiple choices
        // ALLOW: function_call is deprecated
-        let delta = dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+        let data = NvCreateChatCompletionStreamResponse {
            id: "test_id".to_string(),
            model: "test_model".to_string(),
            created: 1234567890,
@@ -442,6 +453,7 @@ mod tests {
                        function_call: None,
                        tool_calls: None,
                        refusal: None,
+                        reasoning_content: None,
                    },
                    finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
                    logprobs: None,
@@ -454,6 +466,7 @@ mod tests {
                        function_call: None,
                        tool_calls: None,
                        refusal: None,
+                        reasoning_content: None,
                    },
                    finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
                    logprobs: None,
@@ -462,8 +475,6 @@ mod tests {
            object: "chat.completion".to_string(),
        };

-        let data = NvCreateChatCompletionStreamResponse { inner: delta };
-
        // Wrap it in Annotated and create a stream
        let annotated_delta = Annotated {
            data: Some(data),
@@ -481,9 +492,9 @@ mod tests {
        let mut response = result.unwrap();

        // Verify the response fields
-        assert_eq!(response.inner.choices.len(), 2);
-        response.inner.choices.sort_by(|a, b| a.index.cmp(&b.index)); // Ensure the choices are ordered
-        let choice0 = &response.inner.choices[0];
+        assert_eq!(response.choices.len(), 2);
+        response.choices.sort_by(|a, b| a.index.cmp(&b.index)); // Ensure the choices are ordered
+        let choice0 = &response.choices[0];
        assert_eq!(choice0.index, 0);
        assert_eq!(choice0.message.content.as_ref().unwrap(), "Choice 0");
        assert_eq!(
@@ -495,7 +506,7 @@ mod tests {
            dynamo_async_openai::types::Role::Assistant
        );

-        let choice1 = &response.inner.choices[1];
+        let choice1 = &response.choices[1];
        assert_eq!(choice1.index, 1);
        assert_eq!(choice1.message.content.as_ref().unwrap(), "Choice 1");
        assert_eq!(
@@ -520,9 +531,7 @@ mod tests {
            Some(dynamo_async_openai::types::Role::Assistant),
            Some(dynamo_async_openai::types::FinishReason::ToolCalls),
        );
-        let delta = annotated_delta.data.unwrap().inner;
-
-        let data = NvCreateChatCompletionStreamResponse { inner: delta };
+        let data = annotated_delta.data.unwrap();

        // Wrap it in Annotated and create a stream
        let annotated_delta = Annotated {
@@ -541,8 +550,8 @@ mod tests {
        let response = result.unwrap();

        // There should be one choice
-        assert_eq!(response.inner.choices.len(), 1);
-        let choice = &response.inner.choices[0];
+        assert_eq!(response.choices.len(), 1);
+        let choice = &response.choices[0];

        // The tool_calls field should be present and parsed
        assert!(choice.message.tool_calls.is_some());

--- a/lib/llm/src/protocols/openai/chat_completions/delta.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/delta.rs
@@ -209,6 +209,7 @@ impl DeltaGenerator {
                None
            },
            refusal: None,
+            reasoning_content: None,
        };

        let choice = dynamo_async_openai::types::ChatChoiceStream {
@@ -304,9 +305,7 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
        let index = 0;
        let stream_response = self.create_choice(index, delta.text, finish_reason, logprobs);

-        Ok(NvCreateChatCompletionStreamResponse {
-            inner: stream_response,
-        })
+        Ok(stream_response)
    }

    fn get_isl(&self) -> Option<u32> {

--- a/lib/llm/src/protocols/openai/responses.rs
+++ b/lib/llm/src/protocols/openai/responses.rs
@@ -199,7 +199,7 @@ impl TryFrom<NvCreateChatCompletionResponse> for NvResponse {
    type Error = anyhow::Error;

    fn try_from(nv_resp: NvCreateChatCompletionResponse) -> Result<Self, Self::Error> {
-        let chat_resp = nv_resp.inner;
+        let chat_resp = nv_resp;
        let content_text = chat_resp
            .choices
            .into_iter()
@@ -341,28 +341,27 @@ mod tests {
    fn test_into_nvresponse_from_chat_response() {
        let now = 1_726_000_000;
        let chat_resp = NvCreateChatCompletionResponse {
-            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
-                id: "chatcmpl-xyz".into(),
-                choices: vec![dynamo_async_openai::types::ChatChoice {
-                    index: 0,
-                    message: dynamo_async_openai::types::ChatCompletionResponseMessage {
-                        content: Some("This is a reply".into()),
-                        refusal: None,
-                        tool_calls: None,
-                        role: dynamo_async_openai::types::Role::Assistant,
-                        function_call: None,
-                        audio: None,
-                    },
-                    finish_reason: None,
-                    logprobs: None,
-                }],
-                created: now,
-                model: "llama-3.1-8b-instruct".into(),
-                service_tier: None,
-                system_fingerprint: None,
-                object: "chat.completion".to_string(),
-                usage: None,
-            },
+            id: "chatcmpl-xyz".into(),
+            choices: vec![dynamo_async_openai::types::ChatChoice {
+                index: 0,
+                message: dynamo_async_openai::types::ChatCompletionResponseMessage {
+                    content: Some("This is a reply".into()),
+                    refusal: None,
+                    tool_calls: None,
+                    role: dynamo_async_openai::types::Role::Assistant,
+                    function_call: None,
+                    audio: None,
+                    reasoning_content: None,
+                },
+                finish_reason: None,
+                logprobs: None,
+            }],
+            created: now,
+            model: "llama-3.1-8b-instruct".into(),
+            service_tier: None,
+            system_fingerprint: None,
+            object: "chat.completion".to_string(),
+            usage: None,
        };

        let wrapped: NvResponse = chat_resp.try_into().unwrap();

--- a/lib/llm/tests/aggregators.rs
+++ b/lib/llm/tests/aggregators.rs
@@ -16,7 +16,8 @@
 use dynamo_llm::protocols::{
    codec::{create_message_stream, Message, SseCodecError},
    openai::{
-        chat_completions::NvCreateChatCompletionResponse, completions::NvCreateCompletionResponse,
+        chat_completions::{aggregator::ChatCompletionAggregator, NvCreateChatCompletionResponse},
+        completions::NvCreateCompletionResponse,
    },
    ContentProvider, DataStream,
 };
@@ -43,7 +44,6 @@ async fn test_openai_chat_stream() {
    // todo: provide a cleaner way to extract the content from choices
    assert_eq!(
        result
-            .inner
            .choices
            .first()
            .unwrap()
@@ -65,7 +65,6 @@ async fn test_openai_chat_edge_case_multi_line_data() {

    assert_eq!(
        result
-            .inner
            .choices
            .first()
            .unwrap()
@@ -86,7 +85,6 @@ async fn test_openai_chat_edge_case_comments_per_response() {

    assert_eq!(
        result
-            .inner
            .choices
            .first()
            .unwrap()

--- a/lib/llm/tests/http-service.rs
+++ b/lib/llm/tests/http-service.rs
@@ -100,11 +100,7 @@ impl
        let stream = stream! {
            tokio::time::sleep(std::time::Duration::from_millis(max_tokens)).await;
            for i in 0..10 {
-                let inner = generator.create_choice(i,Some(format!("choice {i}")), None, None);
-
-                let output = NvCreateChatCompletionStreamResponse {
-                    inner,
-                };
+                let output = generator.create_choice(i,Some(format!("choice {i}")), None, None);

                yield Annotated::from_data(output);
            }

--- a/lib/llm/tests/logprob_analysis_integration.rs
+++ b/lib/llm/tests/logprob_analysis_integration.rs
@@ -12,8 +12,7 @@ use dynamo_llm::protocols::openai::chat_completions::NvCreateChatCompletionStrea

 use dynamo_async_openai::types::{
    ChatChoiceLogprobs, ChatChoiceStream, ChatCompletionStreamResponseDelta,
-    ChatCompletionTokenLogprob, CreateChatCompletionStreamResponse, FinishReason, Role,
-    TopLogprobs,
+    ChatCompletionTokenLogprob, FinishReason, Role, TopLogprobs,
 };

 // Type aliases to simplify complex test data structures
@@ -387,6 +386,7 @@ fn create_response_with_linear_probs(
            tool_calls: None,
            role: Some(Role::Assistant),
            refusal: None,
+            reasoning_content: None,
        },
        finish_reason: Some(FinishReason::Stop),
        logprobs: Some(ChatChoiceLogprobs {
@@ -395,7 +395,7 @@ fn create_response_with_linear_probs(
        }),
    };

-    let inner = CreateChatCompletionStreamResponse {
+    NvCreateChatCompletionStreamResponse {
        id: "test_id".to_string(),
        choices: vec![choice],
        created: 1234567890,
@@ -404,9 +404,7 @@ fn create_response_with_linear_probs(
        system_fingerprint: None,
        object: "chat.completion.chunk".to_string(),
        usage: None,
-    };
-
-    NvCreateChatCompletionStreamResponse { inner }
+    }
 }

 fn create_multi_choice_response(
@@ -466,6 +464,7 @@ fn create_multi_choice_response(
                    tool_calls: None,
                    role: Some(Role::Assistant),
                    refusal: None,
+                    reasoning_content: None,
                },
                finish_reason: Some(FinishReason::Stop),
                logprobs: Some(ChatChoiceLogprobs {
@@ -476,7 +475,7 @@ fn create_multi_choice_response(
        })
        .collect();

-    let inner = CreateChatCompletionStreamResponse {
+    NvCreateChatCompletionStreamResponse {
        id: "test_id".to_string(),
        choices,
        created: 1234567890,
@@ -485,7 +484,5 @@ fn create_multi_choice_response(
        system_fingerprint: None,
        object: "chat.completion.chunk".to_string(),
        usage: None,
-    };
-
-    NvCreateChatCompletionStreamResponse { inner }
+    }
 }