fix(responses): accept assistant output_text messages without id/status in input (#6599)

Signed-off-by: Marko Kosec <mkosec@nvidia.com>
Signed-off-by: Matej Kosec <mkosec@nvidia.com>
Signed-off-by: Vasilis Vagias <vvagias@nvidia.com>
Co-authored-by: vvagias <vasilis.n.vagias@gmail.com>
Co-authored-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com>

fix(responses): accept assistant output_text messages without id/status in input (#6599)
Signed-off-by: Marko Kosec <mkosec@nvidia.com>
Signed-off-by: Matej Kosec <mkosec@nvidia.com>
Signed-off-by: Vasilis Vagias <vvagias@nvidia.com>
Co-authored-by: vvagias <vasilis.n.vagias@gmail.com>
Co-authored-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com>
9b2b44e3 · MatejKosec · GitHub · abc02c68 · 9b2b44e3 · 9b2b44e3
Unverified Commit 9b2b44e3 authored Mar 06, 2026 by MatejKosec Committed by GitHub Mar 06, 2026
4 changed files
--- a/lib/async-openai/src/types/responses/response.rs
+++ b/lib/async-openai/src/types/responses/response.rs
@@ -24,10 +24,11 @@ pub enum Role {
 }

 /// Status of input/output items.
-#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, ToSchema)]
+#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Default, ToSchema)]
 #[serde(rename_all = "snake_case")]
 pub enum OutputStatus {
    InProgress,
+    #[default]
    Completed,
    Incomplete,
 }
@@ -367,6 +368,8 @@ pub struct CustomToolCallOutput {
 #[builder(build_fn(error = "OpenAIError"))]
 pub struct EasyInputMessage {
    /// The type of the message input. Always set to `message`.
+    /// Optional in the "easy" format — defaults to `message` when omitted.
+    #[serde(default)]
    pub r#type: MessageType,
    /// The role of the message input. One of `user`, `assistant`, `system`, or `developer`.
    pub role: Role,
@@ -423,6 +426,7 @@ pub enum EasyInputContent {
 }

 /// Parts of a message: text, image, file, or audio.
+/// Also accepts `output_text` for replaying assistant turns in the "easy" input format.
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, ToSchema)]
 #[serde(tag = "type", rename_all = "snake_case")]
 pub enum InputContent {
@@ -437,6 +441,11 @@ pub enum InputContent {
    InputVideo(InputVideoContent),
    /// An audio input to the model.
    InputAudio(InputAudioContent),
+    /// An output text content item, accepted when replaying assistant messages
+    /// in the "easy" input format (role: assistant with output_text content).
+    OutputText(OutputTextContent),
+    /// A refusal content item, accepted when replaying assistant messages.
+    Refusal(RefusalContent),
 }

 /// Video content for input messages.
@@ -894,6 +903,7 @@ pub struct ResponseTextParam {
    /// Setting to `{ "type": "json_object" }` enables the older JSON mode, which
    /// ensures the message the model generates is valid JSON. Using `json_schema`
    /// is preferred for models that support it.
+    #[serde(default)]
    pub format: TextResponseFormatConfiguration,

    /// Constrains the verbosity of the model's response. Lower values will result in
@@ -904,10 +914,11 @@ pub struct ResponseTextParam {
    pub verbosity: Option<Verbosity>,
 }

-#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, ToSchema)]
+#[derive(Debug, Default, Deserialize, Serialize, Clone, PartialEq, ToSchema)]
 #[serde(tag = "type", rename_all = "snake_case")]
 pub enum TextResponseFormatConfiguration {
    /// Default response format. Used to generate text responses.
+    #[default]
    Text,
    /// JSON object response format. An older method of generating JSON responses.
    /// Using `json_schema` is recommended for models that support it.
@@ -1473,6 +1484,8 @@ pub struct ResponseLogProb {
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, ToSchema)]
 pub struct OutputTextContent {
    /// The annotations of the text output.
+    /// Defaults to empty when not provided (e.g., replaying assistant turns as input).
+    #[serde(default)]
    pub annotations: Vec<Annotation>,
    pub logprobs: Option<Vec<LogProb>>,
    /// The text output from the model.
@@ -1545,17 +1558,26 @@ pub struct RefusalContent {
 }

 /// A message generated by the model.
+///
+/// `id` and `status` use `#[serde(default)]` so that clients can feed back a
+/// previous assistant message without those fields (e.g. multi-turn
+/// conversations where the caller only has the `output_text` content).
+/// The `MessageItem` enum is `#[serde(untagged)]` and tries `Output` first;
+/// without defaults the missing fields would cause deserialization to fall
+/// through to `Input`, which rejects `role: "assistant"`.
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, ToSchema)]
 pub struct OutputMessage {
    /// The content of the output message.
    pub content: Vec<OutputMessageContent>,
-    /// The unique ID of the output message.
-    pub id: String,
+    /// The unique ID of the output message. Optional when provided as input (e.g., replaying
+    /// assistant turns in conversation history). Always present in model-generated output.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub id: Option<String>,
    /// The role of the output message. Always `assistant`.
    pub role: AssistantRole,
-    /// The status of the message input. One of `in_progress`, `completed`, or
-    /// `incomplete`. Populated when input items are returned via API.
-    pub status: OutputStatus,
+    /// One of `in_progress`, `completed`, or `incomplete`. Optional when provided as input (e.g., replaying assistant turns in conversation history).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub status: Option<OutputStatus>,
    ///// The type of the output message. Always `message`.
    //pub r#type: MessageType,
 }
@@ -2841,3 +2863,94 @@ pub struct CompactResource {
    /// Token accounting for the compaction pass, including cached, reasoning, and total tokens.
    pub usage: ResponseUsage,
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Issue #6: Assistant messages with output_text content should deserialize
+    /// without requiring `id` and `status` fields. Clients replay previous
+    /// assistant turns in conversation history without output metadata.
+    #[test]
+    fn test_assistant_output_text_without_id_status() {
+        let json = r#"{
+            "role": "assistant",
+            "content": [{"type": "output_text", "text": "Hello!"}],
+            "type": "message"
+        }"#;
+        let item: InputItem = serde_json::from_str(json)
+            .expect("assistant output_text without id/status should deserialize");
+        match &item {
+            InputItem::Item(Item::Message(MessageItem::Output(out_msg))) => {
+                assert!(out_msg.id.is_none());
+                assert!(out_msg.status.is_none());
+                assert_eq!(out_msg.content.len(), 1);
+            }
+            other => panic!("expected OutputMessage, got {:?}", other),
+        }
+    }
+
+    /// Issue #6 extended: full multi-turn conversation with output_text history.
+    #[test]
+    fn test_multiturn_with_output_text_history() {
+        let json = r#"{
+            "model": "test-model",
+            "input": [
+                {"role": "user", "content": "hi", "type": "message"},
+                {
+                    "role": "assistant",
+                    "content": [{"type": "output_text", "text": "Hello!"}],
+                    "type": "message"
+                },
+                {"role": "user", "content": "bye", "type": "message"}
+            ],
+            "stream": false
+        }"#;
+        let request: CreateResponse = serde_json::from_str(json)
+            .expect("multi-turn with output_text history should deserialize");
+        match &request.input {
+            InputParam::Items(items) => assert_eq!(items.len(), 3),
+            other => panic!("expected Items, got {:?}", other),
+        }
+    }
+
+    /// Issue #7: Reasoning items in the input array should deserialize.
+    #[test]
+    fn test_reasoning_item_in_input() {
+        let json = r#"{
+            "type": "reasoning",
+            "id": "rs_1",
+            "summary": [{"text": "thinking", "type": "summary_text"}]
+        }"#;
+        let item: InputItem =
+            serde_json::from_str(json).expect("reasoning item should deserialize");
+        match &item {
+            InputItem::Item(Item::Reasoning(r)) => {
+                assert_eq!(r.id, "rs_1");
+                assert_eq!(r.summary.len(), 1);
+            }
+            other => panic!("expected Reasoning item, got {:?}", other),
+        }
+    }
+
+    /// OutputMessage with id and status should still work (backwards compat).
+    #[test]
+    fn test_output_message_with_id_and_status() {
+        let json = r#"{
+            "role": "assistant",
+            "id": "msg_abc123",
+            "status": "completed",
+            "content": [{"type": "output_text", "text": "Hello!"}],
+            "type": "message"
+        }"#;
+        let item: InputItem = serde_json::from_str(json)
+            .expect("output message with id/status should still deserialize");
+        match &item {
+            InputItem::Item(Item::Message(MessageItem::Output(out_msg))) => {
+                assert_eq!(out_msg.id.as_deref(), Some("msg_abc123"));
+                assert_eq!(out_msg.status, Some(OutputStatus::Completed));
+            }
+            other => panic!("expected OutputMessage, got {:?}", other),
+        }
+    }
+}
--- a/lib/llm/src/http/service/openai.rs
+++ b/lib/llm/src/http/service/openai.rs
@@ -1340,6 +1340,7 @@ async fn responses(
    // Extract request parameters before into_parts() consumes the request.
    // These are echoed back in the Response object per the OpenAI spec.
    let response_params = ResponseParams {
+        model: request.inner.model.clone(),
        temperature: request.inner.temperature,
        top_p: request.inner.top_p,
        max_output_tokens: request.inner.max_output_tokens,
@@ -1347,6 +1348,11 @@ async fn responses(
        tools: request.inner.tools.clone(),
        tool_choice: request.inner.tool_choice.clone(),
        instructions: request.inner.instructions.clone(),
+        reasoning: request.inner.reasoning.clone(),
+        text: request.inner.text.clone(),
+        service_tier: request.inner.service_tier,
+        include: request.inner.include.clone(),
+        truncation: request.inner.truncation,
    };
    let request_id = request.id().to_string();
    let (orig_request, context) = request.into_parts();
@@ -1367,11 +1373,14 @@ async fn responses(
            err_response
        })?;

-    // For non-streaming responses, we still use internal streaming for aggregation,
-    // but we set the chat completion stream flag appropriately.
-    if !streaming {
-        chat_request.inner.stream = Some(true); // Internal streaming for aggregation
-    }
+    // Always use internal streaming for aggregation.
+    // Set stream_options.include_usage so the backend sends token counts in the final chunk.
+    chat_request.inner.stream = Some(true);
+    chat_request.inner.stream_options =
+        Some(dynamo_async_openai::types::ChatCompletionStreamOptions {
+            include_usage: true,
+            continuous_usage_stats: false,
+        });

    let request = context.map(|mut _req| chat_request);

@@ -1556,11 +1565,6 @@ pub fn validate_response_unsupported_fields(
            VALIDATION_PREFIX.to_string() + "`background: true` is not supported.",
        ));
    }
-    if inner.include.is_some() {
-        return Some(ErrorMessage::not_implemented_error(
-            VALIDATION_PREFIX.to_string() + "`include` is not supported.",
-        ));
-    }
    if inner.previous_response_id.is_some() {
        return Some(ErrorMessage::not_implemented_error(
            VALIDATION_PREFIX.to_string() + "`previous_response_id` is not supported.",
@@ -1571,31 +1575,11 @@ pub fn validate_response_unsupported_fields(
            VALIDATION_PREFIX.to_string() + "`prompt` is not supported.",
        ));
    }
-    if inner.reasoning.is_some() {
-        return Some(ErrorMessage::not_implemented_error(
-            VALIDATION_PREFIX.to_string() + "`reasoning` is not supported.",
-        ));
-    }
-    if inner.service_tier.is_some() {
-        return Some(ErrorMessage::not_implemented_error(
-            VALIDATION_PREFIX.to_string() + "`service_tier` is not supported.",
-        ));
-    }
    if inner.store == Some(true) {
        return Some(ErrorMessage::not_implemented_error(
            VALIDATION_PREFIX.to_string() + "`store: true` is not supported.",
        ));
    }
-    if inner.text.is_some() {
-        return Some(ErrorMessage::not_implemented_error(
-            VALIDATION_PREFIX.to_string() + "`text` is not supported.",
-        ));
-    }
-    if inner.truncation.is_some() {
-        return Some(ErrorMessage::not_implemented_error(
-            VALIDATION_PREFIX.to_string() + "`truncation` is not supported.",
-        ));
-    }
    None
 }

@@ -2063,10 +2047,7 @@ mod tests {
    use crate::protocols::openai::common_ext::CommonExt;
    use crate::protocols::openai::completions::NvCreateCompletionRequest;
    use crate::protocols::openai::responses::NvCreateResponse;
-    use dynamo_async_openai::types::responses::{
-        CreateResponse, IncludeEnum, Input, PromptConfig, ServiceTier, TextConfig,
-        TextResponseFormat, Truncation,
-    };
+    use dynamo_async_openai::types::responses::{CreateResponse, Input, PromptConfig};
    use dynamo_async_openai::types::{
        ChatCompletionRequestMessage, ChatCompletionRequestUserMessage,
        ChatCompletionRequestUserMessageContent, CreateChatCompletionRequest,
@@ -2174,10 +2155,6 @@ mod tests {
        #[allow(clippy::type_complexity)]
        let unsupported_cases: Vec<(&str, Box<dyn FnOnce(&mut CreateResponse)>)> = vec![
            ("background", Box::new(|r| r.background = Some(true))),
-            (
-                "include",
-                Box::new(|r| r.include = Some(vec![IncludeEnum::FileSearchCallResults])),
-            ),
            (
                "previous_response_id",
                Box::new(|r| r.previous_response_id = Some("prev-id".into())),
@@ -2192,28 +2169,7 @@ mod tests {
                    })
                }),
            ),
-            (
-                "reasoning",
-                Box::new(|r| r.reasoning = Some(Default::default())),
-            ),
-            (
-                "service_tier",
-                Box::new(|r| r.service_tier = Some(ServiceTier::Auto)),
-            ),
            ("store", Box::new(|r| r.store = Some(true))),
-            (
-                "text",
-                Box::new(|r| {
-                    r.text = Some(TextConfig {
-                        format: TextResponseFormat::Text,
-                        verbosity: None,
-                    })
-                }),
-            ),
-            (
-                "truncation",
-                Box::new(|r| r.truncation = Some(Truncation::Auto)),
-            ),
        ];

        for (field, set_field) in unsupported_cases {

--- a/lib/llm/src/protocols/openai/responses/mod.rs
+++ b/lib/llm/src/protocols/openai/responses/mod.rs
@@ -4,11 +4,12 @@
 pub mod stream_converter;

 use dynamo_async_openai::types::responses::{
-    AssistantRole, FunctionCallOutput, FunctionToolCall, InputContent, InputItem, InputParam,
-    InputRole, Instructions, Item, MessageItem, OutputItem, OutputMessage, OutputMessageContent,
-    OutputStatus, OutputTextContent, Response, ResponseTextParam, Role as ResponseRole,
-    ServiceTier, Status, TextResponseFormatConfiguration, Tool, ToolChoiceOptions, ToolChoiceParam,
-    Truncation,
+    AssistantRole, FunctionCallOutput, FunctionToolCall, IncludeEnum, InputContent, InputItem,
+    InputParam, InputRole, InputTokenDetails, Instructions, Item, MessageItem, OutputItem,
+    OutputMessage, OutputMessageContent, OutputStatus, OutputTextContent, OutputTokenDetails,
+    Reasoning, ReasoningItem, Response, ResponseTextParam, ResponseUsage, Role as ResponseRole,
+    ServiceTier, Status, Summary, SummaryPart, TextResponseFormatConfiguration, Tool,
+    ToolChoiceOptions, ToolChoiceParam, Truncation,
 };
 use dynamo_async_openai::types::{
    ChatCompletionMessageToolCall, ChatCompletionNamedToolChoice,
@@ -20,7 +21,8 @@ use dynamo_async_openai::types::{
    ChatCompletionRequestUserMessage, ChatCompletionRequestUserMessageContent,
    ChatCompletionRequestUserMessageContentPart, ChatCompletionTool,
    ChatCompletionToolChoiceOption, ChatCompletionToolType, CreateChatCompletionRequest,
-    FunctionName, FunctionObject, ImageDetail as ChatImageDetail, ImageUrl, VideoUrl,
+    FunctionName, FunctionObject, ImageDetail as ChatImageDetail, ImageUrl, ResponseFormat,
+    ServiceTier as ChatServiceTier, VideoUrl,
 };
 use dynamo_runtime::protocols::annotated::AnnotationsProvider;
 use serde::{Deserialize, Serialize};
@@ -207,18 +209,33 @@ fn convert_input_content_to_user_content(
            InputContent::InputFile(_) => {
                return Err(anyhow::anyhow!("File input content is not yet supported"));
            }
+            InputContent::OutputText(t) => {
+                chat_parts.push(ChatCompletionRequestUserMessageContentPart::Text(
+                    ChatCompletionRequestMessageContentPartText {
+                        text: t.text.clone(),
+                    },
+                ));
+            }
+            InputContent::Refusal(r) => {
+                chat_parts.push(ChatCompletionRequestUserMessageContentPart::Text(
+                    ChatCompletionRequestMessageContentPartText {
+                        text: r.refusal.clone(),
+                    },
+                ));
+            }
        }
    }
    Ok(ChatCompletionRequestUserMessageContent::Array(chat_parts))
 }

-/// Convert a slice of InputContent to a plain text string (for system/developer messages).
+/// Convert a slice of InputContent to a plain text string (for system/developer/assistant messages).
 fn convert_input_content_to_text(content: &[InputContent]) -> String {
-    // Concatenate all text parts; non-text parts are skipped.
    content
        .iter()
        .filter_map(|p| match p {
            InputContent::InputText(t) => Some(t.text.as_str()),
+            InputContent::OutputText(t) => Some(t.text.as_str()),
+            InputContent::Refusal(r) => Some(r.refusal.as_str()),
            _ => None,
        })
        .collect::<Vec<_>>()
@@ -424,6 +441,29 @@ fn convert_tool_choice(tc: &ToolChoiceParam) -> ChatCompletionToolChoiceOption {
    }
 }

+/// Convert Responses API `text.format` to Chat Completions `response_format`.
+fn convert_text_format(text: &ResponseTextParam) -> Option<ResponseFormat> {
+    match &text.format {
+        TextResponseFormatConfiguration::Text => None,
+        TextResponseFormatConfiguration::JsonObject => Some(ResponseFormat::JsonObject),
+        TextResponseFormatConfiguration::JsonSchema(s) => Some(ResponseFormat::JsonSchema {
+            json_schema: s.clone(),
+        }),
+    }
+}
+
+/// Convert Responses API `ServiceTier` to Chat Completions `ServiceTier`.
+/// These are structurally identical enums in different modules.
+fn convert_service_tier(tier: &ServiceTier) -> ChatServiceTier {
+    match tier {
+        ServiceTier::Auto => ChatServiceTier::Auto,
+        ServiceTier::Default => ChatServiceTier::Default,
+        ServiceTier::Flex => ChatServiceTier::Flex,
+        ServiceTier::Scale => ChatServiceTier::Scale,
+        ServiceTier::Priority => ChatServiceTier::Priority,
+    }
+}
+
 impl TryFrom<NvCreateResponse> for NvCreateChatCompletionRequest {
    type Error = anyhow::Error;

@@ -472,6 +512,15 @@ impl TryFrom<NvCreateResponse> for NvCreateChatCompletionRequest {
        // Determine stream setting: respect caller's preference, default to true for aggregation
        let stream = resp.inner.stream.or(Some(true));

+        // Map reasoning.effort to reasoning_effort
+        let reasoning_effort = resp.inner.reasoning.as_ref().and_then(|r| r.effort.clone());
+
+        // Map text.format to response_format
+        let response_format = resp.inner.text.as_ref().and_then(convert_text_format);
+
+        // Map service_tier
+        let service_tier = resp.inner.service_tier.as_ref().map(convert_service_tier);
+
        Ok(NvCreateChatCompletionRequest {
            inner: CreateChatCompletionRequest {
                messages,
@@ -484,6 +533,9 @@ impl TryFrom<NvCreateResponse> for NvCreateChatCompletionRequest {
                stream,
                tools,
                tool_choice,
+                reasoning_effort,
+                response_format,
+                service_tier,
                ..Default::default()
            },
            common: Default::default(),
@@ -578,6 +630,7 @@ fn strip_tool_call_text(text: &str) -> std::borrow::Cow<'_, str> {
 /// response objects reflect actual request values.
 #[derive(Clone, Debug, Default)]
 pub struct ResponseParams {
+    pub model: Option<String>,
    pub temperature: Option<f32>,
    pub top_p: Option<f32>,
    pub max_output_tokens: Option<u32>,
@@ -585,6 +638,11 @@ pub struct ResponseParams {
    pub tools: Option<Vec<Tool>>,
    pub tool_choice: Option<ToolChoiceParam>,
    pub instructions: Option<String>,
+    pub reasoning: Option<Reasoning>,
+    pub text: Option<ResponseTextParam>,
+    pub service_tier: Option<ServiceTier>,
+    pub include: Option<Vec<IncludeEnum>>,
+    pub truncation: Option<Truncation>,
 }

 /// Normalize tools so that `FunctionTool.strict` is always set.
@@ -610,9 +668,9 @@ pub(super) fn normalize_tools(tools: Vec<Tool>) -> Vec<Tool> {
 /// Build an assistant text message output item.
 fn make_text_message(id: String, text: String) -> OutputItem {
    OutputItem::Message(OutputMessage {
-        id,
+        id: Some(id),
        role: AssistantRole::Assistant,
-        status: OutputStatus::Completed,
+        status: Some(OutputStatus::Completed),
        content: vec![OutputMessageContent::OutputText(OutputTextContent {
            text,
            annotations: vec![],
@@ -660,6 +718,21 @@ pub fn chat_completion_to_response(
            }
        }

+        // Map reasoning_content to a Reasoning output item
+        if let Some(reasoning_text) = choice.message.reasoning_content
+            && !reasoning_text.is_empty()
+        {
+            output.push(OutputItem::Reasoning(ReasoningItem {
+                id: format!("rs_{}", Uuid::new_v4().simple()),
+                summary: vec![SummaryPart::SummaryText(Summary {
+                    text: reasoning_text,
+                })],
+                content: None,
+                encrypted_content: None,
+                status: Some(OutputStatus::Completed),
+            }));
+        }
+
        // Handle text content -- also parse <tool_call> blocks from models
        // that emit tool calls as text (e.g. Qwen3)
        let content_text = match choice.message.content {
@@ -702,13 +775,35 @@ pub fn chat_completion_to_response(
        output.push(make_text_message(message_id, String::new()));
    }

+    // Apply `include` filtering: strip logprobs from output text unless
+    // the caller explicitly requested them via `message.output_text.logprobs`.
+    let keep_logprobs = params
+        .include
+        .as_ref()
+        .is_some_and(|inc| inc.contains(&IncludeEnum::MessageOutputTextLogprobs));
+    if !keep_logprobs {
+        for item in &mut output {
+            if let OutputItem::Message(msg) = item {
+                for content in &mut msg.content {
+                    if let OutputMessageContent::OutputText(text) = content {
+                        text.logprobs = None;
+                    }
+                }
+            }
+        }
+    }
+
    let created_at = chat_resp.created as u64;
    let response = Response {
        id: response_id,
        object: "response".to_string(),
        created_at,
        completed_at: Some(created_at),
-        model: chat_resp.model,
+        model: if chat_resp.model == "unknown" {
+            params.model.clone().unwrap_or(chat_resp.model)
+        } else {
+            chat_resp.model
+        },
        status: Status::Completed,
        output,
        // Spec-required defaults (OpenResponses requires these as non-null)
@@ -721,10 +816,10 @@ pub fn chat_completion_to_response(
        // store: false because this branch does not persist responses.
        store: params.store.or(Some(false)),
        temperature: params.temperature.or(Some(1.0)),
-        text: Some(ResponseTextParam {
+        text: Some(params.text.clone().unwrap_or(ResponseTextParam {
            format: TextResponseFormatConfiguration::Text,
            verbosity: None,
-        }),
+        })),
        tool_choice: params
            .tool_choice
            .clone()
@@ -737,7 +832,7 @@ pub fn chat_completion_to_response(
                .unwrap_or_default(),
        ),
        top_p: params.top_p.or(Some(1.0)),
-        truncation: Some(Truncation::Disabled),
+        truncation: Some(params.truncation.unwrap_or(Truncation::Disabled)),
        // Nullable but required to be present (null is valid)
        billing: None,
        conversation: None,
@@ -750,11 +845,27 @@ pub fn chat_completion_to_response(
        prompt: None,
        prompt_cache_key: None,
        prompt_cache_retention: None,
-        reasoning: None,
+        reasoning: params.reasoning.clone(),
        safety_identifier: None,
-        service_tier: Some(ServiceTier::Auto),
+        service_tier: Some(params.service_tier.unwrap_or(ServiceTier::Auto)),
        top_logprobs: Some(0),
-        usage: None,
+        usage: chat_resp.usage.map(|u| ResponseUsage {
+            input_tokens: u.prompt_tokens,
+            input_tokens_details: InputTokenDetails {
+                cached_tokens: u
+                    .prompt_tokens_details
+                    .map(|d| d.cached_tokens.unwrap_or(0))
+                    .unwrap_or(0),
+            },
+            output_tokens: u.completion_tokens,
+            output_tokens_details: OutputTokenDetails {
+                reasoning_tokens: u
+                    .completion_tokens_details
+                    .map(|d| d.reasoning_tokens.unwrap_or(0))
+                    .unwrap_or(0),
+            },
+            total_tokens: u.total_tokens,
+        }),
    };

    Ok(NvResponse {
@@ -896,9 +1007,9 @@ mod tests {
                        status: None,
                    }))),
                    InputItem::Item(Item::Message(MessageItem::Output(OutputMessage {
-                        id: "msg_1".into(),
+                        id: Some("msg_1".into()),
                        role: AssistantRole::Assistant,
-                        status: OutputStatus::Completed,
+                        status: Some(OutputStatus::Completed),
                        content: vec![OutputMessageContent::OutputText(OutputTextContent {
                            text: "4".into(),
                            annotations: vec![],
@@ -1209,4 +1320,334 @@ thinking
        assert!(!stripped.contains("<tool_call>"));
        assert!(!stripped.contains("<think>"));
    }
+
+    // ── PR1: reasoning / text.format / service_tier pass-through tests ──
+
+    /// A `reasoning.effort` set on the Responses request must come out as
+    /// `reasoning_effort` on the converted chat-completion request.
+    #[test]
+    fn test_reasoning_effort_mapped_to_chat_completion() {
+        use dynamo_async_openai::types::{ReasoningEffort, responses::Reasoning};
+
+        let reasoning = Reasoning {
+            effort: Some(ReasoningEffort::Medium),
+            ..Default::default()
+        };
+        let mut request = make_response_with_input("think hard");
+        request.inner.reasoning = Some(reasoning);
+
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let expected = Some(ReasoningEffort::Medium);
+        assert_eq!(converted.inner.reasoning_effort, expected);
+    }
+
+    /// When the test sets no `reasoning` on the Responses request, the
+    /// converted chat-completion request must leave `reasoning_effort` unset.
+    #[test]
+    fn test_reasoning_none_leaves_chat_field_none() {
+        let request = make_response_with_input("no reasoning");
+
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+
+        assert_eq!(converted.inner.reasoning_effort, None);
+    }
+
+    /// `TextResponseFormatConfiguration::JsonObject` on the Responses request
+    /// must map to `ResponseFormat::JsonObject` on the chat-completion request.
+    #[test]
+    fn test_text_format_json_object_mapped() {
+        use dynamo_async_openai::types::ResponseFormat;
+        use dynamo_async_openai::types::responses::{
+            ResponseTextParam, TextResponseFormatConfiguration,
+        };
+
+        let text_param = ResponseTextParam {
+            verbosity: None,
+            format: TextResponseFormatConfiguration::JsonObject,
+        };
+        let mut request = make_response_with_input("give json");
+        request.inner.text = Some(text_param);
+
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let expected = Some(ResponseFormat::JsonObject);
+        assert_eq!(converted.inner.response_format, expected);
+    }
+
+    /// A `JsonSchema` text format must be carried over unchanged, wrapped in
+    /// `ResponseFormat::JsonSchema`, on the converted chat-completion request.
+    #[test]
+    fn test_text_format_json_schema_mapped() {
+        use dynamo_async_openai::types::responses::{
+            ResponseTextParam, TextResponseFormatConfiguration,
+        };
+        use dynamo_async_openai::types::{ResponseFormat, ResponseFormatJsonSchema};
+
+        let city_schema = ResponseFormatJsonSchema {
+            name: "city".into(),
+            description: None,
+            schema: Some(serde_json::json!({"type": "object"})),
+            strict: Some(true),
+        };
+        // Clone into the request so the original stays available for the expectation.
+        let text_param = ResponseTextParam {
+            format: TextResponseFormatConfiguration::JsonSchema(city_schema.clone()),
+            verbosity: None,
+        };
+        let mut request = make_response_with_input("structured");
+        request.inner.text = Some(text_param);
+
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let expected = Some(ResponseFormat::JsonSchema {
+            json_schema: city_schema,
+        });
+        assert_eq!(converted.inner.response_format, expected);
+    }
+
+    /// The plain `Text` format must not produce a `response_format` on the
+    /// converted chat-completion request.
+    #[test]
+    fn test_text_format_plain_text_leaves_response_format_none() {
+        use dynamo_async_openai::types::responses::{
+            ResponseTextParam, TextResponseFormatConfiguration,
+        };
+
+        let plain_text = ResponseTextParam {
+            verbosity: None,
+            format: TextResponseFormatConfiguration::Text,
+        };
+        let mut request = make_response_with_input("plain");
+        request.inner.text = Some(plain_text);
+
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+
+        assert_eq!(converted.inner.response_format, None);
+    }
+
+    /// A Responses-side `ServiceTier::Priority` must map to the chat-completion
+    /// `ServiceTier::Priority` on the converted request.
+    #[test]
+    fn test_service_tier_mapped_to_chat_completion() {
+        use dynamo_async_openai::types::ServiceTier as ChatServiceTier;
+        use dynamo_async_openai::types::responses::ServiceTier as RespServiceTier;
+
+        let mut request = make_response_with_input("priority");
+        request.inner.service_tier = Some(RespServiceTier::Priority);
+
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let expected_tier = Some(ChatServiceTier::Priority);
+        assert_eq!(converted.inner.service_tier, expected_tier);
+    }
+
+    /// `chat_completion_to_response` must echo the `reasoning` config from the
+    /// request params back on the response.
+    #[test]
+    fn test_response_echoes_reasoning() {
+        use dynamo_async_openai::types::{ReasoningEffort, responses::Reasoning};
+
+        // A chat response with no choices and no usage: only the echoed field is under test.
+        let empty_chat = NvCreateChatCompletionResponse {
+            id: "test".into(),
+            object: "chat.completion".into(),
+            model: "m".into(),
+            created: 0,
+            choices: vec![],
+            usage: None,
+            service_tier: None,
+            system_fingerprint: None,
+            nvext: None,
+        };
+
+        let requested = Reasoning {
+            effort: Some(ReasoningEffort::High),
+            ..Default::default()
+        };
+        let params = ResponseParams {
+            reasoning: Some(requested),
+            ..Default::default()
+        };
+
+        let response = chat_completion_to_response(empty_chat, &params).unwrap();
+        let echoed = response.inner.reasoning.unwrap();
+        assert_eq!(echoed.effort, Some(ReasoningEffort::High));
+    }
+
+    /// `chat_completion_to_response` must echo the `text` format from the
+    /// request params back on the response.
+    #[test]
+    fn test_response_echoes_text_format() {
+        use dynamo_async_openai::types::responses::{
+            ResponseTextParam, TextResponseFormatConfiguration,
+        };
+
+        // A chat response with no choices and no usage: only the echoed field is under test.
+        let empty_chat = NvCreateChatCompletionResponse {
+            id: "test".into(),
+            object: "chat.completion".into(),
+            model: "m".into(),
+            created: 0,
+            choices: vec![],
+            usage: None,
+            service_tier: None,
+            system_fingerprint: None,
+            nvext: None,
+        };
+
+        let requested_text = ResponseTextParam {
+            verbosity: None,
+            format: TextResponseFormatConfiguration::JsonObject,
+        };
+        let params = ResponseParams {
+            text: Some(requested_text),
+            ..Default::default()
+        };
+
+        let response = chat_completion_to_response(empty_chat, &params).unwrap();
+        let echoed = response.inner.text.unwrap();
+        assert_eq!(echoed.format, TextResponseFormatConfiguration::JsonObject);
+    }
+
+    /// `chat_completion_to_response` must echo the `service_tier` from the
+    /// request params back on the response.
+    #[test]
+    fn test_response_echoes_service_tier() {
+        use dynamo_async_openai::types::responses::ServiceTier;
+
+        // A chat response with no choices and no usage: only the echoed field is under test.
+        let empty_chat = NvCreateChatCompletionResponse {
+            id: "test".into(),
+            object: "chat.completion".into(),
+            model: "m".into(),
+            created: 0,
+            choices: vec![],
+            usage: None,
+            service_tier: None,
+            system_fingerprint: None,
+            nvext: None,
+        };
+        let params = ResponseParams {
+            service_tier: Some(ServiceTier::Flex),
+            ..Default::default()
+        };
+
+        let response = chat_completion_to_response(empty_chat, &params).unwrap();
+        let echoed_tier = response.inner.service_tier;
+        assert_eq!(echoed_tier, Some(ServiceTier::Flex));
+    }
+
+    /// An assistant `message` input item carrying `output_text` content but no
+    /// `id` or `status` must deserialize as `MessageItem::Output`, with both
+    /// fields left as `None`.
+    #[test]
+    fn test_output_message_deserializes_without_id_and_status() {
+        use dynamo_async_openai::types::responses::{InputItem, Item, MessageItem};
+
+        let payload = serde_json::json!({
+            "role": "assistant",
+            "content": [{"type": "output_text", "text": "Hello!", "annotations": []}],
+            "type": "message"
+        });
+
+        let parsed: InputItem = serde_json::from_value(payload).unwrap();
+        // Any other variant means the payload was routed to the wrong message shape.
+        let msg = match parsed {
+            InputItem::Item(Item::Message(MessageItem::Output(msg))) => msg,
+            other => panic!("Expected Item::Message(Output), got {:?}", other),
+        };
+
+        assert_eq!(msg.role, AssistantRole::Assistant);
+        assert_eq!(msg.content.len(), 1);
+        assert!(msg.id.is_none());
+        assert_eq!(msg.status, None);
+    }
+
+    /// An assistant `message` input item that does carry `id` and `status`
+    /// must still deserialize as `MessageItem::Output` and keep both values.
+    #[test]
+    fn test_output_message_with_id_and_status_still_works() {
+        use dynamo_async_openai::types::responses::{InputItem, Item, MessageItem, OutputStatus};
+
+        let payload = serde_json::json!({
+            "role": "assistant",
+            "id": "msg_abc123",
+            "status": "completed",
+            "content": [{"type": "output_text", "text": "Hello!", "annotations": []}],
+            "type": "message"
+        });
+
+        let parsed: InputItem = serde_json::from_value(payload).unwrap();
+        // Any other variant means the payload was routed to the wrong message shape.
+        let msg = match parsed {
+            InputItem::Item(Item::Message(MessageItem::Output(msg))) => msg,
+            other => panic!("Expected Item::Message(Output), got {:?}", other),
+        };
+
+        assert_eq!(msg.id.as_deref(), Some("msg_abc123"));
+        assert_eq!(msg.status, Some(OutputStatus::Completed));
+    }
+
+    // ── PR2: include filtering + truncation echo-back tests ──
+
+    /// Builds a minimal chat-completion response holding a single assistant
+    /// choice whose content is `text` and whose finish reason is `Stop`.
+    /// Usage, logprobs and the remaining optional fields are left as `None`.
+    fn make_chat_resp_with_text(text: &str) -> NvCreateChatCompletionResponse {
+        use dynamo_async_openai::types::{
+            ChatChoice, ChatCompletionMessageContent, ChatCompletionResponseMessage, FinishReason,
+        };
+
+        // NOTE(review): the allow is presumably needed for the deprecated
+        // `function_call` field — confirm against the type definition.
+        #[allow(deprecated)]
+        let assistant_message = ChatCompletionResponseMessage {
+            content: Some(ChatCompletionMessageContent::Text(text.into())),
+            role: dynamo_async_openai::types::Role::Assistant,
+            tool_calls: None,
+            refusal: None,
+            reasoning_content: None,
+            function_call: None,
+            audio: None,
+        };
+        let only_choice = ChatChoice {
+            index: 0,
+            message: assistant_message,
+            finish_reason: Some(FinishReason::Stop),
+            stop_reason: None,
+            logprobs: None,
+        };
+
+        NvCreateChatCompletionResponse {
+            id: "test".into(),
+            object: "chat.completion".into(),
+            model: "m".into(),
+            created: 0,
+            choices: vec![only_choice],
+            usage: None,
+            service_tier: None,
+            system_fingerprint: None,
+            nvext: None,
+        }
+    }
+
+    /// With default params (no `include` entry requesting logprobs), every
+    /// `output_text` part of the converted response must have `logprobs`
+    /// stripped, i.e. set to `None`.
+    #[test]
+    fn test_include_logprobs_stripped_by_default() {
+        let chat_resp = make_chat_resp_with_text("hello");
+        let params = ResponseParams::default();
+        let resp = chat_completion_to_response(chat_resp, &params).unwrap();
+
+        let mut found_text = false;
+        for item in &resp.inner.output {
+            if let OutputItem::Message(msg) = item {
+                for content in &msg.content {
+                    if let OutputMessageContent::OutputText(t) = content {
+                        found_text = true;
+                        assert!(
+                            t.logprobs.is_none(),
+                            "logprobs should be stripped by default"
+                        );
+                    }
+                }
+            }
+        }
+        // Guard against a vacuous pass: without this, a response that contains
+        // no message / output_text part would never reach the assertion above.
+        assert!(found_text, "Expected text output");
+    }
+
+    /// When `include` lists `MessageOutputTextLogprobs`, every `output_text`
+    /// part of the converted response must keep its `logprobs`, and at least
+    /// one such part must be present.
+    #[test]
+    fn test_include_logprobs_kept_when_requested() {
+        use dynamo_async_openai::types::responses::IncludeEnum;
+
+        let chat_resp = make_chat_resp_with_text("hello");
+        let params = ResponseParams {
+            include: Some(vec![IncludeEnum::MessageOutputTextLogprobs]),
+            ..Default::default()
+        };
+        let response = chat_completion_to_response(chat_resp, &params).unwrap();
+
+        // Flatten output items -> message content parts -> output_text parts.
+        let text_parts = response
+            .inner
+            .output
+            .iter()
+            .filter_map(|item| match item {
+                OutputItem::Message(msg) => Some(msg),
+                _ => None,
+            })
+            .flat_map(|msg| msg.content.iter())
+            .filter_map(|part| match part {
+                OutputMessageContent::OutputText(text) => Some(text),
+                _ => None,
+            });
+
+        let mut text_count = 0usize;
+        for text in text_parts {
+            text_count += 1;
+            assert!(
+                text.logprobs.is_some(),
+                "logprobs should be preserved when included"
+            );
+        }
+        assert!(text_count > 0, "Expected text output");
+    }
+
+    /// A `Truncation::Auto` in the request params must be echoed back
+    /// unchanged on the response.
+    #[test]
+    fn test_truncation_auto_echoed_back() {
+        use dynamo_async_openai::types::responses::Truncation;
+
+        let params = ResponseParams {
+            truncation: Some(Truncation::Auto),
+            ..Default::default()
+        };
+        let chat_resp = make_chat_resp_with_text("hello");
+
+        let response = chat_completion_to_response(chat_resp, &params).unwrap();
+        let echoed = response.inner.truncation;
+        assert_eq!(echoed, Some(Truncation::Auto));
+    }
+
+    /// With default request params, the response must report
+    /// `Truncation::Disabled`.
+    #[test]
+    fn test_truncation_defaults_to_disabled() {
+        let default_params = ResponseParams::default();
+        let chat_resp = make_chat_resp_with_text("hello");
+
+        let response = chat_completion_to_response(chat_resp, &default_params).unwrap();
+        let reported = response.inner.truncation;
+        assert_eq!(reported, Some(Truncation::Disabled));
+    }
 }
--- a/lib/llm/src/protocols/openai/responses/stream_converter.rs
+++ b/lib/llm/src/protocols/openai/responses/stream_converter.rs
@@ -13,14 +13,14 @@ use std::time::{SystemTime, UNIX_EPOCH};

 use axum::response::sse::Event;
 use dynamo_async_openai::types::responses::{
-    AssistantRole, FunctionToolCall, Instructions, OutputContent, OutputItem, OutputMessage,
-    OutputMessageContent, OutputStatus, OutputTextContent, Response, ResponseCompletedEvent,
-    ResponseContentPartAddedEvent, ResponseContentPartDoneEvent, ResponseCreatedEvent,
-    ResponseFailedEvent, ResponseFunctionCallArgumentsDeltaEvent,
+    AssistantRole, FunctionToolCall, InputTokenDetails, Instructions, OutputContent, OutputItem,
+    OutputMessage, OutputMessageContent, OutputStatus, OutputTextContent, OutputTokenDetails,
+    Response, ResponseCompletedEvent, ResponseContentPartAddedEvent, ResponseContentPartDoneEvent,
+    ResponseCreatedEvent, ResponseFailedEvent, ResponseFunctionCallArgumentsDeltaEvent,
    ResponseFunctionCallArgumentsDoneEvent, ResponseInProgressEvent, ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent, ResponseStreamEvent, ResponseTextDeltaEvent,
-    ResponseTextDoneEvent, ResponseTextParam, ServiceTier, Status, TextResponseFormatConfiguration,
-    ToolChoiceOptions, ToolChoiceParam, Truncation,
+    ResponseTextDoneEvent, ResponseTextParam, ResponseUsage, ServiceTier, Status,
+    TextResponseFormatConfiguration, ToolChoiceOptions, ToolChoiceParam, Truncation,
 };
 use uuid::Uuid;

@@ -45,6 +45,8 @@ pub struct ResponseStreamConverter {
    function_call_items: Vec<FunctionCallState>,
    // Output index counter
    next_output_index: u32,
+    // Usage stats from the backend's final chunk
+    usage: Option<ResponseUsage>,
 }

 struct FunctionCallState {
@@ -75,6 +77,7 @@ impl ResponseStreamConverter {
            accumulated_text: String::new(),
            function_call_items: Vec::new(),
            next_output_index: 0,
+            usage: None,
        }
    }

@@ -112,10 +115,10 @@ impl ResponseStreamConverter {
            // store: false because this branch does not persist responses.
            store: self.params.store.or(Some(false)),
            temperature: self.params.temperature.or(Some(1.0)),
-            text: Some(ResponseTextParam {
+            text: Some(self.params.text.clone().unwrap_or(ResponseTextParam {
                format: TextResponseFormatConfiguration::Text,
                verbosity: None,
-            }),
+            })),
            tool_choice: self
                .params
                .tool_choice
@@ -129,7 +132,7 @@ impl ResponseStreamConverter {
                    .unwrap_or_default(),
            ),
            top_p: self.params.top_p.or(Some(1.0)),
-            truncation: Some(Truncation::Disabled),
+            truncation: Some(self.params.truncation.unwrap_or(Truncation::Disabled)),
            // Nullable required fields
            billing: None,
            conversation: None,
@@ -142,11 +145,11 @@ impl ResponseStreamConverter {
            prompt: None,
            prompt_cache_key: None,
            prompt_cache_retention: None,
-            reasoning: None,
+            reasoning: self.params.reasoning.clone(),
            safety_identifier: None,
-            service_tier: Some(ServiceTier::Auto),
+            service_tier: Some(self.params.service_tier.unwrap_or(ServiceTier::Auto)),
            top_logprobs: Some(0),
-            usage: None,
+            usage: self.usage.clone(),
        }
    }

@@ -176,6 +179,29 @@ impl ResponseStreamConverter {
    ) -> Vec<Result<Event, anyhow::Error>> {
        let mut events = Vec::new();

+        // Capture usage stats from the final chunk (sent when stream_options.include_usage=true)
+        if let Some(ref u) = chunk.usage {
+            self.usage = Some(ResponseUsage {
+                input_tokens: u.prompt_tokens,
+                input_tokens_details: InputTokenDetails {
+                    cached_tokens: u
+                        .prompt_tokens_details
+                        .as_ref()
+                        .and_then(|d| d.cached_tokens)
+                        .unwrap_or(0),
+                },
+                output_tokens: u.completion_tokens,
+                output_tokens_details: OutputTokenDetails {
+                    reasoning_tokens: u
+                        .completion_tokens_details
+                        .as_ref()
+                        .and_then(|d| d.reasoning_tokens)
+                        .unwrap_or(0),
+                },
+                total_tokens: u.total_tokens,
+            });
+        }
+
        for choice in &chunk.choices {
            let delta = &choice.delta;

@@ -203,10 +229,10 @@ impl ResponseStreamConverter {
                            sequence_number: self.next_seq(),
                            output_index,
                            item: OutputItem::Message(OutputMessage {
-                                id: self.message_item_id.clone(),
+                                id: Some(self.message_item_id.clone()),
                                content: vec![],
                                role: AssistantRole::Assistant,
-                                status: OutputStatus::InProgress,
+                                status: Some(OutputStatus::InProgress),
                            }),
                        },
                    );
@@ -354,14 +380,14 @@ impl ResponseStreamConverter {
                    sequence_number: self.next_seq(),
                    output_index: self.message_output_index,
                    item: OutputItem::Message(OutputMessage {
-                        id: self.message_item_id.clone(),
+                        id: Some(self.message_item_id.clone()),
                        content: vec![OutputMessageContent::OutputText(OutputTextContent {
                            text: self.accumulated_text.clone(),
                            annotations: vec![],
                            logprobs: Some(vec![]),
                        })],
                        role: AssistantRole::Assistant,
-                        status: OutputStatus::Completed,
+                        status: Some(OutputStatus::Completed),
                    }),
                });
            events.push(make_sse_event(&item_done));
@@ -413,14 +439,14 @@ impl ResponseStreamConverter {
        let mut output = Vec::new();
        if self.message_started {
            output.push(OutputItem::Message(OutputMessage {
-                id: self.message_item_id.clone(),
+                id: Some(self.message_item_id.clone()),
                content: vec![OutputMessageContent::OutputText(OutputTextContent {
                    text: self.accumulated_text.clone(),
                    annotations: vec![],
                    logprobs: Some(vec![]),
                })],
                role: AssistantRole::Assistant,
-                status: OutputStatus::Completed,
+                status: Some(OutputStatus::Completed),
            }));
        }
        for fc in &self.function_call_items {