refactor(1/3): move `nvext` to `dynamo-llm` and move `anthropic` to `dynamo-async-openai` (#7564)

2887cd1c · ishandhanani · GitHub · d6136f4a · 2887cd1c · 2887cd1c
Unverified Commit 2887cd1c authored Mar 30, 2026 by ishandhanani Committed by GitHub Mar 30, 2026
20 changed files
--- a/lib/async-openai/src/types/anthropic.rs
+++ b/lib/async-openai/src/types/anthropic.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Anthropic Messages API types.
+//!
+//! Pure protocol types for the `/v1/messages` endpoint -- request, response,
+//! streaming events, error shapes, and count-tokens types.
+
+use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
+/// Anthropic-style cache control hint for prefix pinning with TTL.
+///
+/// On the wire this is `{"type": ..., "ttl": ...}`; `ttl` is left out of the
+/// serialized form when it is `None`.
+#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
+pub struct CacheControl {
+    /// Kind of cache control, carried in the JSON `type` key.
+    #[serde(rename = "type")]
+    pub control_type: CacheControlType,
+    /// TTL as a string: integer seconds (e.g. "600") or shorthand ("5m" = 300s, "1h" = 3600s).
+    /// The effective value returned by `ttl_seconds` is clamped to [300, 3600].
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub ttl: Option<String>,
+}
+
+/// Kind of cache control carried in the `type` key of a [`CacheControl`].
+///
+/// Variant names are (de)serialized in lowercase. Any unrecognized `type`
+/// string deserializes to `Unknown` instead of failing.
+#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
+#[serde(rename_all = "lowercase")]
+pub enum CacheControlType {
+    /// `"ephemeral"` -- also the `Default` value.
+    #[default]
+    Ephemeral,
+    /// Fallback for every other `type` string.
+    #[serde(other)]
+    Unknown,
+}
+
+/// Lower bound (and fallback) for a cache TTL, in seconds.
+const MIN_TTL_SECONDS: u64 = 300;
+/// Upper bound for a cache TTL, in seconds.
+const MAX_TTL_SECONDS: u64 = 3600;
+
+impl CacheControl {
+    /// Resolve the `ttl` string into a number of seconds within [300, 3600].
+    ///
+    /// * No TTL set: returns the minimum (300s).
+    /// * `"5m"` / `"1h"`: shorthand for 300s / 3600s.
+    /// * Any other string is parsed as integer seconds and clamped into range
+    ///   (so "120" becomes 300 and "7200" becomes 3600).
+    /// * Anything that does not parse logs a warning and yields 300s.
+    pub fn ttl_seconds(&self) -> u64 {
+        let spec = match self.ttl.as_deref() {
+            Some(spec) => spec,
+            None => return MIN_TTL_SECONDS,
+        };
+
+        let requested = match spec {
+            "5m" => 300,
+            "1h" => 3600,
+            numeric => {
+                if let Ok(seconds) = numeric.parse::<u64>() {
+                    seconds
+                } else {
+                    tracing::warn!("Unrecognized TTL '{}', defaulting to 300s", numeric);
+                    return MIN_TTL_SECONDS;
+                }
+            }
+        };
+
+        requested.clamp(MIN_TTL_SECONDS, MAX_TTL_SECONDS)
+    }
+}
+/// Parsed system prompt content, preserving cache_control from block arrays.
+///
+/// This is the normalized form of the request's `system` field, whether the
+/// client sent a plain string or an array of text blocks.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SystemContent {
+    /// The concatenated text from all system blocks (or the plain string).
+    pub text: String,
+    /// Cache control from the last system block that had one.
+    /// Omitted from the serialized form when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub cache_control: Option<CacheControl>,
+}
+
+/// Field deserializer for `system`, which the Anthropic API accepts either as
+/// `"system": "text"` or as
+/// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
+///
+/// A JSON `null` yields `None`. For the array form the block texts are joined
+/// with `"\n"` and the `cache_control` of the last block that carries one is kept.
+fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<SystemContent>, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    /// One element of the array form; only `text` and `cache_control` are read.
+    #[derive(Deserialize)]
+    struct SystemBlock {
+        text: String,
+        #[serde(default)]
+        cache_control: Option<CacheControl>,
+    }
+
+    /// The two wire shapes of `system`.
+    #[derive(Deserialize)]
+    #[serde(untagged)]
+    enum SystemPrompt {
+        Text(String),
+        Blocks(Vec<SystemBlock>),
+    }
+
+    let content = match Option::<SystemPrompt>::deserialize(deserializer)? {
+        None => None,
+        Some(SystemPrompt::Text(text)) => Some(SystemContent {
+            text,
+            cache_control: None,
+        }),
+        Some(SystemPrompt::Blocks(blocks)) => {
+            let mut parts = Vec::with_capacity(blocks.len());
+            let mut cache_control = None;
+            for block in blocks {
+                // Later blocks overwrite earlier ones, so the last hint wins.
+                if block.cache_control.is_some() {
+                    cache_control = block.cache_control;
+                }
+                parts.push(block.text);
+            }
+            Some(SystemContent {
+                text: parts.join("\n"),
+                cache_control,
+            })
+        }
+    };
+    Ok(content)
+}
+/// Top-level request body for `POST /v1/messages`.
+///
+/// `model`, `max_tokens` and `messages` are required. `stream` defaults to
+/// `false`, and every other field is optional and is left out of the
+/// serialized form when it is `None`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicCreateMessageRequest {
+    /// The model to use (e.g. "claude-sonnet-4-20250514").
+    pub model: String,
+
+    /// The maximum number of tokens to generate.
+    pub max_tokens: u32,
+
+    /// The conversation messages.
+    pub messages: Vec<AnthropicMessage>,
+
+    /// Optional system prompt (string or array of `{"type":"text","text":"..."}` blocks).
+    /// Both shapes are normalized into a [`SystemContent`] while deserializing.
+    #[serde(
+        default,
+        skip_serializing_if = "Option::is_none",
+        deserialize_with = "deserialize_system_prompt"
+    )]
+    pub system: Option<SystemContent>,
+
+    /// Sampling temperature (0.0 - 1.0).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub temperature: Option<f32>,
+
+    /// Nucleus sampling parameter.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_p: Option<f32>,
+
+    /// Top-K sampling parameter.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub top_k: Option<u32>,
+
+    /// Custom stop sequences.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop_sequences: Option<Vec<String>>,
+
+    /// Whether to stream the response. Defaults to `false` when absent.
+    #[serde(default)]
+    pub stream: bool,
+
+    /// Optional metadata (e.g. user_id). Kept as raw JSON.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<serde_json::Value>,
+
+    /// Tools the model may call.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<AnthropicTool>>,
+
+    /// How the model should choose which tool to call.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_choice: Option<AnthropicToolChoice>,
+
+    /// Top-level cache control for automatic prompt prefix caching.
+    /// When present, the system caches all content up to the last cacheable block.
+    /// Matches the Anthropic Messages API automatic caching mode.
+    /// See: <https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching>
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_control: Option<CacheControl>,
+
+    /// Extended thinking configuration. When enabled, the model produces
+    /// `thinking` content blocks containing its internal reasoning before
+    /// the final response. The `budget_tokens` field controls how many tokens
+    /// the model may use for thinking (must be >= 1024 and < max_tokens).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub thinking: Option<ThinkingConfig>,
+
+    /// Service tier selection: `"auto"` or `"standard_only"`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub service_tier: Option<String>,
+
+    /// Container identifier for stateful sandbox sessions.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub container: Option<String>,
+
+    /// Output configuration: effort level and optional JSON schema format.
+    /// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
+    /// `format` specifies structured JSON output constraints.
+    /// Stored as raw JSON; this type does not validate it.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub output_config: Option<serde_json::Value>,
+}
+
+/// Extended thinking configuration for the request.
+///
+/// When `type` is `"enabled"`, the model will produce `thinking` content blocks
+/// with its internal reasoning. `budget_tokens` controls the maximum tokens
+/// available for thinking (minimum 1024, must be less than `max_tokens`).
+/// When `type` is `"disabled"`, no thinking blocks are produced.
+///
+/// This type only carries the values; it does not itself enforce the
+/// `budget_tokens` limits described above.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ThinkingConfig {
+    /// Either `"enabled"` or `"disabled"`. Carried in the JSON `type` key and
+    /// stored as a free-form string.
+    #[serde(rename = "type")]
+    pub thinking_type: String,
+    /// Maximum tokens for internal reasoning. Only relevant when type is "enabled".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub budget_tokens: Option<u32>,
+}
+
+/// A single message in the conversation.
+///
+/// Serialized as `{"role": ..., "content": ...}`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicMessage {
+    /// Who sent the message.
+    pub role: AnthropicRole,
+    /// Message body. Flattened so the variant's own `content` field becomes a
+    /// top-level `content` key next to `role`.
+    #[serde(flatten)]
+    pub content: AnthropicMessageContent,
+}
+
+/// The role of a message sender.
+///
+/// (De)serialized in lowercase: `"user"` / `"assistant"`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "lowercase")]
+pub enum AnthropicRole {
+    /// Message authored by the end user.
+    User,
+    /// Message authored by the model.
+    Assistant,
+}
+
+/// Message content -- either a plain string or an array of content blocks.
+///
+/// The enum is untagged, so on deserialization the variants are tried in
+/// declaration order: a string `content` matches `Text`, an array matches `Blocks`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum AnthropicMessageContent {
+    /// Plain text content.
+    Text { content: String },
+    /// Array of structured content blocks.
+    Blocks { content: Vec<AnthropicContentBlock> },
+}
+
+/// A single content block within a message.
+///
+/// Only `Serialize` is derived here; `Deserialize` is implemented by hand so
+/// that block types without a dedicated variant (for example ones introduced
+/// by a newer API version) are captured as `Other(Value)` instead of causing a
+/// hard deserialization failure. This is important because Claude Code may
+/// send block types that we don't yet handle.
+///
+/// When serialized, every named variant is tagged with a `type` key, while
+/// `Other` is written out as its raw JSON value.
+#[derive(Debug, Clone, Serialize)]
+#[serde(tag = "type")]
+pub enum AnthropicContentBlock {
+    /// Text content block. May optionally include `citations` -- references to
+    /// source documents that support the text content. Citations are generated
+    /// by the model when document/PDF content is provided and citation mode is enabled.
+    #[serde(rename = "text")]
+    Text {
+        text: String,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        citations: Option<Vec<serde_json::Value>>,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        cache_control: Option<CacheControl>,
+    },
+    /// Image content block.
+    #[serde(rename = "image")]
+    Image { source: AnthropicImageSource },
+    /// Tool use request from assistant.
+    #[serde(rename = "tool_use")]
+    ToolUse {
+        id: String,
+        name: String,
+        input: serde_json::Value,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        cache_control: Option<CacheControl>,
+    },
+    /// Tool result from user.
+    #[serde(rename = "tool_result")]
+    ToolResult {
+        tool_use_id: String,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        content: Option<ToolResultContent>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        is_error: Option<bool>,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        cache_control: Option<CacheControl>,
+    },
+    /// Thinking content block from assistant (extended thinking / reasoning).
+    #[serde(rename = "thinking")]
+    Thinking {
+        thinking: String,
+        signature: String,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        cache_control: Option<CacheControl>,
+    },
+    /// Redacted thinking block from assistant. Contains encrypted reasoning data
+    /// that is opaque to the client but must be passed back verbatim in multi-turn
+    /// conversations so the model can maintain its chain of thought.
+    #[serde(rename = "redacted_thinking")]
+    RedactedThinking { data: String },
+    /// Server-initiated tool use block. Represents a tool call that the API
+    /// executes server-side (e.g., web search). The client receives the result
+    /// via a corresponding `web_search_tool_result` or similar block.
+    #[serde(rename = "server_tool_use")]
+    ServerToolUse {
+        id: String,
+        name: String,
+        #[serde(default)]
+        input: serde_json::Value,
+    },
+    /// Result from a server-initiated tool (e.g., web search results).
+    /// Contains structured content returned by the server-side tool execution.
+    #[serde(rename = "web_search_tool_result")]
+    WebSearchToolResult {
+        tool_use_id: String,
+        #[serde(default)]
+        content: serde_json::Value,
+    },
+    /// Catch-all for unrecognized block types. Preserves the full JSON value
+    /// so that new Anthropic features don't break the endpoint and can be
+    /// round-tripped or inspected.
+    #[serde(untagged)]
+    Other(serde_json::Value),
+}
+
+/// Content of a `tool_result` block -- either a plain string or an array of
+/// content blocks (the Anthropic API accepts both).
+///
+/// Untagged: a JSON string becomes `Text`, a JSON array becomes `Blocks`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum ToolResultContent {
+    /// Plain string result.
+    Text(String),
+    /// Array of result blocks.
+    Blocks(Vec<ToolResultContentBlock>),
+}
+
+impl ToolResultContent {
+    /// Flatten the tool result into a single string.
+    ///
+    /// The plain-string form is returned unchanged. For the array form, the
+    /// `text` of every text block is appended in order with no separator, and
+    /// non-text blocks are skipped.
+    pub fn into_text(self) -> String {
+        let blocks = match self {
+            ToolResultContent::Text(text) => return text,
+            ToolResultContent::Blocks(blocks) => blocks,
+        };
+
+        let mut joined = String::new();
+        for block in blocks {
+            if let ToolResultContentBlock::Text { text } = block {
+                joined.push_str(&text);
+            }
+        }
+        joined
+    }
+}
+
+/// A content block within a `tool_result.content` array.
+///
+/// Untagged: any JSON object with a string `text` field deserializes as
+/// `Text`; everything else is kept verbatim as `Other`.
+// NOTE(review): because the enum is untagged, a `Text` block serializes as
+// `{"text": ...}` with no `"type"` key -- confirm that consumers accept this.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum ToolResultContentBlock {
+    /// Text block; only the `text` field is retained.
+    Text {
+        text: String,
+    },
+    /// Catch-all for non-text blocks (images, etc.) in tool results.
+    Other(serde_json::Value),
+}
+
+/// Hand-written `Deserialize` for `AnthropicContentBlock` that tolerates
+/// unknown block types. Serde's `#[serde(other)]` is not supported on
+/// internally tagged enums, so the input is first read into a
+/// `serde_json::Value` and then dispatched on its `type` string.
+///
+/// Rules applied per block type:
+/// * required string fields that are missing or not strings produce a
+///   `missing_field` error;
+/// * optional fields (`citations`, `cache_control`, `tool_result.content`) that
+///   are absent or fail to parse become `None`;
+/// * a missing `input` defaults to `{}`, and a missing
+///   `web_search_tool_result.content` defaults to `[]`;
+/// * any other (or absent) `type` keeps the whole value as `Other`.
+impl<'de> Deserialize<'de> for AnthropicContentBlock {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let value = serde_json::Value::deserialize(deserializer)?;
+        // Owned copy so that `value` can still be moved into `Other` below.
+        let block_type = value
+            .get("type")
+            .and_then(serde_json::Value::as_str)
+            .unwrap_or_default()
+            .to_owned();
+
+        // Fetch a mandatory string field, or report it as missing.
+        let required_str = |field: &'static str| -> Result<String, D::Error> {
+            match value.get(field).and_then(serde_json::Value::as_str) {
+                Some(s) => Ok(s.to_owned()),
+                None => Err(serde::de::Error::missing_field(field)),
+            }
+        };
+        // Best-effort parse of the optional `cache_control` object.
+        let cache_control = || -> Option<CacheControl> {
+            let raw = value.get("cache_control")?.clone();
+            serde_json::from_value(raw).ok()
+        };
+        // Tool `input`, defaulting to an empty JSON object.
+        let input_or_empty = || -> serde_json::Value {
+            value
+                .get("input")
+                .cloned()
+                .unwrap_or_else(|| serde_json::json!({}))
+        };
+
+        let block = match block_type.as_str() {
+            "text" => {
+                let text = required_str("text")?;
+                let citations = value
+                    .get("citations")
+                    .cloned()
+                    .and_then(|v| serde_json::from_value(v).ok());
+                AnthropicContentBlock::Text {
+                    text,
+                    citations,
+                    cache_control: cache_control(),
+                }
+            }
+            "image" => {
+                // A missing `source` becomes JSON null, which then fails to parse.
+                let raw_source = value.get("source").cloned().unwrap_or_default();
+                let source: AnthropicImageSource =
+                    serde_json::from_value(raw_source).map_err(serde::de::Error::custom)?;
+                AnthropicContentBlock::Image { source }
+            }
+            "tool_use" => {
+                let id = required_str("id")?;
+                let name = required_str("name")?;
+                AnthropicContentBlock::ToolUse {
+                    id,
+                    name,
+                    input: input_or_empty(),
+                    cache_control: cache_control(),
+                }
+            }
+            "tool_result" => {
+                let tool_use_id = required_str("tool_use_id")?;
+                let content = value
+                    .get("content")
+                    .cloned()
+                    .and_then(|v| serde_json::from_value(v).ok());
+                let is_error = value
+                    .get("is_error")
+                    .and_then(serde_json::Value::as_bool);
+                AnthropicContentBlock::ToolResult {
+                    tool_use_id,
+                    content,
+                    is_error,
+                    cache_control: cache_control(),
+                }
+            }
+            "thinking" => {
+                let thinking = required_str("thinking")?;
+                let signature = required_str("signature")?;
+                AnthropicContentBlock::Thinking {
+                    thinking,
+                    signature,
+                    cache_control: cache_control(),
+                }
+            }
+            "redacted_thinking" => AnthropicContentBlock::RedactedThinking {
+                data: required_str("data")?,
+            },
+            "server_tool_use" => {
+                let id = required_str("id")?;
+                let name = required_str("name")?;
+                AnthropicContentBlock::ServerToolUse {
+                    id,
+                    name,
+                    input: input_or_empty(),
+                }
+            }
+            "web_search_tool_result" => {
+                let tool_use_id = required_str("tool_use_id")?;
+                let content = value
+                    .get("content")
+                    .cloned()
+                    .unwrap_or_else(|| serde_json::json!([]));
+                AnthropicContentBlock::WebSearchToolResult {
+                    tool_use_id,
+                    content,
+                }
+            }
+            other => {
+                tracing::debug!(
+                    "Unrecognized Anthropic content block type '{}', preserving as Other",
+                    other
+                );
+                AnthropicContentBlock::Other(value)
+            }
+        };
+        Ok(block)
+    }
+}
+
+/// Image source for image content blocks.
+///
+/// All three fields are required when deserializing.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicImageSource {
+    /// Source kind, carried in the JSON `type` key (presumably `"base64"` -- TODO confirm).
+    #[serde(rename = "type")]
+    pub source_type: String,
+    /// Media type of the image (presumably a MIME type such as `image/png` -- TODO confirm).
+    pub media_type: String,
+    /// The image payload as a string.
+    pub data: String,
+}
+
+/// A tool definition.
+///
+/// Client tools (custom) require `name` + `input_schema`. Server tools
+/// (web_search, bash, text_editor, code_execution, etc.) are discriminated
+/// by their `type` field (e.g. `"web_search_20260209"`) and may not have
+/// `input_schema`. We keep all fields optional beyond `name` so both
+/// kinds deserialize successfully and pass through to the backend.
+/// Optional fields are left out of the serialized form when `None`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicTool {
+    /// Tool name (required for client tools, present on server tools too).
+    pub name: String,
+    /// Tool type discriminator. Client tools use `"custom"` (or omit).
+    /// Server tools use versioned types like `"web_search_20260209"`.
+    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
+    pub tool_type: Option<String>,
+    /// Optional human-readable description of the tool.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub description: Option<String>,
+    /// JSON Schema for the tool input. Required for client tools, absent on
+    /// server tools (which define their own input shape server-side).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub input_schema: Option<serde_json::Value>,
+    /// Cache control breakpoint on this tool definition.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_control: Option<CacheControl>,
+}
+
+/// Tool choice specification.
+///
+/// Untagged: the variants are tried in declaration order, so an object that
+/// carries a `name` is parsed as `Named` and anything else falls back to `Simple`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum AnthropicToolChoice {
+    /// Named tool: `{type: "tool", name: "..."}`
+    /// Must be listed before Simple so serde tries the stricter shape first.
+    Named(AnthropicToolChoiceNamed),
+    /// Simple mode: "auto", "any", or "none".
+    Simple(AnthropicToolChoiceSimple),
+}
+
+/// Simple tool choice modes.
+///
+/// The shape of a tool choice that carries no tool name: `{"type": "..."}`
+/// plus an optional `disable_parallel_tool_use` flag.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicToolChoiceSimple {
+    /// Selection mode, carried in the JSON `type` key.
+    #[serde(rename = "type")]
+    pub choice_type: AnthropicToolChoiceMode,
+    /// When true, the model will call tools one at a time instead of
+    /// potentially issuing multiple tool calls in a single response.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub disable_parallel_tool_use: Option<bool>,
+}
+
+/// The `type` discriminator of a tool choice.
+///
+/// (De)serialized in lowercase: `"auto"`, `"any"`, `"none"`, `"tool"`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "lowercase")]
+pub enum AnthropicToolChoiceMode {
+    /// `"auto"`.
+    Auto,
+    /// `"any"`.
+    Any,
+    /// `"none"`.
+    None,
+    /// `"tool"` -- the mode used by the named form `{type: "tool", name: "..."}`.
+    Tool,
+}
+
+/// Named tool choice.
+///
+/// The shape `{type: "tool", name: "..."}`. Note that `choice_type` accepts
+/// any [`AnthropicToolChoiceMode`]; this type does not itself require `"tool"`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicToolChoiceNamed {
+    /// Selection mode, carried in the JSON `type` key.
+    #[serde(rename = "type")]
+    pub choice_type: AnthropicToolChoiceMode,
+    /// Name of the tool being selected.
+    pub name: String,
+    /// When true, the model will call tools one at a time instead of
+    /// potentially issuing multiple tool calls in a single response.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub disable_parallel_tool_use: Option<bool>,
+}
+/// Response body for `POST /v1/messages` (non-streaming).
+///
+/// Also embedded in the `message_start` streaming event.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicMessageResponse {
+    /// Identifier of the message.
+    pub id: String,
+    /// Object type, carried in the JSON `type` key.
+    #[serde(rename = "type")]
+    pub object_type: String,
+    /// Role of the author, stored as a free-form string.
+    pub role: String,
+    /// Generated content blocks, in order.
+    pub content: Vec<AnthropicResponseContentBlock>,
+    /// Model that produced the response.
+    pub model: String,
+    /// Why generation stopped; serialized as `null` when `None`.
+    pub stop_reason: Option<AnthropicStopReason>,
+    /// The stop sequence that was hit, if any; serialized as `null` when `None`.
+    pub stop_sequence: Option<String>,
+    /// Token accounting for the request.
+    pub usage: AnthropicUsage,
+}
+
+/// A content block in the response.
+///
+/// The Anthropic API returns up to 12 different block types. We model the
+/// common ones explicitly and catch the rest as `Other` so the proxy can
+/// forward them without losing data.
+///
+/// Named variants are internally tagged by the `type` key; `Other` is untagged
+/// and holds the raw JSON value.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type")]
+pub enum AnthropicResponseContentBlock {
+    /// Reasoning text together with its signature.
+    #[serde(rename = "thinking")]
+    Thinking { thinking: String, signature: String },
+    /// Text output, with optional citations kept as raw JSON.
+    #[serde(rename = "text")]
+    Text {
+        text: String,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        citations: Option<Vec<serde_json::Value>>,
+    },
+    /// Tool call requested by the model.
+    #[serde(rename = "tool_use")]
+    ToolUse {
+        id: String,
+        name: String,
+        input: serde_json::Value,
+    },
+    /// Opaque redacted reasoning payload.
+    #[serde(rename = "redacted_thinking")]
+    RedactedThinking { data: String },
+    /// Server-side tool call; `input` defaults to JSON null when absent.
+    #[serde(rename = "server_tool_use")]
+    ServerToolUse {
+        id: String,
+        name: String,
+        #[serde(default)]
+        input: serde_json::Value,
+    },
+    /// Result of a server-side web search; `content` defaults to JSON null when absent.
+    #[serde(rename = "web_search_tool_result")]
+    WebSearchToolResult {
+        tool_use_id: String,
+        #[serde(default)]
+        content: serde_json::Value,
+    },
+    /// Catch-all for new/uncommon block types (web_fetch_tool_result,
+    /// code_execution_tool_result, container_upload, etc.) so the proxy
+    /// can serialize them back without data loss.
+    #[serde(untagged)]
+    Other(serde_json::Value),
+}
+
+/// Token usage information.
+///
+/// Both counters default to `0` when missing from the input. The streaming
+/// `message_delta` event may carry a usage object with only `output_tokens`,
+/// and requiring both fields would reject such an event. Serialization is
+/// unaffected: both counters are always written.
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct AnthropicUsage {
+    /// Number of input (prompt) tokens.
+    #[serde(default)]
+    pub input_tokens: u32,
+    /// Number of generated output tokens.
+    #[serde(default)]
+    pub output_tokens: u32,
+    /// Number of input tokens used to create a new cache entry.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_creation_input_tokens: Option<u32>,
+    /// Number of input tokens read from the prompt cache (prefix cache hits).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_read_input_tokens: Option<u32>,
+}
+
+/// Reason the model stopped generating.
+///
+/// (De)serialized in snake_case, e.g. `"end_turn"`, `"max_tokens"`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum AnthropicStopReason {
+    /// `"end_turn"`.
+    EndTurn,
+    /// `"max_tokens"`.
+    MaxTokens,
+    /// `"stop_sequence"`.
+    StopSequence,
+    /// `"tool_use"`.
+    ToolUse,
+    /// The model paused to yield control in an agentic loop, intending to
+    /// continue in a subsequent turn. Used with extended thinking / tool use.
+    PauseTurn,
+    /// The model refused to generate content (safety refusal).
+    Refusal,
+}
+/// SSE event types for the Anthropic streaming API.
+///
+/// Internally tagged: each event is a JSON object whose `type` key selects the
+/// variant and whose remaining keys are the variant's fields.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type")]
+pub enum AnthropicStreamEvent {
+    /// `message_start`: carries a message object.
+    #[serde(rename = "message_start")]
+    MessageStart { message: AnthropicMessageResponse },
+
+    /// `content_block_start`: carries the content block at position `index`.
+    #[serde(rename = "content_block_start")]
+    ContentBlockStart {
+        index: u32,
+        content_block: AnthropicResponseContentBlock,
+    },
+
+    /// `content_block_delta`: incremental update for the block at `index`.
+    #[serde(rename = "content_block_delta")]
+    ContentBlockDelta { index: u32, delta: AnthropicDelta },
+
+    /// `content_block_stop`: carries only the block `index`.
+    #[serde(rename = "content_block_stop")]
+    ContentBlockStop { index: u32 },
+
+    /// `message_delta`: top-level message changes (stop reason) plus usage.
+    #[serde(rename = "message_delta")]
+    MessageDelta {
+        delta: AnthropicMessageDeltaBody,
+        usage: AnthropicUsage,
+    },
+
+    /// `message_stop`: no payload beyond the `type` tag.
+    #[serde(rename = "message_stop")]
+    MessageStop {},
+
+    /// `ping`: no payload beyond the `type` tag.
+    #[serde(rename = "ping")]
+    Ping {},
+
+    /// `error`: carries an error body.
+    #[serde(rename = "error")]
+    Error { error: AnthropicErrorBody },
+}
+
+/// Delta content in a streaming content_block_delta event.
+///
+/// Internally tagged by the `type` key.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type")]
+pub enum AnthropicDelta {
+    /// Incremental reasoning text for a thinking block.
+    #[serde(rename = "thinking_delta")]
+    ThinkingDelta { thinking: String },
+    /// Incremental text for a text block.
+    #[serde(rename = "text_delta")]
+    TextDelta { text: String },
+    /// Fragment of JSON carried as a string in `partial_json`.
+    #[serde(rename = "input_json_delta")]
+    InputJsonDelta { partial_json: String },
+    /// Incremental signature for a thinking block (sent at the end).
+    #[serde(rename = "signature_delta")]
+    SignatureDelta { signature: String },
+    /// Incremental citation attached to a text block.
+    #[serde(rename = "citations_delta")]
+    CitationsDelta { citation: serde_json::Value },
+}
+
+/// The delta body in a message_delta event.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicMessageDeltaBody {
+    /// Why generation stopped; serialized as `null` when `None`.
+    pub stop_reason: Option<AnthropicStopReason>,
+    /// The stop sequence that was hit, if any; omitted when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stop_sequence: Option<String>,
+}
+/// Anthropic API error response wrapper.
+///
+/// Serialized as `{"type": ..., "error": {...}}`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicErrorResponse {
+    /// Object type, carried in the JSON `type` key; the constructors on this
+    /// type set it to `"error"`.
+    #[serde(rename = "type")]
+    pub object_type: String,
+    /// The error details.
+    pub error: AnthropicErrorBody,
+}
+
+/// Error body within an error response.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AnthropicErrorBody {
+    /// Error category (e.g. `"invalid_request_error"`), carried in the JSON `type` key.
+    #[serde(rename = "type")]
+    pub error_type: String,
+    /// Human-readable description of the error.
+    pub message: String,
+}
+
+impl AnthropicErrorResponse {
+    /// Shared constructor: wraps `message` in an `"error"` envelope whose body
+    /// carries the given `error_type`.
+    fn with_error_type(error_type: &str, message: impl Into<String>) -> Self {
+        let error = AnthropicErrorBody {
+            error_type: error_type.to_string(),
+            message: message.into(),
+        };
+        Self {
+            object_type: "error".to_string(),
+            error,
+        }
+    }
+
+    /// Build an error response of type `invalid_request_error`.
+    pub fn invalid_request(message: impl Into<String>) -> Self {
+        Self::with_error_type("invalid_request_error", message)
+    }
+
+    /// Build an error response of type `api_error` (internal server error).
+    pub fn api_error(message: impl Into<String>) -> Self {
+        Self::with_error_type("api_error", message)
+    }
+
+    /// Build an error response of type `not_found_error`.
+    pub fn not_found(message: impl Into<String>) -> Self {
+        Self::with_error_type("not_found_error", message)
+    }
+}
+/// Request body for `POST /v1/messages/count_tokens`.
+///
+/// Deserialize-only: this type does not derive `Serialize`.
+#[derive(Debug, Clone, Deserialize)]
+pub struct AnthropicCountTokensRequest {
+    /// The model whose tokenization is being asked about.
+    pub model: String,
+    /// The conversation messages to count.
+    pub messages: Vec<AnthropicMessage>,
+    /// Optional system prompt (plain string or array of text blocks).
+    // `skip_serializing_if` has no effect here because only `Deserialize` is derived.
+    #[serde(
+        default,
+        skip_serializing_if = "Option::is_none",
+        deserialize_with = "deserialize_system_prompt"
+    )]
+    pub system: Option<SystemContent>,
+    /// Optional tool definitions to include in the count.
+    #[serde(default)]
+    pub tools: Option<Vec<AnthropicTool>>,
+}
+
+/// Response body for `POST /v1/messages/count_tokens`.
+///
+/// Serialize-only: `{"input_tokens": N}`.
+#[derive(Debug, Clone, Serialize)]
+pub struct AnthropicCountTokensResponse {
+    /// Number of input tokens counted for the request.
+    pub input_tokens: u32,
+}
+
+impl AnthropicCountTokensRequest {
+    /// Estimate input token count using a `len/3` heuristic.
+    ///
+    /// The byte lengths of the system prompt, every message (role name plus
+    /// content) and every tool (name, description, serialized schema) are
+    /// summed and divided by three. Any non-empty request counts as at least
+    /// one token.
+    ///
+    /// The result saturates at `u32::MAX` rather than wrapping if the estimate
+    /// does not fit in a `u32`.
+    pub fn estimate_tokens(&self) -> u32 {
+        let system_len = self.system.as_ref().map_or(0, |system| system.text.len());
+
+        let messages_len: usize = self
+            .messages
+            .iter()
+            .map(|msg| {
+                // Length of the role name as it appears on the wire.
+                let role_len: usize = match msg.role {
+                    AnthropicRole::User => 4,
+                    AnthropicRole::Assistant => 9,
+                };
+                let content_len: usize = match &msg.content {
+                    AnthropicMessageContent::Text { content } => content.len(),
+                    AnthropicMessageContent::Blocks { content } => {
+                        content.iter().map(estimate_block_len).sum()
+                    }
+                };
+                role_len + content_len
+            })
+            .sum();
+
+        let tools_len: usize = self
+            .tools
+            .iter()
+            .flatten()
+            .map(|tool| {
+                let description_len = tool.description.as_ref().map_or(0, |desc| desc.len());
+                let schema_len = tool
+                    .input_schema
+                    .as_ref()
+                    .map_or(0, |schema| schema.to_string().len());
+                tool.name.len() + description_len + schema_len
+            })
+            .sum();
+
+        let total_len = system_len + messages_len + tools_len;
+        match total_len / 3 {
+            // One or two bytes of input still count as a token.
+            0 if total_len > 0 => 1,
+            // Saturate instead of silently truncating with `as u32`.
+            tokens => u32::try_from(tokens).unwrap_or(u32::MAX),
+        }
+    }
+}
+
+/// Approximate the size in bytes of one content block, for token estimation.
+///
+/// Text-like blocks count the length of their text; JSON payloads count the
+/// length of their compact serialization; images use a flat constant; a
+/// `tool_result` without content counts as zero.
+fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
+    /// Rough estimate for image metadata.
+    const IMAGE_LEN: usize = 256;
+
+    /// Length of a JSON value once rendered to a string.
+    fn json_len(value: &serde_json::Value) -> usize {
+        value.to_string().len()
+    }
+
+    /// Size of a `tool_result` payload in either of its two shapes.
+    fn tool_result_len(content: &ToolResultContent) -> usize {
+        match content {
+            ToolResultContent::Text(s) => s.len(),
+            ToolResultContent::Blocks(blocks) => {
+                let mut total = 0;
+                for entry in blocks {
+                    total += match entry {
+                        ToolResultContentBlock::Text { text } => text.len(),
+                        ToolResultContentBlock::Other(v) => json_len(v),
+                    };
+                }
+                total
+            }
+        }
+    }
+
+    match block {
+        AnthropicContentBlock::Text { text, .. } => text.len(),
+        AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
+        AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),
+        AnthropicContentBlock::ToolUse { name, input, .. }
+        | AnthropicContentBlock::ServerToolUse { name, input, .. } => {
+            name.len() + json_len(input)
+        }
+        AnthropicContentBlock::ToolResult { content, .. } => {
+            content.as_ref().map_or(0, tool_result_len)
+        }
+        AnthropicContentBlock::WebSearchToolResult { content, .. } => json_len(content),
+        AnthropicContentBlock::Image { .. } => IMAGE_LEN,
+        AnthropicContentBlock::Other(v) => json_len(v),
+    }
+}
--- a/lib/async-openai/src/types/chat.rs
+++ b/lib/async-openai/src/types/chat.rs
@@ -1182,10 +1182,6 @@ pub struct CreateChatCompletionResponse {
    /// The object type, which is always `chat.completion`.
    pub object: String,
    pub usage: Option<CompletionUsage>,
-
-    /// NVIDIA extension field for response metadata (worker IDs, etc.)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub nvext: Option<serde_json::Value>,
 }

 /// Parsed server side events stream until an \[DONE\] is received from server.
@@ -1281,10 +1277,6 @@ pub struct CreateChatCompletionStreamResponse {
    /// An optional field that will only be present when you set `stream_options: {"include_usage": true}` in your request.
    /// When present, it contains a null value except for the last chunk which contains the token usage statistics for the entire request.
    pub usage: Option<CompletionUsage>,
-
-    /// NVIDIA extension field for response metadata
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub nvext: Option<serde_json::Value>,
 }

 #[cfg(test)]

--- a/lib/async-openai/src/types/completion.rs
+++ b/lib/async-openai/src/types/completion.rs
@@ -224,10 +224,6 @@ pub struct CreateCompletionResponse {
    /// The object type, which is always "text_completion"
    pub object: String,
    pub usage: Option<CompletionUsage>,
-
-    /// NVIDIA extension field for response metadata (worker IDs, etc.)
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub nvext: Option<serde_json::Value>,
 }

 /// Parsed server side events stream until an \[DONE\] is received from server.

--- a/lib/async-openai/src/types/mod.rs
+++ b/lib/async-openai/src/types/mod.rs
@@ -10,6 +10,7 @@

 //! Types used in OpenAI API requests and responses.
 //! These types are created from component schemas in the [OpenAPI spec](https://github.com/openai/openai-openapi)
+pub mod anthropic;
 mod assistant;
 mod assistant_impls;
 mod assistant_stream;

--- a/lib/llm/src/audit/stream.rs
+++ b/lib/llm/src/audit/stream.rs
@@ -90,14 +90,16 @@ where
                tracing::warn!("audit: aggregation future canceled/failed");
                // Return minimal response if aggregation failed
                NvCreateChatCompletionResponse {
-                    id: String::new(),
-                    created: 0,
-                    usage: None,
-                    model: String::new(),
-                    object: "chat.completion".to_string(),
-                    system_fingerprint: None,
-                    choices: vec![],
-                    service_tier: None,
+                    inner: dynamo_async_openai::types::CreateChatCompletionResponse {
+                        id: String::new(),
+                        created: 0,
+                        usage: None,
+                        model: String::new(),
+                        object: "chat.completion".to_string(),
+                        system_fingerprint: None,
+                        choices: vec![],
+                        service_tier: None,
+                    },
                    nvext: None,
                }
            })
@@ -125,14 +127,16 @@ where
            Err(e) => {
                tracing::warn!("fold aggregation failed: {e}");
                let fallback = NvCreateChatCompletionResponse {
-                    id: String::new(),
-                    created: 0,
-                    usage: None,
-                    model: String::new(),
-                    object: "chat.completion".to_string(),
-                    system_fingerprint: None,
-                    choices: vec![],
-                    service_tier: None,
+                    inner: dynamo_async_openai::types::CreateChatCompletionResponse {
+                        id: String::new(),
+                        created: 0,
+                        usage: None,
+                        model: String::new(),
+                        object: "chat.completion".to_string(),
+                        system_fingerprint: None,
+                        choices: vec![],
+                        service_tier: None,
+                    },
                    nvext: None,
                };
                let _ = tx.send(fallback.clone());
@@ -145,14 +149,16 @@ where
        rx.await.unwrap_or_else(|_| {
            tracing::warn!("fold aggregation future canceled");
            NvCreateChatCompletionResponse {
-                id: String::new(),
-                created: 0,
-                usage: None,
-                model: String::new(),
-                object: "chat.completion".to_string(),
-                system_fingerprint: None,
-                choices: vec![],
-                service_tier: None,
+                inner: dynamo_async_openai::types::CreateChatCompletionResponse {
+                    id: String::new(),
+                    created: 0,
+                    usage: None,
+                    model: String::new(),
+                    object: "chat.completion".to_string(),
+                    system_fingerprint: None,
+                    choices: vec![],
+                    service_tier: None,
+                },
                nvext: None,
            }
        })
@@ -171,8 +177,8 @@ pub fn final_response_to_one_chunk_stream(
 ) -> std::pin::Pin<
    Box<dyn futures::Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>> + Send>,
 > {
-    let mut choices: Vec<ChatChoiceStream> = Vec::with_capacity(resp.choices.len());
-    for (idx, ch) in resp.choices.iter().enumerate() {
+    let mut choices: Vec<ChatChoiceStream> = Vec::with_capacity(resp.inner.choices.len());
+    for (idx, ch) in resp.inner.choices.iter().enumerate() {
        // Convert FunctionCall to FunctionCallStream if present
        #[allow(deprecated)]
        let function_call = ch.message.function_call.as_ref().map(|fc| {
@@ -222,14 +228,16 @@ pub fn final_response_to_one_chunk_stream(
    }

    let chunk = NvCreateChatCompletionStreamResponse {
-        id: resp.id.clone(),
-        object: "chat.completion.chunk".to_string(),
-        created: resp.created,
-        model: resp.model.clone(),
-        system_fingerprint: resp.system_fingerprint.clone(),
-        service_tier: resp.service_tier.clone(),
-        choices,
-        usage: resp.usage.clone(),
+        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+            id: resp.inner.id.clone(),
+            object: "chat.completion.chunk".to_string(),
+            created: resp.inner.created,
+            model: resp.inner.model.clone(),
+            system_fingerprint: resp.inner.system_fingerprint.clone(),
+            service_tier: resp.inner.service_tier.clone(),
+            choices,
+            usage: resp.inner.usage.clone(),
+        },
        nvext: resp.nvext.clone(),
    };

@@ -275,14 +283,16 @@ mod tests {
        };

        let response = NvCreateChatCompletionStreamResponse {
-            id: "test-id".to_string(),
-            choices: vec![choice],
-            created: 1234567890,
-            model: "test-model".to_string(),
-            system_fingerprint: Some("test-fingerprint".to_string()),
-            object: "chat.completion.chunk".to_string(),
-            usage: None,
-            service_tier: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test-id".to_string(),
+                choices: vec![choice],
+                created: 1234567890,
+                model: "test-model".to_string(),
+                system_fingerprint: Some("test-fingerprint".to_string()),
+                object: "chat.completion.chunk".to_string(),
+                usage: None,
+                service_tier: None,
+            },
            nvext: None,
        };

@@ -314,14 +324,16 @@ mod tests {
        };

        let response = NvCreateChatCompletionStreamResponse {
-            id: "test-id".to_string(),
-            choices: vec![choice],
-            created: 1234567890,
-            model: "test-model".to_string(),
-            system_fingerprint: Some("test-fingerprint".to_string()),
-            object: "chat.completion.chunk".to_string(),
-            usage: None,
-            service_tier: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test-id".to_string(),
+                choices: vec![choice],
+                created: 1234567890,
+                model: "test-model".to_string(),
+                system_fingerprint: Some("test-fingerprint".to_string()),
+                object: "chat.completion.chunk".to_string(),
+                usage: None,
+                service_tier: None,
+            },
            nvext: None,
        };

@@ -339,7 +351,7 @@ mod tests {
        chunk
            .data
            .as_ref()
-            .and_then(|d| d.choices.first())
+            .and_then(|d| d.inner.choices.first())
            .and_then(|c| c.delta.content.as_ref())
            .and_then(|content| match content {
                ChatCompletionMessageContent::Text(text) => Some(text.clone()),
@@ -396,7 +408,7 @@ mod tests {
        assert_eq!(results.len(), 0, "Empty stream should produce no chunks");

        // Verify fallback response (aggregation will fail on empty stream)
-        assert_eq!(final_resp.object, "chat.completion");
+        assert_eq!(final_resp.inner.object, "chat.completion");
        // Should get fallback response, not panic
    }

@@ -415,7 +427,7 @@ mod tests {
        assert_eq!(extract_content(&results[0]), "Single chunk");

        // Verify aggregation
-        assert_eq!(final_resp.object, "chat.completion");
+        assert_eq!(final_resp.inner.object, "chat.completion");
    }

    #[tokio::test]
@@ -423,32 +435,34 @@ mod tests {
        // Test that metadata (id, event, comment) is preserved through passthrough
        let chunk_with_metadata = Annotated {
            data: Some(NvCreateChatCompletionStreamResponse {
-                id: "test-id".to_string(),
-                choices: vec![{
-                    #[allow(deprecated)]
-                    ChatChoiceStream {
-                        index: 0,
-                        delta: ChatCompletionStreamResponseDelta {
-                            role: Some(Role::Assistant),
-                            content: Some(ChatCompletionMessageContent::Text(
-                                "Content".to_string(),
-                            )),
-                            tool_calls: None,
-                            function_call: None,
-                            refusal: None,
-                            reasoning_content: None,
-                        },
-                        finish_reason: None,
-                        stop_reason: None,
-                        logprobs: None,
-                    }
-                }],
-                created: 1234567890,
-                model: "test-model".to_string(),
-                system_fingerprint: None,
-                object: "chat.completion.chunk".to_string(),
-                usage: None,
-                service_tier: None,
+                inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                    id: "test-id".to_string(),
+                    choices: vec![{
+                        #[allow(deprecated)]
+                        ChatChoiceStream {
+                            index: 0,
+                            delta: ChatCompletionStreamResponseDelta {
+                                role: Some(Role::Assistant),
+                                content: Some(ChatCompletionMessageContent::Text(
+                                    "Content".to_string(),
+                                )),
+                                tool_calls: None,
+                                function_call: None,
+                                refusal: None,
+                                reasoning_content: None,
+                            },
+                            finish_reason: None,
+                            stop_reason: None,
+                            logprobs: None,
+                        }
+                    }],
+                    created: 1234567890,
+                    model: "test-model".to_string(),
+                    system_fingerprint: None,
+                    object: "chat.completion.chunk".to_string(),
+                    usage: None,
+                    service_tier: None,
+                },
                nvext: None,
            }),
            id: Some("correlation-123".to_string()),
@@ -481,7 +495,7 @@ mod tests {
        let (resp1, resp2) = tokio::join!(future1, future2);

        // Both should complete successfully
-        assert_eq!(resp1.object, "chat.completion");
-        assert_eq!(resp2.object, "chat.completion");
+        assert_eq!(resp1.inner.object, "chat.completion");
+        assert_eq!(resp2.inner.object, "chat.completion");
    }
 }
--- a/lib/llm/src/entrypoint/input/batch.rs
+++ b/lib/llm/src/entrypoint/input/batch.rs
@@ -238,8 +238,9 @@ async fn evaluate(
        match (item.data.as_ref(), item.event.as_deref()) {
            (Some(data), _) => {
                // Normal case
-                let choice = data.choices.first();
-                let chat_comp = choice.as_ref().unwrap();
+                let Some(chat_comp) = data.inner.choices.first() else {
+                    continue;
+                };
                if let Some(c) = &chat_comp.delta.content {
                    match c {
                        ChatCompletionMessageContent::Text(text) => {

--- a/lib/llm/src/entrypoint/input/text.rs
+++ b/lib/llm/src/entrypoint/input/text.rs
@@ -138,8 +138,9 @@ async fn main_loop(
            match (item.data.as_ref(), item.event.as_deref()) {
                (Some(data), _) => {
                    // Normal case
-                    let entry = data.choices.first();
-                    let chat_comp = entry.as_ref().unwrap();
+                    let Some(chat_comp) = data.inner.choices.first() else {
+                        continue;
+                    };
                    if let Some(c) = &chat_comp.delta.content {
                        match c {
                            ChatCompletionMessageContent::Text(text) => {

--- a/lib/llm/src/http/service/openai.rs
+++ b/lib/llm/src/http/service/openai.rs
@@ -991,7 +991,7 @@ fn streaming_tool_dispatch_events(
    };

    let mut events = vec![];
-    for choice in &data.choices {
+    for choice in &data.inner.choices {
        let Some(tool_calls) = &choice.delta.tool_calls else {
            continue;
        };
@@ -1034,7 +1034,7 @@ fn accumulate_reasoning_dispatch(
    };

    let mut events = vec![];
-    for choice in &data.choices {
+    for choice in &data.inner.choices {
        let buffer = buffers.entry(choice.index).or_default();
        let has_reasoning = choice
            .delta
@@ -2892,15 +2892,17 @@ mod tests {

        // Create a normal data event
        let normal_event = Annotated::<NvCreateChatCompletionStreamResponse> {
-            data: Some(CreateChatCompletionStreamResponse {
-                id: "test-id".to_string(),
-                choices: vec![],
-                created: 0,
-                model: "test-model".to_string(),
-                system_fingerprint: None,
-                object: "chat.completion.chunk".to_string(),
-                service_tier: None,
-                usage: None,
+            data: Some(NvCreateChatCompletionStreamResponse {
+                inner: CreateChatCompletionStreamResponse {
+                    id: "test-id".to_string(),
+                    choices: vec![],
+                    created: 0,
+                    model: "test-model".to_string(),
+                    system_fingerprint: None,
+                    object: "chat.completion.chunk".to_string(),
+                    service_tier: None,
+                    usage: None,
+                },
                nvext: None,
            }),
            id: Some("msg-1".to_string()),
@@ -3162,15 +3164,17 @@ mod tests {
    fn make_stream_response(
        choices: Vec<ChatChoiceStream>,
    ) -> Annotated<NvCreateChatCompletionStreamResponse> {
-        let response = CreateChatCompletionStreamResponse {
-            id: "test-id".to_string(),
-            choices,
-            created: 0,
-            model: "test-model".to_string(),
-            system_fingerprint: None,
-            object: "chat.completion.chunk".to_string(),
-            usage: None,
-            service_tier: None,
+        let response = NvCreateChatCompletionStreamResponse {
+            inner: CreateChatCompletionStreamResponse {
+                id: "test-id".to_string(),
+                choices,
+                created: 0,
+                model: "test-model".to_string(),
+                system_fingerprint: None,
+                object: "chat.completion.chunk".to_string(),
+                usage: None,
+                service_tier: None,
+            },
            nvext: None,
        };
        Annotated {

--- a/lib/llm/src/perf/logprobs.rs
+++ b/lib/llm/src/perf/logprobs.rs
@@ -128,7 +128,7 @@ impl LogprobExtractor for NvCreateChatCompletionStreamResponse {
    fn extract_logprobs_by_choice(&self) -> HashMap<u32, Vec<TokenLogProbs>> {
        let mut result = HashMap::new();

-        for choice in &self.choices {
+        for choice in &self.inner.choices {
            let choice_index = choice.index;

            let choice_logprobs = choice
@@ -949,34 +949,36 @@ mod tests {
    ) -> NvCreateChatCompletionStreamResponse {
        #[expect(deprecated)]
        NvCreateChatCompletionStreamResponse {
-            id: "test_id".to_string(),
-            choices: vec![ChatChoiceStream {
-                index: 0,
-                delta: ChatCompletionStreamResponseDelta {
-                    content: Some(
-                        dynamo_async_openai::types::ChatCompletionMessageContent::Text(
-                            "test".to_string(),
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test_id".to_string(),
+                choices: vec![ChatChoiceStream {
+                    index: 0,
+                    delta: ChatCompletionStreamResponseDelta {
+                        content: Some(
+                            dynamo_async_openai::types::ChatCompletionMessageContent::Text(
+                                "test".to_string(),
+                            ),
                        ),
-                    ),
-                    function_call: None,
-                    tool_calls: None,
-                    role: Some(Role::Assistant),
-                    refusal: None,
-                    reasoning_content: None,
-                },
-                finish_reason: Some(FinishReason::Stop),
-                stop_reason: None,
-                logprobs: Some(ChatChoiceLogprobs {
-                    content: Some(token_logprobs),
-                    refusal: None,
-                }),
-            }],
-            created: 1234567890,
-            model: "test-model".to_string(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion.chunk".to_string(),
-            usage: None,
+                        function_call: None,
+                        tool_calls: None,
+                        role: Some(Role::Assistant),
+                        refusal: None,
+                        reasoning_content: None,
+                    },
+                    finish_reason: Some(FinishReason::Stop),
+                    stop_reason: None,
+                    logprobs: Some(ChatChoiceLogprobs {
+                        content: Some(token_logprobs),
+                        refusal: None,
+                    }),
+                }],
+                created: 1234567890,
+                model: "test-model".to_string(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion.chunk".to_string(),
+                usage: None,
+            },
            nvext: None,
        }
    }
@@ -1012,14 +1014,16 @@ mod tests {
            .collect();

        NvCreateChatCompletionStreamResponse {
-            id: "test_id".to_string(),
-            choices,
-            created: 1234567890,
-            model: "test-model".to_string(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion.chunk".to_string(),
-            usage: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test_id".to_string(),
+                choices,
+                created: 1234567890,
+                model: "test-model".to_string(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion.chunk".to_string(),
+                usage: None,
+            },
            nvext: None,
        }
    }
@@ -1341,31 +1345,33 @@ mod tests {
        // Test with choice that has no logprobs
        #[expect(deprecated)]
        let response = NvCreateChatCompletionStreamResponse {
-            id: "test_id".to_string(),
-            choices: vec![ChatChoiceStream {
-                index: 0,
-                delta: ChatCompletionStreamResponseDelta {
-                    content: Some(
-                        dynamo_async_openai::types::ChatCompletionMessageContent::Text(
-                            "test".to_string(),
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test_id".to_string(),
+                choices: vec![ChatChoiceStream {
+                    index: 0,
+                    delta: ChatCompletionStreamResponseDelta {
+                        content: Some(
+                            dynamo_async_openai::types::ChatCompletionMessageContent::Text(
+                                "test".to_string(),
+                            ),
                        ),
-                    ),
-                    function_call: None,
-                    tool_calls: None,
-                    role: Some(Role::Assistant),
-                    refusal: None,
-                    reasoning_content: None,
-                },
-                finish_reason: Some(FinishReason::Stop),
-                stop_reason: None,
-                logprobs: None, // No logprobs
-            }],
-            created: 1234567890,
-            model: "test-model".to_string(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion.chunk".to_string(),
-            usage: None,
+                        function_call: None,
+                        tool_calls: None,
+                        role: Some(Role::Assistant),
+                        refusal: None,
+                        reasoning_content: None,
+                    },
+                    finish_reason: Some(FinishReason::Stop),
+                    stop_reason: None,
+                    logprobs: None, // No logprobs
+                }],
+                created: 1234567890,
+                model: "test-model".to_string(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion.chunk".to_string(),
+                usage: None,
+            },
            nvext: None,
        };

@@ -1573,14 +1579,16 @@ mod tests {
        // In practice, this would have real logprobs data

        NvCreateChatCompletionStreamResponse {
-            id: "test_id".to_string(),
-            choices: vec![],
-            created: 1234567890,
-            model: "test-model".to_string(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion.chunk".to_string(),
-            usage: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test_id".to_string(),
+                choices: vec![],
+                created: 1234567890,
+                model: "test-model".to_string(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion.chunk".to_string(),
+                usage: None,
+            },
            nvext: None,
        }
    }

--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -1217,7 +1217,7 @@ impl OpenAIPreprocessor {
                let processed_response = if let Some(ref mut parser) = state.reasoning_parser {
                    response.map_data(|mut data| {
                        // Process all choices, not just the first one
-                        for choice in data.choices.iter_mut() {
+                        for choice in data.inner.choices.iter_mut() {
                            // Reasoning parsing only applies to text content
                            if let Some(
                                dynamo_async_openai::types::ChatCompletionMessageContent::Text(

--- a/lib/llm/src/preprocessor/speculative_prefill.rs
+++ b/lib/llm/src/preprocessor/speculative_prefill.rs
@@ -111,7 +111,7 @@ pub fn maybe_wrap_stream(
    let mut prefill_tx = Some(tx);
    Box::pin(stream.map(move |item| {
        if let Some(ref resp) = item.data {
-            for choice in &resp.choices {
+            for choice in &resp.inner.choices {
                if let Some(ChatCompletionMessageContent::Text(ref text)) = choice.delta.content {
                    accumulated_text.push_str(text);
                }

--- a/lib/llm/src/protocols/anthropic/stream_converter.rs
+++ b/lib/llm/src/protocols/anthropic/stream_converter.rs
@@ -106,7 +106,7 @@ impl AnthropicStreamConverter {
        let mut events = Vec::new();

        // Capture real token usage from engine when available (typically on the final chunk).
-        if let Some(usage) = &chunk.usage {
+        if let Some(usage) = &chunk.inner.usage {
            self.input_token_count = usage.prompt_tokens;
            self.output_token_count = usage.completion_tokens;
            self.cached_token_count = usage
@@ -115,7 +115,7 @@ impl AnthropicStreamConverter {
                .and_then(|d| d.cached_tokens);
        }

-        for choice in &chunk.choices {
+        for choice in &chunk.inner.choices {
            let delta = &choice.delta;

            // Track finish reason
@@ -444,7 +444,7 @@ impl AnthropicStreamConverter {
    ) -> Vec<TaggedEvent> {
        let mut events = Vec::new();

-        if let Some(usage) = &chunk.usage {
+        if let Some(usage) = &chunk.inner.usage {
            self.input_token_count = usage.prompt_tokens;
            self.output_token_count = usage.completion_tokens;
            self.cached_token_count = usage
@@ -453,7 +453,7 @@ impl AnthropicStreamConverter {
                .and_then(|d| d.cached_tokens);
        }

-        for choice in &chunk.choices {
+        for choice in &chunk.inner.choices {
            let delta = &choice.delta;

            if let Some(ref fr) = choice.finish_reason {
@@ -722,27 +722,29 @@ mod tests {
    fn text_chunk(text: &str) -> NvCreateChatCompletionStreamResponse {
        #[allow(deprecated)]
        NvCreateChatCompletionStreamResponse {
-            id: "chat-1".into(),
-            choices: vec![ChatChoiceStream {
-                index: 0,
-                delta: ChatCompletionStreamResponseDelta {
-                    content: Some(ChatCompletionMessageContent::Text(text.into())),
-                    function_call: None,
-                    tool_calls: None,
-                    role: None,
-                    refusal: None,
-                    reasoning_content: None,
-                },
-                finish_reason: None,
-                stop_reason: None,
-                logprobs: None,
-            }],
-            created: 0,
-            model: "test".into(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion.chunk".into(),
-            usage: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "chat-1".into(),
+                choices: vec![ChatChoiceStream {
+                    index: 0,
+                    delta: ChatCompletionStreamResponseDelta {
+                        content: Some(ChatCompletionMessageContent::Text(text.into())),
+                        function_call: None,
+                        tool_calls: None,
+                        role: None,
+                        refusal: None,
+                        reasoning_content: None,
+                    },
+                    finish_reason: None,
+                    stop_reason: None,
+                    logprobs: None,
+                }],
+                created: 0,
+                model: "test".into(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion.chunk".into(),
+                usage: None,
+            },
            nvext: None,
        }
    }
@@ -755,35 +757,37 @@ mod tests {
    ) -> NvCreateChatCompletionStreamResponse {
        #[allow(deprecated)]
        NvCreateChatCompletionStreamResponse {
-            id: "chat-1".into(),
-            choices: vec![ChatChoiceStream {
-                index: 0,
-                delta: ChatCompletionStreamResponseDelta {
-                    content: None,
-                    function_call: None,
-                    tool_calls: Some(vec![ChatCompletionMessageToolCallChunk {
-                        index: tc_index,
-                        id: id.map(String::from),
-                        r#type: Some(ChatCompletionToolType::Function),
-                        function: Some(FunctionCallStream {
-                            name: name.map(String::from),
-                            arguments: args.map(String::from),
-                        }),
-                    }]),
-                    role: None,
-                    refusal: None,
-                    reasoning_content: None,
-                },
-                finish_reason: None,
-                stop_reason: None,
-                logprobs: None,
-            }],
-            created: 0,
-            model: "test".into(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion.chunk".into(),
-            usage: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "chat-1".into(),
+                choices: vec![ChatChoiceStream {
+                    index: 0,
+                    delta: ChatCompletionStreamResponseDelta {
+                        content: None,
+                        function_call: None,
+                        tool_calls: Some(vec![ChatCompletionMessageToolCallChunk {
+                            index: tc_index,
+                            id: id.map(String::from),
+                            r#type: Some(ChatCompletionToolType::Function),
+                            function: Some(FunctionCallStream {
+                                name: name.map(String::from),
+                                arguments: args.map(String::from),
+                            }),
+                        }]),
+                        role: None,
+                        refusal: None,
+                        reasoning_content: None,
+                    },
+                    finish_reason: None,
+                    stop_reason: None,
+                    logprobs: None,
+                }],
+                created: 0,
+                model: "test".into(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion.chunk".into(),
+                usage: None,
+            },
            nvext: None,
        }
    }
@@ -908,27 +912,29 @@ mod tests {
    fn reasoning_chunk(text: &str) -> NvCreateChatCompletionStreamResponse {
        #[allow(deprecated)]
        NvCreateChatCompletionStreamResponse {
-            id: "chat-1".into(),
-            choices: vec![ChatChoiceStream {
-                index: 0,
-                delta: ChatCompletionStreamResponseDelta {
-                    content: None,
-                    function_call: None,
-                    tool_calls: None,
-                    role: None,
-                    refusal: None,
-                    reasoning_content: Some(text.into()),
-                },
-                finish_reason: None,
-                stop_reason: None,
-                logprobs: None,
-            }],
-            created: 0,
-            model: "test".into(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion.chunk".into(),
-            usage: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "chat-1".into(),
+                choices: vec![ChatChoiceStream {
+                    index: 0,
+                    delta: ChatCompletionStreamResponseDelta {
+                        content: None,
+                        function_call: None,
+                        tool_calls: None,
+                        role: None,
+                        refusal: None,
+                        reasoning_content: Some(text.into()),
+                    },
+                    finish_reason: None,
+                    stop_reason: None,
+                    logprobs: None,
+                }],
+                created: 0,
+                model: "test".into(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion.chunk".into(),
+                usage: None,
+            },
            nvext: None,
        }
    }

--- a/lib/llm/src/protocols/anthropic/types.rs
+++ b/lib/llm/src/protocols/anthropic/types.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

-//! Anthropic Messages API types and conversion logic.
+//! Anthropic Messages API conversion logic.
 //!
-//! All request/response types for the `/v1/messages` endpoint, plus
-//! bidirectional conversion to/from the internal chat completions format.
+//! Pure protocol types live in `dynamo_async_openai::types::anthropic`.
+//! This module provides bidirectional conversion to/from the internal
+//! chat completions format used by the Dynamo engine.
+
+// Re-export all pure Anthropic protocol types so existing `use crate::protocols::anthropic::*`
+// continues to work throughout dynamo-llm.
+pub use dynamo_async_openai::types::anthropic::*;

 use dynamo_async_openai::types::{
    ChatCompletionMessageToolCall, ChatCompletionNamedToolChoice,
@@ -17,764 +22,13 @@ use dynamo_async_openai::types::{
    ChatCompletionTool, ChatCompletionToolChoiceOption, ChatCompletionToolType, FunctionName,
    FunctionObject, ImageUrl, ReasoningContent,
 };
-use serde::{Deserialize, Serialize};
 use uuid::Uuid;

 use crate::protocols::openai::chat_completions::{
    NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
 };
 use crate::protocols::openai::common_ext::CommonExt;
-use crate::protocols::openai::nvext::{CacheControl, NvExt};
-
-// ---------------------------------------------------------------------------
-// Custom deserializers
-// ---------------------------------------------------------------------------
-
-/// Parsed system prompt content, preserving cache_control from block arrays.
-///
-/// Holds the prompt as one `text` string plus an optional `cache_control`.
-/// When serialized, `cache_control` is omitted entirely if it is `None`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SystemContent {
-    /// The concatenated text from all system blocks (or the plain string).
-    pub text: String,
-    /// Cache control from the last system block that had one.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub cache_control: Option<CacheControl>,
-}
-
-/// Deserialize the `system` field, which the Anthropic API allows in two shapes:
-/// a plain string (`"system": "text"`) or an array of text blocks
-/// (`"system": [{"type": "text", "text": "...", "cache_control": {...}}]`).
-///
-/// Returns `Ok(None)` for a JSON `null`. For the array form, block texts are
-/// joined with `"\n"` and the `cache_control` of the last block that carries
-/// one is kept.
-fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<SystemContent>, D::Error>
-where
-    D: serde::Deserializer<'de>,
-{
-    // One element of the array form; only `text` and `cache_control` are read.
-    #[derive(Deserialize)]
-    struct SystemBlock {
-        text: String,
-        #[serde(default)]
-        cache_control: Option<CacheControl>,
-    }
-
-    // The two accepted wire shapes.
-    #[derive(Deserialize)]
-    #[serde(untagged)]
-    enum SystemPrompt {
-        Text(String),
-        Blocks(Vec<SystemBlock>),
-    }
-
-    let content = match Option::<SystemPrompt>::deserialize(deserializer)? {
-        None => return Ok(None),
-        Some(SystemPrompt::Text(text)) => SystemContent {
-            text,
-            cache_control: None,
-        },
-        Some(SystemPrompt::Blocks(blocks)) => {
-            let mut cache_control = None;
-            let mut pieces = Vec::with_capacity(blocks.len());
-            for block in blocks {
-                // Later hints overwrite earlier ones, so the last block that
-                // has a cache_control wins.
-                if block.cache_control.is_some() {
-                    cache_control = block.cache_control;
-                }
-                pieces.push(block.text);
-            }
-            SystemContent {
-                text: pieces.join("\n"),
-                cache_control,
-            }
-        }
-    };
-    Ok(Some(content))
-}
-
-// ---------------------------------------------------------------------------
-// Request types
-// ---------------------------------------------------------------------------
-
-/// Top-level request body for `POST /v1/messages`.
-///
-/// `model`, `max_tokens` and `messages` carry no serde default and are
-/// therefore required when deserializing; every other field may be omitted.
-/// Optional fields that are `None` are left out of serialized output.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicCreateMessageRequest {
-    /// The model to use (e.g. "claude-sonnet-4-20250514").
-    pub model: String,
-
-    /// The maximum number of tokens to generate.
-    pub max_tokens: u32,
-
-    /// The conversation messages.
-    pub messages: Vec<AnthropicMessage>,
-
-    /// Optional system prompt (string or array of `{"type":"text","text":"..."}` blocks).
-    /// Both shapes are normalized into a `SystemContent` by `deserialize_system_prompt`.
-    #[serde(
-        default,
-        skip_serializing_if = "Option::is_none",
-        deserialize_with = "deserialize_system_prompt"
-    )]
-    pub system: Option<SystemContent>,
-
-    /// Sampling temperature (0.0 - 1.0).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub temperature: Option<f32>,
-
-    /// Nucleus sampling parameter.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub top_p: Option<f32>,
-
-    /// Top-K sampling parameter.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub top_k: Option<u32>,
-
-    /// Custom stop sequences.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub stop_sequences: Option<Vec<String>>,
-
-    /// Whether to stream the response. Defaults to `false` when omitted.
-    #[serde(default)]
-    pub stream: bool,
-
-    /// Optional metadata (e.g. user_id). Kept as raw JSON; not interpreted by this type.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub metadata: Option<serde_json::Value>,
-
-    /// Tools the model may call.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub tools: Option<Vec<AnthropicTool>>,
-
-    /// How the model should choose which tool to call.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub tool_choice: Option<AnthropicToolChoice>,
-
-    /// Top-level cache control for automatic prompt prefix caching.
-    /// When present, the system caches all content up to the last cacheable block.
-    /// Matches the Anthropic Messages API automatic caching mode.
-    /// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub cache_control: Option<CacheControl>,
-
-    /// Extended thinking configuration. When enabled, the model produces
-    /// `thinking` content blocks containing its internal reasoning before
-    /// the final response. The `budget_tokens` field controls how many tokens
-    /// the model may use for thinking (must be ≥ 1024 and < max_tokens).
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub thinking: Option<ThinkingConfig>,
-
-    /// Service tier selection: `"auto"` or `"standard_only"`.
-    /// Stored as a free-form string; the type itself does not restrict the value.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub service_tier: Option<String>,
-
-    /// Container identifier for stateful sandbox sessions.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub container: Option<String>,
-
-    /// Output configuration: effort level and optional JSON schema format.
-    /// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
-    /// `format` specifies structured JSON output constraints.
-    /// Kept as raw JSON; its shape is not checked by this type.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub output_config: Option<serde_json::Value>,
-}
-
-/// Extended thinking configuration for the request.
-///
-/// When `type` is `"enabled"`, the model will produce `thinking` content blocks
-/// with its internal reasoning. `budget_tokens` controls the maximum tokens
-/// available for thinking (minimum 1024, must be less than `max_tokens`).
-/// When `type` is `"disabled"`, no thinking blocks are produced.
-///
-/// Neither the `type` value nor the `budget_tokens` bounds are enforced by
-/// this struct: `thinking_type` is a plain `String` and `budget_tokens` a
-/// plain `u32`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ThinkingConfig {
-    /// Either `"enabled"` or `"disabled"`. Appears as `type` on the wire.
-    #[serde(rename = "type")]
-    pub thinking_type: String,
-    /// Maximum tokens for internal reasoning. Only relevant when type is "enabled".
-    /// Omitted from serialized output when `None`.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub budget_tokens: Option<u32>,
-}
-
-/// A single message in the conversation.
-///
-/// On the wire this is `{"role": ..., "content": ...}`: the `content` key is
-/// not a field of this struct but comes from the flattened
-/// `AnthropicMessageContent` variant, each of which declares a `content` field.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicMessage {
-    /// Who sent the message.
-    pub role: AnthropicRole,
-    /// Message body; flattened so its `content` key sits next to `role`.
-    #[serde(flatten)]
-    pub content: AnthropicMessageContent,
-}
-
-/// The role of a message sender.
-///
-/// Variant names are lowercased on the wire (`"user"`, `"assistant"`).
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "lowercase")]
-pub enum AnthropicRole {
-    /// Serialized as `"user"`.
-    User,
-    /// Serialized as `"assistant"`.
-    Assistant,
-}
-
-/// Message content — either a plain string or an array of content blocks.
-///
-/// Untagged: there is no discriminator field, so serde picks the first
-/// variant whose `content` value has a matching shape (string, then array).
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(untagged)]
-pub enum AnthropicMessageContent {
-    /// Plain text content.
-    Text { content: String },
-    /// Array of structured content blocks.
-    Blocks { content: Vec<AnthropicContentBlock> },
-}
-
-/// A single content block within a message.
-///
-/// Uses a custom deserializer so that unknown block types (any `type` value
-/// not modeled by a variant below) are captured as `Other(Value)` instead
-/// of causing a hard deserialization failure. This is important because Claude
-/// Code may send block types that we don't yet handle.
-///
-/// Only `Serialize` is derived here; `Deserialize` is implemented by hand for
-/// this type, so the `default` attributes on fields below do not drive
-/// deserialization. On output, known variants are tagged with a `type` field
-/// and `Other` is written back as the raw JSON value it holds.
-#[derive(Debug, Clone, Serialize)]
-#[serde(tag = "type")]
-pub enum AnthropicContentBlock {
-    /// Text content block. May optionally include `citations` — references to
-    /// source documents that support the text content. Citations are generated
-    /// by the model when document/PDF content is provided and citation mode is enabled.
-    #[serde(rename = "text")]
-    Text {
-        text: String,
-        #[serde(default, skip_serializing_if = "Option::is_none")]
-        citations: Option<Vec<serde_json::Value>>,
-        #[serde(default, skip_serializing_if = "Option::is_none")]
-        cache_control: Option<CacheControl>,
-    },
-    /// Image content block.
-    #[serde(rename = "image")]
-    Image { source: AnthropicImageSource },
-    /// Tool use request from assistant.
-    #[serde(rename = "tool_use")]
-    ToolUse {
-        id: String,
-        name: String,
-        input: serde_json::Value,
-        #[serde(default, skip_serializing_if = "Option::is_none")]
-        cache_control: Option<CacheControl>,
-    },
-    /// Tool result from user.
-    #[serde(rename = "tool_result")]
-    ToolResult {
-        tool_use_id: String,
-        #[serde(default, skip_serializing_if = "Option::is_none")]
-        content: Option<ToolResultContent>,
-        #[serde(skip_serializing_if = "Option::is_none")]
-        is_error: Option<bool>,
-        #[serde(default, skip_serializing_if = "Option::is_none")]
-        cache_control: Option<CacheControl>,
-    },
-    /// Thinking content block from assistant (extended thinking / reasoning).
-    #[serde(rename = "thinking")]
-    Thinking {
-        thinking: String,
-        signature: String,
-        #[serde(default, skip_serializing_if = "Option::is_none")]
-        cache_control: Option<CacheControl>,
-    },
-    /// Redacted thinking block from assistant. Contains encrypted reasoning data
-    /// that is opaque to the client but must be passed back verbatim in multi-turn
-    /// conversations so the model can maintain its chain of thought.
-    #[serde(rename = "redacted_thinking")]
-    RedactedThinking { data: String },
-    /// Server-initiated tool use block. Represents a tool call that the API
-    /// executes server-side (e.g., web search). The client receives the result
-    /// via a corresponding `web_search_tool_result` or similar block.
-    #[serde(rename = "server_tool_use")]
-    ServerToolUse {
-        id: String,
-        name: String,
-        #[serde(default)]
-        input: serde_json::Value,
-    },
-    /// Result from a server-initiated tool (e.g., web search results).
-    /// Contains structured content returned by the server-side tool execution.
-    #[serde(rename = "web_search_tool_result")]
-    WebSearchToolResult {
-        tool_use_id: String,
-        #[serde(default)]
-        content: serde_json::Value,
-    },
-    /// Catch-all for unrecognized block types. Preserves the full JSON value
-    /// so that new Anthropic features don't break the endpoint and can be
-    /// round-tripped or inspected.
-    #[serde(untagged)]
-    Other(serde_json::Value),
-}
-
-/// Content of a `tool_result` block — either a plain string or an array of
-/// content blocks (the Anthropic API accepts both).
-///
-/// Untagged: a JSON string becomes `Text`, a JSON array becomes `Blocks`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(untagged)]
-pub enum ToolResultContent {
-    /// Plain string result.
-    Text(String),
-    /// Array of result blocks (text or other).
-    Blocks(Vec<ToolResultContentBlock>),
-}
-
-impl ToolResultContent {
-    /// Consume the content and return its text.
-    ///
-    /// A plain string is returned as-is. For the array form, the `text` of
-    /// every text block is concatenated in order with no separator; non-text
-    /// blocks are dropped.
-    pub fn into_text(self) -> String {
-        let blocks = match self {
-            ToolResultContent::Text(text) => return text,
-            ToolResultContent::Blocks(blocks) => blocks,
-        };
-
-        let mut combined = String::new();
-        for block in blocks {
-            if let ToolResultContentBlock::Text { text } = block {
-                combined.push_str(&text);
-            }
-        }
-        combined
-    }
-}
-
-/// A content block within a `tool_result.content` array.
-///
-/// Untagged: `Text` is tried first and requires a string `text` field;
-/// anything that does not fit falls through to `Other`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(untagged)]
-pub enum ToolResultContentBlock {
-    /// A block carrying a `text` string. No `type` field is checked here.
-    Text {
-        text: String,
-    },
-    /// Catch-all for non-text blocks (images, etc.) in tool results.
-    Other(serde_json::Value),
-}
-
-/// Hand-written `Deserialize` for `AnthropicContentBlock`.
-///
-/// The input is first read as a generic JSON value and then dispatched on its
-/// `type` string, so that an unknown (or missing) `type` can be preserved as
-/// `Other` rather than rejected. For known types, required string fields
-/// produce a `missing_field` error when absent or not a string, while optional
-/// typed fields that are absent or fail to parse are treated as `None`.
-impl<'de> Deserialize<'de> for AnthropicContentBlock {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        /// Fetch a required string field; absent or non-string is `missing_field`.
-        fn required_str<E>(block: &serde_json::Value, field: &'static str) -> Result<String, E>
-        where
-            E: serde::de::Error,
-        {
-            block
-                .get(field)
-                .and_then(serde_json::Value::as_str)
-                .map(str::to_owned)
-                .ok_or_else(|| E::missing_field(field))
-        }
-
-        /// Fetch an optional typed field; absent or unparsable yields `None`.
-        fn lenient<T>(block: &serde_json::Value, field: &str) -> Option<T>
-        where
-            T: serde::de::DeserializeOwned,
-        {
-            let raw = block.get(field)?.clone();
-            serde_json::from_value(raw).ok()
-        }
-
-        let block = serde_json::Value::deserialize(deserializer)?;
-        // A missing or non-string `type` is treated as "" and lands in the
-        // catch-all arm below.
-        let kind = block
-            .get("type")
-            .and_then(serde_json::Value::as_str)
-            .unwrap_or("")
-            .to_owned();
-
-        let parsed = match kind.as_str() {
-            "text" => AnthropicContentBlock::Text {
-                text: required_str::<D::Error>(&block, "text")?,
-                citations: lenient(&block, "citations"),
-                cache_control: lenient(&block, "cache_control"),
-            },
-            "image" => {
-                // A missing `source` is parsed as JSON null, which surfaces as
-                // a custom error from the nested deserializer.
-                let raw_source = block.get("source").cloned().unwrap_or_default();
-                let source: AnthropicImageSource = serde_json::from_value(raw_source)
-                    .map_err(<D::Error as serde::de::Error>::custom)?;
-                AnthropicContentBlock::Image { source }
-            }
-            "tool_use" => AnthropicContentBlock::ToolUse {
-                id: required_str::<D::Error>(&block, "id")?,
-                name: required_str::<D::Error>(&block, "name")?,
-                input: block
-                    .get("input")
-                    .cloned()
-                    .unwrap_or_else(|| serde_json::json!({})),
-                cache_control: lenient(&block, "cache_control"),
-            },
-            "tool_result" => AnthropicContentBlock::ToolResult {
-                tool_use_id: required_str::<D::Error>(&block, "tool_use_id")?,
-                content: lenient(&block, "content"),
-                is_error: block.get("is_error").and_then(serde_json::Value::as_bool),
-                cache_control: lenient(&block, "cache_control"),
-            },
-            "thinking" => AnthropicContentBlock::Thinking {
-                thinking: required_str::<D::Error>(&block, "thinking")?,
-                signature: required_str::<D::Error>(&block, "signature")?,
-                cache_control: lenient(&block, "cache_control"),
-            },
-            "redacted_thinking" => AnthropicContentBlock::RedactedThinking {
-                data: required_str::<D::Error>(&block, "data")?,
-            },
-            "server_tool_use" => AnthropicContentBlock::ServerToolUse {
-                id: required_str::<D::Error>(&block, "id")?,
-                name: required_str::<D::Error>(&block, "name")?,
-                input: block
-                    .get("input")
-                    .cloned()
-                    .unwrap_or_else(|| serde_json::json!({})),
-            },
-            "web_search_tool_result" => AnthropicContentBlock::WebSearchToolResult {
-                tool_use_id: required_str::<D::Error>(&block, "tool_use_id")?,
-                content: block
-                    .get("content")
-                    .cloned()
-                    .unwrap_or_else(|| serde_json::json!([])),
-            },
-            other => {
-                tracing::debug!(
-                    "Unrecognized Anthropic content block type '{}', preserving as Other",
-                    other
-                );
-                AnthropicContentBlock::Other(block)
-            }
-        };
-        Ok(parsed)
-    }
-}
-
-/// Image source for image content blocks.
-///
-/// NOTE(review): all three fields are required, so a source object lacking
-/// `media_type` or `data` (e.g. a URL-style source, if clients send one)
-/// fails to deserialize — confirm that this is intended.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicImageSource {
-    /// Source kind; appears as `type` on the wire. Presumably `"base64"` — confirm.
-    #[serde(rename = "type")]
-    pub source_type: String,
-    /// Media type string for the image. Presumably a MIME type such as `image/png` — confirm.
-    pub media_type: String,
-    /// The image payload as a string. Presumably base64-encoded — confirm.
-    pub data: String,
-}
-
-/// A tool definition.
-///
-/// Client tools (custom) require `name` + `input_schema`. Server tools
-/// (web_search, bash, text_editor, code_execution, etc.) are discriminated
-/// by their `type` field (e.g. `"web_search_20260209"`) and may not have
-/// `input_schema`. We keep all fields optional beyond `name` so both
-/// kinds deserialize successfully and pass through to the backend.
-///
-/// `name` has no serde default, so a tool object without it is rejected.
-/// Optional fields that are `None` are omitted from serialized output.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicTool {
-    /// Tool name (required for client tools, present on server tools too).
-    pub name: String,
-    /// Tool type discriminator. Client tools use `"custom"` (or omit).
-    /// Server tools use versioned types like `"web_search_20260209"`.
-    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
-    pub tool_type: Option<String>,
-    /// Human-readable description of the tool, if provided.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub description: Option<String>,
-    /// JSON Schema for the tool input. Required for client tools, absent on
-    /// server tools (which define their own input shape server-side).
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub input_schema: Option<serde_json::Value>,
-    /// Cache control breakpoint on this tool definition.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub cache_control: Option<CacheControl>,
-}
-
-/// Tool choice specification.
-///
-/// Untagged: variants are tried in declaration order. `Named` requires a
-/// `name` field, so an object without one falls through to `Simple`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(untagged)]
-pub enum AnthropicToolChoice {
-    /// Named tool: `{type: "tool", name: "..."}`
-    /// Must be listed before Simple so serde tries the stricter shape first.
-    Named(AnthropicToolChoiceNamed),
-    /// Simple mode: "auto", "any", or "none".
-    Simple(AnthropicToolChoiceSimple),
-}
-
-/// Simple tool choice modes.
-///
-/// An object with just a `type` and, optionally, `disable_parallel_tool_use`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicToolChoiceSimple {
-    /// The selected mode; appears as `type` on the wire.
-    #[serde(rename = "type")]
-    pub choice_type: AnthropicToolChoiceMode,
-    /// When true, the model will call tools one at a time instead of
-    /// potentially issuing multiple tool calls in a single response.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub disable_parallel_tool_use: Option<bool>,
-}
-
-/// Value of the `type` field in a tool choice object.
-///
-/// Variant names are lowercased on the wire: `"auto"`, `"any"`, `"none"`, `"tool"`.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "lowercase")]
-pub enum AnthropicToolChoiceMode {
-    /// Serialized as `"auto"`.
-    Auto,
-    /// Serialized as `"any"`.
-    Any,
-    /// Serialized as `"none"`.
-    None,
-    /// Serialized as `"tool"`; used together with a tool `name`.
-    Tool,
-}
-
-/// Named tool choice.
-///
-/// NOTE(review): `choice_type` is not restricted to `Tool` by this type, so
-/// any mode accompanied by a `name` deserializes into this struct — confirm
-/// whether that is validated elsewhere.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicToolChoiceNamed {
-    /// The selected mode; appears as `type` on the wire.
-    #[serde(rename = "type")]
-    pub choice_type: AnthropicToolChoiceMode,
-    /// Name of the tool being selected.
-    pub name: String,
-    /// When true, the model will call tools one at a time instead of
-    /// potentially issuing multiple tool calls in a single response.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub disable_parallel_tool_use: Option<bool>,
-}
-
-// ---------------------------------------------------------------------------
-// Response types
-// ---------------------------------------------------------------------------
-
-/// Response body for `POST /v1/messages` (non-streaming).
-///
-/// `stop_reason` and `stop_sequence` have no `skip_serializing_if`, so they
-/// are written as JSON `null` when `None` rather than omitted.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicMessageResponse {
-    /// Message identifier.
-    pub id: String,
-    /// Object kind; appears as `type` on the wire.
-    #[serde(rename = "type")]
-    pub object_type: String,
-    /// Role of the message author, as a free-form string.
-    pub role: String,
-    /// Content blocks making up the response.
-    pub content: Vec<AnthropicResponseContentBlock>,
-    /// Model name reported in the response.
-    pub model: String,
-    /// Why generation stopped, if known.
-    pub stop_reason: Option<AnthropicStopReason>,
-    /// The stop sequence associated with the stop, if any.
-    pub stop_sequence: Option<String>,
-    /// Token usage for this response.
-    pub usage: AnthropicUsage,
-}
-
-/// A content block in the response.
-///
-/// The Anthropic API returns up to 12 different block types. We model the
-/// common ones explicitly and catch the rest as `Other` so the proxy can
-/// forward them without losing data.
-///
-/// Known variants are tagged with a `type` field; `Other` is untagged and
-/// holds the raw JSON value.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(tag = "type")]
-pub enum AnthropicResponseContentBlock {
-    /// `thinking` block: reasoning text together with its signature.
-    #[serde(rename = "thinking")]
-    Thinking { thinking: String, signature: String },
-    /// `text` block, optionally with citations (omitted when `None`).
-    #[serde(rename = "text")]
-    Text {
-        text: String,
-        #[serde(default, skip_serializing_if = "Option::is_none")]
-        citations: Option<Vec<serde_json::Value>>,
-    },
-    /// `tool_use` block: a tool call with its id, tool name and JSON input.
-    #[serde(rename = "tool_use")]
-    ToolUse {
-        id: String,
-        name: String,
-        input: serde_json::Value,
-    },
-    /// `redacted_thinking` block carrying an opaque `data` string.
-    #[serde(rename = "redacted_thinking")]
-    RedactedThinking { data: String },
-    /// `server_tool_use` block; `input` defaults to JSON null when absent.
-    #[serde(rename = "server_tool_use")]
-    ServerToolUse {
-        id: String,
-        name: String,
-        #[serde(default)]
-        input: serde_json::Value,
-    },
-    /// `web_search_tool_result` block; `content` defaults to JSON null when absent.
-    #[serde(rename = "web_search_tool_result")]
-    WebSearchToolResult {
-        tool_use_id: String,
-        #[serde(default)]
-        content: serde_json::Value,
-    },
-    /// Catch-all for new/uncommon block types (web_fetch_tool_result,
-    /// code_execution_tool_result, container_upload, etc.) so the proxy
-    /// can serialize them back without data loss.
-    #[serde(untagged)]
-    Other(serde_json::Value),
-}
-
-/// Token usage information.
-///
-/// `Default` yields zero counts and no cache figures. The two cache fields
-/// are omitted from serialized output when `None`.
-#[derive(Debug, Clone, Serialize, Deserialize, Default)]
-pub struct AnthropicUsage {
-    /// Number of input tokens.
-    pub input_tokens: u32,
-    /// Number of output tokens.
-    pub output_tokens: u32,
-    /// Number of input tokens used to create a new cache entry.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub cache_creation_input_tokens: Option<u32>,
-    /// Number of input tokens read from the prompt cache (prefix cache hits).
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub cache_read_input_tokens: Option<u32>,
-}
-
-/// Reason the model stopped generating.
-///
-/// Variant names are snake_cased on the wire (e.g. `EndTurn` is `"end_turn"`).
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "snake_case")]
-pub enum AnthropicStopReason {
-    /// Serialized as `"end_turn"`.
-    EndTurn,
-    /// Serialized as `"max_tokens"`.
-    MaxTokens,
-    /// Serialized as `"stop_sequence"`.
-    StopSequence,
-    /// Serialized as `"tool_use"`.
-    ToolUse,
-    /// The model paused to yield control in an agentic loop, intending to
-    /// continue in a subsequent turn. Used with extended thinking / tool use.
-    PauseTurn,
-    /// The model refused to generate content (safety refusal).
-    Refusal,
-}
-
-// ---------------------------------------------------------------------------
-// Streaming types
-// ---------------------------------------------------------------------------
-
-/// SSE event types for the Anthropic streaming API.
-///
-/// Internally tagged by a `type` field; each variant's wire tag is the
-/// string given in its `rename` attribute.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(tag = "type")]
-pub enum AnthropicStreamEvent {
-    /// `message_start`: carries a full message object.
-    #[serde(rename = "message_start")]
-    MessageStart { message: AnthropicMessageResponse },
-
-    /// `content_block_start`: a content block at position `index`.
-    #[serde(rename = "content_block_start")]
-    ContentBlockStart {
-        index: u32,
-        content_block: AnthropicResponseContentBlock,
-    },
-
-    /// `content_block_delta`: an incremental update for the block at `index`.
-    #[serde(rename = "content_block_delta")]
-    ContentBlockDelta { index: u32, delta: AnthropicDelta },
-
-    /// `content_block_stop`: names the block at `index`; no other payload.
-    #[serde(rename = "content_block_stop")]
-    ContentBlockStop { index: u32 },
-
-    /// `message_delta`: message-level delta (stop info) plus usage.
-    #[serde(rename = "message_delta")]
-    MessageDelta {
-        delta: AnthropicMessageDeltaBody,
-        usage: AnthropicUsage,
-    },
-
-    /// `message_stop`: no payload beyond the tag.
-    #[serde(rename = "message_stop")]
-    MessageStop {},
-
-    /// `ping`: no payload beyond the tag.
-    #[serde(rename = "ping")]
-    Ping {},
-
-    /// `error`: carries an error body.
-    #[serde(rename = "error")]
-    Error { error: AnthropicErrorBody },
-}
-
-/// Delta content in a streaming content_block_delta event.
-///
-/// Internally tagged by a `type` field. There is no catch-all variant, so an
-/// unrecognized delta type fails to deserialize.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(tag = "type")]
-pub enum AnthropicDelta {
-    /// Incremental thinking text.
-    #[serde(rename = "thinking_delta")]
-    ThinkingDelta { thinking: String },
-    /// Incremental response text.
-    #[serde(rename = "text_delta")]
-    TextDelta { text: String },
-    /// A fragment of partial JSON; presumably part of a tool call's input — confirm.
-    #[serde(rename = "input_json_delta")]
-    InputJsonDelta { partial_json: String },
-    /// Incremental signature for a thinking block (sent at the end).
-    #[serde(rename = "signature_delta")]
-    SignatureDelta { signature: String },
-    /// Incremental citation attached to a text block.
-    #[serde(rename = "citations_delta")]
-    CitationsDelta { citation: serde_json::Value },
-}
-
-/// The delta body in a message_delta event.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicMessageDeltaBody {
-    /// Why generation stopped. Written as JSON `null` when `None`
-    /// (there is no `skip_serializing_if` on this field).
-    pub stop_reason: Option<AnthropicStopReason>,
-    /// The stop sequence associated with the stop, if any; omitted when `None`.
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub stop_sequence: Option<String>,
-}
-
-// ---------------------------------------------------------------------------
-// Error types
-// ---------------------------------------------------------------------------
-
-/// Anthropic API error response wrapper.
-///
-/// Serializes as `{"type": ..., "error": {...}}`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicErrorResponse {
-    /// Object kind; appears as `type` on the wire.
-    #[serde(rename = "type")]
-    pub object_type: String,
-    /// The error details.
-    pub error: AnthropicErrorBody,
-}
-
-/// Error body within an error response.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct AnthropicErrorBody {
-    /// Error category string; appears as `type` on the wire.
-    #[serde(rename = "type")]
-    pub error_type: String,
-    /// Human-readable error message.
-    pub message: String,
-}
-
-impl AnthropicErrorResponse {
-    /// Shared builder: an `"error"` object whose body carries the given
-    /// `error_type` and `message`.
-    fn with_error_type(error_type: &str, message: impl Into<String>) -> Self {
-        let error = AnthropicErrorBody {
-            error_type: error_type.to_string(),
-            message: message.into(),
-        };
-        Self {
-            object_type: "error".to_string(),
-            error,
-        }
-    }
-
-    /// Build a response whose error type is `invalid_request_error`.
-    pub fn invalid_request(message: impl Into<String>) -> Self {
-        Self::with_error_type("invalid_request_error", message)
-    }
-
-    /// Build a response whose error type is `api_error` (internal server error).
-    pub fn api_error(message: impl Into<String>) -> Self {
-        Self::with_error_type("api_error", message)
-    }
-
-    /// Build a response whose error type is `not_found_error`.
-    pub fn not_found(message: impl Into<String>) -> Self {
-        Self::with_error_type("not_found_error", message)
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Conversion: AnthropicCreateMessageRequest -> NvCreateChatCompletionRequest
-// ---------------------------------------------------------------------------
-
+use crate::protocols::openai::nvext::NvExt;
 impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
    type Error = anyhow::Error;

@@ -1199,11 +453,6 @@ fn convert_anthropic_tool_choice(tc: &AnthropicToolChoice) -> ChatCompletionTool
        }
    }
 }
-
-// ---------------------------------------------------------------------------
-// Conversion: NvCreateChatCompletionResponse -> AnthropicMessageResponse
-// ---------------------------------------------------------------------------
-
 /// Convert a completed chat completion response into an Anthropic Messages response.
 pub fn chat_completion_to_anthropic_response(
    chat_resp: NvCreateChatCompletionResponse,
@@ -1211,7 +460,7 @@ pub fn chat_completion_to_anthropic_response(
 ) -> AnthropicMessageResponse {
    let msg_id = format!("msg_{}", Uuid::new_v4().simple());

-    let choice = chat_resp.choices.into_iter().next();
+    let choice = chat_resp.inner.choices.into_iter().next();
    let mut content = Vec::new();
    let mut stop_reason = None;

@@ -1282,6 +531,7 @@ pub fn chat_completion_to_anthropic_response(

    // Map usage
    let usage = chat_resp
+        .inner
        .usage
        .map(|u| {
            let cache_read_input_tokens = u
@@ -1308,111 +558,6 @@ pub fn chat_completion_to_anthropic_response(
        usage,
    }
 }
-
-// ---------------------------------------------------------------------------
-// Count tokens
-// ---------------------------------------------------------------------------
-
-/// Request body for `POST /v1/messages/count_tokens`.
-#[derive(Debug, Clone, Deserialize)]
-pub struct AnthropicCountTokensRequest {
-    pub model: String,
-    pub messages: Vec<AnthropicMessage>,
-    #[serde(
-        default,
-        skip_serializing_if = "Option::is_none",
-        deserialize_with = "deserialize_system_prompt"
-    )]
-    pub system: Option<SystemContent>,
-    #[serde(default)]
-    pub tools: Option<Vec<AnthropicTool>>,
-}
-
-/// Response body for `POST /v1/messages/count_tokens`.
-#[derive(Debug, Clone, Serialize)]
-pub struct AnthropicCountTokensResponse {
-    pub input_tokens: u32,
-}
-
-impl AnthropicCountTokensRequest {
-    /// Estimate input token count using a `len/3` heuristic.
-    pub fn estimate_tokens(&self) -> u32 {
-        let mut total_len: usize = 0;
-
-        if let Some(system) = &self.system {
-            total_len += system.text.len();
-        }
-
-        for msg in &self.messages {
-            // Count role
-            total_len += match msg.role {
-                AnthropicRole::User => 4,
-                AnthropicRole::Assistant => 9,
-            };
-            // Count content
-            match &msg.content {
-                AnthropicMessageContent::Text { content } => total_len += content.len(),
-                AnthropicMessageContent::Blocks { content } => {
-                    for block in content {
-                        total_len += estimate_block_len(block);
-                    }
-                }
-            }
-        }
-
-        if let Some(tools) = &self.tools {
-            for tool in tools {
-                total_len += tool.name.len();
-                if let Some(desc) = &tool.description {
-                    total_len += desc.len();
-                }
-                if let Some(schema) = &tool.input_schema {
-                    total_len += schema.to_string().len();
-                }
-            }
-        }
-
-        let tokens = total_len / 3;
-        if tokens == 0 && total_len > 0 {
-            1
-        } else {
-            tokens as u32
-        }
-    }
-}
-
-fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
-    match block {
-        AnthropicContentBlock::Text { text, .. } => text.len(),
-        AnthropicContentBlock::ToolUse { name, input, .. } => name.len() + input.to_string().len(),
-        AnthropicContentBlock::ToolResult { content, .. } => content
-            .as_ref()
-            .map(|c| match c {
-                ToolResultContent::Text(s) => s.len(),
-                ToolResultContent::Blocks(blocks) => blocks
-                    .iter()
-                    .map(|b| match b {
-                        ToolResultContentBlock::Text { text } => text.len(),
-                        ToolResultContentBlock::Other(v) => v.to_string().len(),
-                    })
-                    .sum(),
-            })
-            .unwrap_or(0),
-        AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
-        AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),
-        AnthropicContentBlock::ServerToolUse { name, input, .. } => {
-            name.len() + input.to_string().len()
-        }
-        AnthropicContentBlock::WebSearchToolResult { content, .. } => content.to_string().len(),
-        AnthropicContentBlock::Image { .. } => 256, // rough estimate for image metadata
-        AnthropicContentBlock::Other(v) => v.to_string().len(),
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -1656,38 +801,40 @@ mod tests {
    #[test]
    fn test_chat_completion_to_anthropic_response() {
        let chat_resp = NvCreateChatCompletionResponse {
-            id: "chatcmpl-xyz".into(),
-            choices: vec![dynamo_async_openai::types::ChatChoice {
-                index: 0,
-                message: dynamo_async_openai::types::ChatCompletionResponseMessage {
-                    content: Some(
-                        dynamo_async_openai::types::ChatCompletionMessageContent::Text(
-                            "Hello!".to_string(),
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
+                id: "chatcmpl-xyz".into(),
+                choices: vec![dynamo_async_openai::types::ChatChoice {
+                    index: 0,
+                    message: dynamo_async_openai::types::ChatCompletionResponseMessage {
+                        content: Some(
+                            dynamo_async_openai::types::ChatCompletionMessageContent::Text(
+                                "Hello!".to_string(),
+                            ),
                        ),
-                    ),
-                    refusal: None,
-                    tool_calls: None,
-                    role: dynamo_async_openai::types::Role::Assistant,
-                    function_call: None,
-                    audio: None,
-                    reasoning_content: None,
-                },
-                finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
-                stop_reason: None,
-                logprobs: None,
-            }],
-            created: 1726000000,
-            model: "test-model".into(),
-            service_tier: None,
-            system_fingerprint: None,
-            object: "chat.completion".to_string(),
-            usage: Some(dynamo_async_openai::types::CompletionUsage {
-                prompt_tokens: 10,
-                completion_tokens: 5,
-                total_tokens: 15,
-                prompt_tokens_details: None,
-                completion_tokens_details: None,
-            }),
+                        refusal: None,
+                        tool_calls: None,
+                        role: dynamo_async_openai::types::Role::Assistant,
+                        function_call: None,
+                        audio: None,
+                        reasoning_content: None,
+                    },
+                    finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
+                    stop_reason: None,
+                    logprobs: None,
+                }],
+                created: 1726000000,
+                model: "test-model".into(),
+                service_tier: None,
+                system_fingerprint: None,
+                object: "chat.completion".to_string(),
+                usage: Some(dynamo_async_openai::types::CompletionUsage {
+                    prompt_tokens: 10,
+                    completion_tokens: 5,
+                    total_tokens: 15,
+                    prompt_tokens_details: None,
+                    completion_tokens_details: None,
+                }),
+            },
            nvext: None,
        };


--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -64,21 +64,24 @@ pub struct NvCreateChatCompletionRequest {
 }

 /// A response structure for unary chat completion responses, embedding OpenAI's
-/// `CreateChatCompletionResponse`.
-///
-/// # Fields
-/// - `inner`: The base OpenAI unary chat completion response, embedded
-///   using `serde(flatten)`.
-pub type NvCreateChatCompletionResponse = dynamo_async_openai::types::CreateChatCompletionResponse;
+/// `CreateChatCompletionResponse` with optional NVIDIA extension metadata.
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+pub struct NvCreateChatCompletionResponse {
+    #[serde(flatten)]
+    pub inner: dynamo_async_openai::types::CreateChatCompletionResponse,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub nvext: Option<serde_json::Value>,
+}

 /// A response structure for streamed chat completions, embedding OpenAI's
-/// `CreateChatCompletionStreamResponse`.
-///
-/// # Fields
-/// - `inner`: The base OpenAI streaming chat completion response, embedded
-///   using `serde(flatten)`.
-pub type NvCreateChatCompletionStreamResponse =
-    dynamo_async_openai::types::CreateChatCompletionStreamResponse;
+/// `CreateChatCompletionStreamResponse` with optional NVIDIA extension metadata.
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+pub struct NvCreateChatCompletionStreamResponse {
+    #[serde(flatten)]
+    pub inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub nvext: Option<serde_json::Value>,
+}

 /// Implements `NvExtProvider` for `NvCreateChatCompletionRequest`,
 /// providing access to NVIDIA-specific extensions.

--- a/lib/llm/src/protocols/openai/chat_completions/aggregator.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/aggregator.rs
@@ -136,16 +136,16 @@ impl DeltaAggregator {
                if aggregator.error.is_none()
                    && let Some(delta) = delta.data
                {
-                    aggregator.id = delta.id;
-                    aggregator.model = delta.model;
-                    aggregator.created = delta.created;
-                    aggregator.service_tier = delta.service_tier;
+                    aggregator.id = delta.inner.id;
+                    aggregator.model = delta.inner.model;
+                    aggregator.created = delta.inner.created;
+                    aggregator.service_tier = delta.inner.service_tier;

                    // Aggregate usage statistics if available.
-                    if let Some(usage) = delta.usage {
+                    if let Some(usage) = delta.inner.usage {
                        aggregator.usage = Some(usage);
                    }
-                    if let Some(system_fingerprint) = delta.system_fingerprint {
+                    if let Some(system_fingerprint) = delta.inner.system_fingerprint {
                        aggregator.system_fingerprint = Some(system_fingerprint);
                    }

@@ -155,7 +155,7 @@ impl DeltaAggregator {
                    }

                    // Aggregate choices incrementally.
-                    for choice in delta.choices {
+                    for choice in delta.inner.choices {
                        let state_choice =
                            aggregator
                                .choices
@@ -267,14 +267,16 @@ impl DeltaAggregator {

        // Construct the final response object.
        let response = NvCreateChatCompletionResponse {
-            id: aggregator.id,
-            created: aggregator.created,
-            usage: aggregator.usage,
-            model: aggregator.model,
-            object: "chat.completion".to_string(),
-            system_fingerprint: aggregator.system_fingerprint,
-            choices,
-            service_tier: aggregator.service_tier,
+            inner: dynamo_async_openai::types::CreateChatCompletionResponse {
+                id: aggregator.id,
+                created: aggregator.created,
+                usage: aggregator.usage,
+                model: aggregator.model,
+                object: "chat.completion".to_string(),
+                system_fingerprint: aggregator.system_fingerprint,
+                choices,
+                service_tier: aggregator.service_tier,
+            },
            nvext: aggregator.nvext,
        };

@@ -360,7 +362,7 @@ pub trait ChatCompletionAggregator {
    ) -> Result<NvCreateChatCompletionResponse, String>;
 }

-impl ChatCompletionAggregator for dynamo_async_openai::types::CreateChatCompletionResponse {
+impl ChatCompletionAggregator for NvCreateChatCompletionResponse {
    async fn from_annotated_stream(
        stream: impl Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>>,
        parsing_options: ParsingOptions,
@@ -445,14 +447,16 @@ mod tests {
        };

        let data = NvCreateChatCompletionStreamResponse {
-            id: "test_id".to_string(),
-            model: "meta/llama-3.1-8b-instruct".to_string(),
-            created: 1234567890,
-            service_tier: None,
-            usage: None,
-            system_fingerprint: None,
-            choices: vec![choice],
-            object: "chat.completion".to_string(),
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test_id".to_string(),
+                model: "meta/llama-3.1-8b-instruct".to_string(),
+                created: 1234567890,
+                service_tier: None,
+                usage: None,
+                system_fingerprint: None,
+                choices: vec![choice],
+                object: "chat.completion".to_string(),
+            },
            nvext: None,
        };

@@ -479,13 +483,13 @@ mod tests {
        let response = result.unwrap();

        // Verify that the response is empty and has default values
-        assert_eq!(response.id, "");
-        assert_eq!(response.model, "");
-        assert_eq!(response.created, 0);
-        assert!(response.usage.is_none());
-        assert!(response.system_fingerprint.is_none());
-        assert_eq!(response.choices.len(), 0);
-        assert!(response.service_tier.is_none());
+        assert_eq!(response.inner.id, "");
+        assert_eq!(response.inner.model, "");
+        assert_eq!(response.inner.created, 0);
+        assert!(response.inner.usage.is_none());
+        assert!(response.inner.system_fingerprint.is_none());
+        assert_eq!(response.inner.choices.len(), 0);
+        assert!(response.inner.service_tier.is_none());
    }

    #[tokio::test]
@@ -511,13 +515,13 @@ mod tests {
        let response = result.unwrap();

        // Verify the response fields
-        assert_eq!(response.id, "test_id");
-        assert_eq!(response.model, "meta/llama-3.1-8b-instruct");
-        assert_eq!(response.created, 1234567890);
-        assert!(response.usage.is_none());
-        assert!(response.system_fingerprint.is_none());
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.id, "test_id");
+        assert_eq!(response.inner.model, "meta/llama-3.1-8b-instruct");
+        assert_eq!(response.inner.created, 1234567890);
+        assert!(response.inner.usage.is_none());
+        assert!(response.inner.system_fingerprint.is_none());
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];
        assert_eq!(choice.index, 0);
        assert_eq!(
            choice.message.content.as_ref().unwrap(),
@@ -525,7 +529,7 @@ mod tests {
        );
        assert!(choice.finish_reason.is_none());
        assert_eq!(choice.message.role, dynamo_async_openai::types::Role::User);
-        assert!(response.service_tier.is_none());
+        assert!(response.inner.service_tier.is_none());
    }

    #[tokio::test]
@@ -562,8 +566,8 @@ mod tests {
        let response = result.unwrap();

        // Verify the response fields
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];
        assert_eq!(choice.index, 0);
        assert_eq!(
            choice.message.content.as_ref().unwrap(),
@@ -630,8 +634,8 @@ mod tests {

        assert!(result.is_ok());
        let response = result.unwrap();
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        assert_eq!(choice.index, 0);
        assert_eq!(
@@ -653,43 +657,49 @@ mod tests {
        // Create a delta with multiple choices
        // ALLOW: function_call is deprecated
        let data = NvCreateChatCompletionStreamResponse {
-            id: "test_id".to_string(),
-            model: "test_model".to_string(),
-            created: 1234567890,
-            service_tier: None,
-            usage: None,
-            system_fingerprint: None,
-            choices: vec![
-                dynamo_async_openai::types::ChatChoiceStream {
-                    index: 0,
-                    delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
-                        role: Some(dynamo_async_openai::types::Role::Assistant),
-                        content: Some(ChatCompletionMessageContent::Text("Choice 0".to_string())),
-                        function_call: None,
-                        tool_calls: None,
-                        refusal: None,
-                        reasoning_content: None,
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: "test_id".to_string(),
+                model: "test_model".to_string(),
+                created: 1234567890,
+                service_tier: None,
+                usage: None,
+                system_fingerprint: None,
+                choices: vec![
+                    dynamo_async_openai::types::ChatChoiceStream {
+                        index: 0,
+                        delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
+                            role: Some(dynamo_async_openai::types::Role::Assistant),
+                            content: Some(ChatCompletionMessageContent::Text(
+                                "Choice 0".to_string(),
+                            )),
+                            function_call: None,
+                            tool_calls: None,
+                            refusal: None,
+                            reasoning_content: None,
+                        },
+                        finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
+                        stop_reason: None,
+                        logprobs: None,
                    },
-                    finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
-                    stop_reason: None,
-                    logprobs: None,
-                },
-                dynamo_async_openai::types::ChatChoiceStream {
-                    index: 1,
-                    delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
-                        role: Some(dynamo_async_openai::types::Role::Assistant),
-                        content: Some(ChatCompletionMessageContent::Text("Choice 1".to_string())),
-                        function_call: None,
-                        tool_calls: None,
-                        refusal: None,
-                        reasoning_content: None,
+                    dynamo_async_openai::types::ChatChoiceStream {
+                        index: 1,
+                        delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
+                            role: Some(dynamo_async_openai::types::Role::Assistant),
+                            content: Some(ChatCompletionMessageContent::Text(
+                                "Choice 1".to_string(),
+                            )),
+                            function_call: None,
+                            tool_calls: None,
+                            refusal: None,
+                            reasoning_content: None,
+                        },
+                        finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
+                        stop_reason: None,
+                        logprobs: None,
                    },
-                    finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
-                    stop_reason: None,
-                    logprobs: None,
-                },
-            ],
-            object: "chat.completion".to_string(),
+                ],
+                object: "chat.completion".to_string(),
+            },
            nvext: None,
        };

@@ -711,9 +721,9 @@ mod tests {
        let mut response = result.unwrap();

        // Verify the response fields
-        assert_eq!(response.choices.len(), 2);
-        response.choices.sort_by(|a, b| a.index.cmp(&b.index)); // Ensure the choices are ordered
-        let choice0 = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 2);
+        response.inner.choices.sort_by(|a, b| a.index.cmp(&b.index)); // Ensure the choices are ordered
+        let choice0 = &response.inner.choices[0];
        assert_eq!(choice0.index, 0);
        assert_eq!(
            choice0.message.content.as_ref().unwrap(),
@@ -728,7 +738,7 @@ mod tests {
            dynamo_async_openai::types::Role::Assistant
        );

-        let choice1 = &response.choices[1];
+        let choice1 = &response.inner.choices[1];
        assert_eq!(choice1.index, 1);
        assert_eq!(
            choice1.message.content.as_ref().unwrap(),
@@ -773,8 +783,8 @@ mod tests {

        assert!(result.is_ok());
        let response = result.unwrap();
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        // Verify tool calls are present
        assert!(choice.message.tool_calls.is_some());
@@ -816,8 +826,8 @@ mod tests {

        assert!(result.is_ok());
        let response = result.unwrap();
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        // Verify tool calls are present
        assert!(choice.message.tool_calls.is_some());
@@ -859,8 +869,8 @@ mod tests {

        assert!(result.is_ok());
        let response = result.unwrap();
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        // Verify tool calls are present
        assert!(choice.message.tool_calls.is_some());
@@ -900,8 +910,8 @@ mod tests {

        assert!(result.is_ok());
        let response = result.unwrap();
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        // Verify no tool calls are present
        assert!(choice.message.tool_calls.is_none());
@@ -928,7 +938,7 @@ mod tests {

        // Manually set empty tool calls array
        if let Some(ref mut data) = annotated_delta.data {
-            data.choices[0].delta.tool_calls = Some(vec![]); // Empty tool calls array
+            data.inner.choices[0].delta.tool_calls = Some(vec![]); // Empty tool calls array
        }

        let data = annotated_delta.data.unwrap();
@@ -945,8 +955,8 @@ mod tests {

        assert!(result.is_ok());
        let response = result.unwrap();
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        // Verify tool calls array is empty
        assert!(choice.message.tool_calls.is_none());
@@ -992,8 +1002,8 @@ mod tests {
        let response = result.unwrap();

        // There should be one choice
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        // The tool_calls field should be present and parsed
        assert!(choice.message.tool_calls.is_some());
@@ -1050,8 +1060,8 @@ mod tests {
        let response = result.unwrap();

        // There should be one choice
-        assert_eq!(response.choices.len(), 1);
-        let choice = &response.choices[0];
+        assert_eq!(response.inner.choices.len(), 1);
+        let choice = &response.inner.choices[0];

        // The finish_reason should be ToolCalls, not Stop, because tool calls are present
        assert_eq!(

--- a/lib/llm/src/protocols/openai/chat_completions/delta.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/delta.rs
@@ -278,19 +278,21 @@ impl DeltaGenerator {
        // According to OpenAI spec: when stream_options.include_usage is true,
        // all intermediate chunks should have usage: null
        // The final usage chunk will be sent separately with empty choices
-        dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-            id: self.id.clone(),
-            object: self.object.clone(),
-            created: self.created,
-            model: self.model.clone(),
-            system_fingerprint: self.system_fingerprint.clone(),
-            choices,
-            usage: if self.options.enable_usage && self.options.continuous_usage_stats {
-                Some(self.get_usage())
-            } else {
-                None
+        NvCreateChatCompletionStreamResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: self.id.clone(),
+                object: self.object.clone(),
+                created: self.created,
+                model: self.model.clone(),
+                system_fingerprint: self.system_fingerprint.clone(),
+                choices,
+                usage: if self.options.enable_usage && self.options.continuous_usage_stats {
+                    Some(self.get_usage())
+                } else {
+                    None
+                },
+                service_tier: self.service_tier.clone(),
            },
-            service_tier: self.service_tier.clone(),
            nvext: None, // Will be populated by router layer if needed
        }
    }
@@ -303,15 +305,17 @@ impl DeltaGenerator {
    pub fn create_usage_chunk(&self) -> NvCreateChatCompletionStreamResponse {
        let usage = self.get_usage();

-        dynamo_async_openai::types::CreateChatCompletionStreamResponse {
-            id: self.id.clone(),
-            object: self.object.clone(),
-            created: self.created,
-            model: self.model.clone(),
-            system_fingerprint: self.system_fingerprint.clone(),
-            choices: vec![], // Empty choices for usage-only chunk
-            usage: Some(usage),
-            service_tier: self.service_tier.clone(),
+        NvCreateChatCompletionStreamResponse {
+            inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                id: self.id.clone(),
+                object: self.object.clone(),
+                created: self.created,
+                model: self.model.clone(),
+                system_fingerprint: self.system_fingerprint.clone(),
+                choices: vec![], // Empty choices for usage-only chunk
+                usage: Some(usage),
+                service_tier: self.service_tier.clone(),
+            },
            nvext: None,
        }
    }

--- a/lib/llm/src/protocols/openai/chat_completions/jail.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/jail.rs
@@ -525,13 +525,13 @@ impl JailedStream {
            // Process each item in the stream
            while let Some(response) = stream.next().await {
                if let Some(chat_response) = response.data.as_ref() {
-                    last_stream_id.clone_from(&chat_response.id);
-                    last_stream_model.clone_from(&chat_response.model);
-                    last_stream_created = chat_response.created;
+                    last_stream_id.clone_from(&chat_response.inner.id);
+                    last_stream_model.clone_from(&chat_response.inner.model);
+                    last_stream_created = chat_response.inner.created;

                    let mut all_emissions = Vec::new();

-                    if chat_response.choices.is_empty() {
+                    if chat_response.inner.choices.is_empty() {
                        // No choices processed (e.g., usage-only chunk)
                        // Pass through as-is to preserve usage and other metadata
                        yield response;
@@ -539,7 +539,7 @@ impl JailedStream {
                    }

                    // Process each choice independently using the new architecture
-                    for choice in &chat_response.choices {
+                    for choice in &chat_response.inner.choices {
                        if let Some(ref content) = choice.delta.content {
                            // Jailing only applies to text content
                            let text_content = match content {
@@ -676,14 +676,16 @@ impl JailedStream {
                tracing::debug!("Stream ended while jailed, releasing accumulated content");
                // Create a finalization response carrying forward real stream metadata
                let dummy_response = NvCreateChatCompletionStreamResponse {
-                    id: last_stream_id,
-                    object: "chat.completion.chunk".to_string(),
-                    created: last_stream_created,
-                    model: last_stream_model,
-                    choices: Vec::new(),
-                    usage: None,
-                    service_tier: None,
-                    system_fingerprint: None,
+                    inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
+                        id: last_stream_id,
+                        object: "chat.completion.chunk".to_string(),
+                        created: last_stream_created,
+                        model: last_stream_model,
+                        choices: Vec::new(),
+                        usage: None,
+                        service_tier: None,
+                        system_fingerprint: None,
+                    },
                    nvext: None,
                };

@@ -713,7 +715,7 @@ impl JailedStream {
            EmissionMode::Packed => {
                // Pack all choices into a single response
                let mut response = base_response.clone();
-                response.choices = emissions.into_iter().map(|e| e.into_choice()).collect();
+                response.inner.choices = emissions.into_iter().map(|e| e.into_choice()).collect();

                vec![Annotated {
                    data: Some(response),
@@ -729,7 +731,7 @@ impl JailedStream {
                    .into_iter()
                    .map(|emission| {
                        let mut response = base_response.clone();
-                        response.choices = vec![emission.into_choice()];
+                        response.inner.choices = vec![emission.into_choice()];

                        Annotated {
                            data: Some(response),
@@ -1013,7 +1015,7 @@ impl JailedStream {
            while let Some(mut response) = input_stream.next().await {
                // Track if any choice emitted tool calls
                if let Some(ref data) = response.data {
-                    for choice in &data.choices {
+                    for choice in &data.inner.choices {
                        if choice.delta.tool_calls.is_some() {
                            has_tool_calls_per_choice.insert(choice.index, true);
                        }
@@ -1022,7 +1024,7 @@ impl JailedStream {

                // Fix finish_reason based on jail mode and whether tool calls were emitted
                if let Some(ref mut data) = response.data {
-                    for choice in &mut data.choices {
+                    for choice in &mut data.inner.choices {
                        if let Some(finish) = choice.finish_reason {
                            // Only modify Stop finish reason, preserve Length/ContentFilter
                            if finish == FinishReason::Stop {

--- a/lib/llm/src/protocols/openai/completions.rs
+++ b/lib/llm/src/protocols/openai/completions.rs
@@ -48,6 +48,8 @@ pub struct NvCreateCompletionRequest {
 pub struct NvCreateCompletionResponse {
    #[serde(flatten)]
    pub inner: dynamo_async_openai::types::CreateCompletionResponse,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub nvext: Option<serde_json::Value>,
 }

 impl ContentProvider for dynamo_async_openai::types::Choice {
@@ -296,9 +298,8 @@ impl ResponseFactory {
            choices: vec![choice],
            system_fingerprint: self.system_fingerprint.clone(),
            usage,
-            nvext: None, // Will be populated by router layer if needed
        };
-        NvCreateCompletionResponse { inner }
+        NvCreateCompletionResponse { inner, nvext: None }
    }
 }


--- a/lib/llm/src/protocols/openai/completions/aggregator.rs
+++ b/lib/llm/src/protocols/openai/completions/aggregator.rs
@@ -86,8 +86,8 @@ impl DeltaAggregator {
                        aggregator.system_fingerprint = Some(system_fingerprint);
                    }
                    // Aggregate nvext field (take the last non-None value)
-                    if delta.inner.nvext.is_some() {
-                        aggregator.nvext = delta.inner.nvext;
+                    if delta.nvext.is_some() {
+                        aggregator.nvext = delta.nvext;
                    }

                    // handle the choices
@@ -168,10 +168,12 @@ impl DeltaAggregator {
            object: "text_completion".to_string(),
            system_fingerprint: aggregator.system_fingerprint,
            choices,
-            nvext: aggregator.nvext,
        };

-        let response = NvCreateCompletionResponse { inner };
+        let response = NvCreateCompletionResponse {
+            inner,
+            nvext: aggregator.nvext,
+        };

        Ok(response)
    }
@@ -256,10 +258,9 @@ mod tests {
                logprobs,
            }],
            object: "text_completion".to_string(),
-            nvext: None,
        };

-        let response = NvCreateCompletionResponse { inner };
+        let response = NvCreateCompletionResponse { inner, nvext: None };

        Annotated {
            data: Some(response),
@@ -387,10 +388,9 @@ mod tests {
                },
            ],
            object: "text_completion".to_string(),
-            nvext: None,
        };

-        let response = NvCreateCompletionResponse { inner };
+        let response = NvCreateCompletionResponse { inner, nvext: None };

        let annotated_delta = Annotated {
            data: Some(response),

--- a/lib/llm/src/protocols/openai/completions/delta.rs
+++ b/lib/llm/src/protocols/openai/completions/delta.rs
@@ -218,10 +218,9 @@ impl DeltaGenerator {
            } else {
                None
            },
-            nvext: None, // Will be populated by router layer if needed
        };

-        NvCreateCompletionResponse { inner }
+        NvCreateCompletionResponse { inner, nvext: None }
    }

    /// Creates a final usage-only chunk for OpenAI compliance.
@@ -240,10 +239,9 @@ impl DeltaGenerator {
            system_fingerprint: self.system_fingerprint.clone(),
            choices: vec![], // Empty choices for usage-only chunk
            usage: Some(usage),
-            nvext: None, // Will be populated by router layer if needed
        };

-        NvCreateCompletionResponse { inner }
+        NvCreateCompletionResponse { inner, nvext: None }
    }

    /// Check if usage tracking is enabled
@@ -343,7 +341,7 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
            };

            if let Ok(nvext_json) = serde_json::to_value(&nvext_response) {
-                response.inner.nvext = Some(nvext_json);
+                response.nvext = Some(nvext_json);
                if let Some(ref info) = worker_id_info {
                    tracing::debug!(
                        "Injected worker_id into completions nvext: prefill={:?}, decode={:?}",