feat: Full Anthropic Messages API cache_control support (top-level, per-block,...

feat: Full Anthropic Messages API cache_control support (top-level, per-block, system block arrays) (#6629) Signed-off-by: Matej Kosec <mkosec@nvidia.com>

feat: Full Anthropic Messages API cache_control support (top-level, per-block,...
feat: Full Anthropic Messages API cache_control support (top-level, per-block, system block arrays) (#6629) Signed-off-by: Matej Kosec <mkosec@nvidia.com>
4d3e1ae3 · MatejKosec · GitHub · a3cf35c3 · 4d3e1ae3 · 4d3e1ae3
Unverified Commit 4d3e1ae3 authored Mar 02, 2026 by MatejKosec Committed by GitHub Mar 02, 2026
5 changed files
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -272,6 +272,9 @@ impl OpenAIPreprocessor {
        builder.mdc_sum(Some(self.mdcsum.clone()));
        let lora_name = self.lora_name.clone();
+        // Extract cache_control TTL from either nvext or top-level field
+        let cache_control_ttl = request.effective_cache_control().map(|cc| cc.ttl_seconds());
        // Extract routing hints from nvext if present
        if let Some(nvext) = request.nvext() {
            // Build routing hints from nvext fields
@@ -289,10 +292,12 @@ impl OpenAIPreprocessor {
                allowed_worker_ids: None,
            };
            builder.routing(Some(routing));
-        } else if lora_name.is_some() {
+        } else if lora_name.is_some() || cache_control_ttl.is_some() {
-            // Ensure LoRA-aware routing still gets hints even when nvext is absent.
+            // Ensure routing hints exist when we have LoRA or cache_control,
+            // even when nvext is absent (e.g. Anthropic endpoint requests).
            builder.routing(Some(RoutingHints {
                lora_name,
+                cache_control_ttl,
                ..Default::default()
            }));
        }

--- a/lib/llm/src/protocols/anthropic/stream_converter.rs
+++ b/lib/llm/src/protocols/anthropic/stream_converter.rs
@@ -30,6 +30,7 @@ pub struct AnthropicStreamConverter {
    // Token usage (from engine)
    input_token_count: u32,
    output_token_count: u32,
+    cached_token_count: Option<u32>,
    // Tool call tracking
    tool_call_states: Vec<ToolCallState>,
    tool_calls_sent: HashSet<String>,
@@ -57,6 +58,7 @@ impl AnthropicStreamConverter {
            text_block_index: 0,
            input_token_count: 0,
            output_token_count: 0,
+            cached_token_count: None,
            tool_call_states: Vec::new(),
            tool_calls_sent: HashSet::new(),
            next_block_index: 0,
@@ -77,6 +79,8 @@ impl AnthropicStreamConverter {
            usage: AnthropicUsage {
                input_tokens: 0,
                output_tokens: 0,
+                cache_creation_input_tokens: None,
+                cache_read_input_tokens: None,
            },
        };
@@ -95,6 +99,10 @@ impl AnthropicStreamConverter {
        if let Some(usage) = &chunk.usage {
            self.input_token_count = usage.prompt_tokens;
            self.output_token_count = usage.completion_tokens;
+            self.cached_token_count = usage
+                .prompt_tokens_details
+                .as_ref()
+                .and_then(|d| d.cached_tokens);
        }
        for choice in &chunk.choices {
@@ -138,6 +146,7 @@ impl AnthropicStreamConverter {
                        index: self.text_block_index,
                        content_block: AnthropicResponseContentBlock::Text {
                            text: String::new(),
+                            citations: None,
                        },
                    };
                    events.push(make_sse_event("content_block_start", &block_start));
@@ -271,6 +280,8 @@ impl AnthropicStreamConverter {
            usage: AnthropicUsage {
                input_tokens: self.input_token_count,
                output_tokens: self.output_token_count,
+                cache_creation_input_tokens: None,
+                cache_read_input_tokens: self.cached_token_count,
            },
        };
        events.push(make_sse_event("message_delta", &message_delta));
@@ -329,6 +340,10 @@ impl AnthropicStreamConverter {
        if let Some(usage) = &chunk.usage {
            self.input_token_count = usage.prompt_tokens;
            self.output_token_count = usage.completion_tokens;
+            self.cached_token_count = usage
+                .prompt_tokens_details
+                .as_ref()
+                .and_then(|d| d.cached_tokens);
        }
        for choice in &chunk.choices {
@@ -369,6 +384,7 @@ impl AnthropicStreamConverter {
                        index: self.text_block_index,
                        content_block: AnthropicResponseContentBlock::Text {
                            text: String::new(),
+                            citations: None,
                        },
                    };
                    events.push(make_tagged_event("content_block_start", &ev));
@@ -483,6 +499,8 @@ impl AnthropicStreamConverter {
            usage: AnthropicUsage {
                input_tokens: self.input_token_count,
                output_tokens: self.output_token_count,
+                cache_creation_input_tokens: None,
+                cache_read_input_tokens: self.cached_token_count,
            },
        };
        events.push(make_tagged_event("message_delta", &ev));

--- a/lib/llm/src/protocols/anthropic/types.rs
+++ b/lib/llm/src/protocols/anthropic/types.rs
@@ -22,15 +22,26 @@ use crate::protocols::openai::chat_completions::{
    NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
 };
 use crate::protocols::openai::common_ext::CommonExt;
+use crate::protocols::openai::nvext::{CacheControl, NvExt};
 // ---------------------------------------------------------------------------
 // Custom deserializers
 // ---------------------------------------------------------------------------
+/// Parsed system prompt content, preserving cache_control from block arrays.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SystemContent {
+    /// The concatenated text from all system blocks (or the plain string).
+    pub text: String,
+    /// Cache control from the last system block that had one.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub cache_control: Option<CacheControl>,
+}
 /// Deserialize `system` from either a plain string or an array of text blocks.
 /// The Anthropic API accepts both `"system": "text"` and
-/// `"system": [{"type": "text", "text": "..."}]`.
+/// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
-fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
+fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<SystemContent>, D::Error>
 where
    D: serde::Deserializer<'de>,
 {
@@ -44,16 +55,28 @@ where
    #[derive(Deserialize)]
    struct SystemBlock {
        text: String,
+        #[serde(default)]
+        cache_control: Option<CacheControl>,
    }
    let maybe: Option<SystemPrompt> = Option::deserialize(deserializer)?;
    Ok(maybe.map(|sp| match sp {
-        SystemPrompt::Text(s) => s,
+        SystemPrompt::Text(s) => SystemContent {
-        SystemPrompt::Blocks(blocks) => blocks
+            text: s,
+            cache_control: None,
+        },
+        SystemPrompt::Blocks(blocks) => {
+            let cache_control = blocks.iter().rev().find_map(|b| b.cache_control.clone());
+            let text = blocks
                .into_iter()
                .map(|b| b.text)
                .collect::<Vec<_>>()
-            .join("\n"),
+                .join("\n");
+            SystemContent {
+                text,
+                cache_control,
+            }
+        }
    }))
 }
@@ -79,7 +102,7 @@ pub struct AnthropicCreateMessageRequest {
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
-    pub system: Option<String>,
+    pub system: Option<SystemContent>,
    /// Sampling temperature (0.0 - 1.0).
    #[serde(skip_serializing_if = "Option::is_none")]
@@ -112,6 +135,50 @@ pub struct AnthropicCreateMessageRequest {
    /// How the model should choose which tool to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<AnthropicToolChoice>,
+    /// Top-level cache control for automatic prompt prefix caching.
+    /// When present, the system caches all content up to the last cacheable block.
+    /// Matches the Anthropic Messages API automatic caching mode.
+    /// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_control: Option<CacheControl>,
+    /// Extended thinking configuration. When enabled, the model produces
+    /// `thinking` content blocks containing its internal reasoning before
+    /// the final response. The `budget_tokens` field controls how many tokens
+    /// the model may use for thinking (must be ≥ 1024 and < max_tokens).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub thinking: Option<ThinkingConfig>,
+    /// Service tier selection: `"auto"` or `"standard_only"`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub service_tier: Option<String>,
+    /// Container identifier for stateful sandbox sessions.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub container: Option<String>,
+    /// Output configuration: effort level and optional JSON schema format.
+    /// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
+    /// `format` specifies structured JSON output constraints.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub output_config: Option<serde_json::Value>,
+}
+/// Extended thinking configuration for the request.
+///
+/// When `type` is `"enabled"`, the model will produce `thinking` content blocks
+/// with its internal reasoning. `budget_tokens` controls the maximum tokens
+/// available for thinking (minimum 1024, must be less than `max_tokens`).
+/// When `type` is `"disabled"`, no thinking blocks are produced.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ThinkingConfig {
+    /// Either `"enabled"` or `"disabled"`.
+    #[serde(rename = "type")]
+    pub thinking_type: String,
+    /// Maximum tokens for internal reasoning. Only relevant when type is "enabled".
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub budget_tokens: Option<u32>,
 }
 /// A single message in the conversation.
@@ -143,15 +210,23 @@ pub enum AnthropicMessageContent {
 /// A single content block within a message.
 ///
 /// Uses a custom deserializer so that unknown block types (e.g. `citations`,
-/// `server_tool_use`, `redacted_thinking`) are captured as `Unknown` instead
+/// `web_fetch_tool_result`, `container_upload`) are captured as `Other(Value)` instead
 /// of causing a hard deserialization failure. This is important because Claude
 /// Code may send block types that we don't yet handle.
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "type")]
 pub enum AnthropicContentBlock {
-    /// Text content block.
+    /// Text content block. May optionally include `citations` — references to
+    /// source documents that support the text content. Citations are generated
+    /// by the model when document/PDF content is provided and citation mode is enabled.
    #[serde(rename = "text")]
-    Text { text: String },
+    Text {
+        text: String,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        citations: Option<Vec<serde_json::Value>>,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        cache_control: Option<CacheControl>,
+    },
    /// Image content block.
    #[serde(rename = "image")]
    Image { source: AnthropicImageSource },
@@ -161,6 +236,8 @@ pub enum AnthropicContentBlock {
        id: String,
        name: String,
        input: serde_json::Value,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        cache_control: Option<CacheControl>,
    },
    /// Tool result from user.
    #[serde(rename = "tool_result")]
@@ -170,14 +247,45 @@ pub enum AnthropicContentBlock {
        content: Option<ToolResultContent>,
        #[serde(skip_serializing_if = "Option::is_none")]
        is_error: Option<bool>,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        cache_control: Option<CacheControl>,
    },
    /// Thinking content block from assistant (extended thinking / reasoning).
    #[serde(rename = "thinking")]
-    Thinking { thinking: String, signature: String },
+    Thinking {
-    /// Catch-all for unrecognized block types. Silently accepted and skipped
+        thinking: String,
-    /// during conversion so that new Anthropic features don't break the endpoint.
+        signature: String,
-    #[serde(skip)]
+        #[serde(default, skip_serializing_if = "Option::is_none")]
-    Unknown { block_type: String },
+        cache_control: Option<CacheControl>,
+    },
+    /// Redacted thinking block from assistant. Contains encrypted reasoning data
+    /// that is opaque to the client but must be passed back verbatim in multi-turn
+    /// conversations so the model can maintain its chain of thought.
+    #[serde(rename = "redacted_thinking")]
+    RedactedThinking { data: String },
+    /// Server-initiated tool use block. Represents a tool call that the API
+    /// executes server-side (e.g., web search). The client receives the result
+    /// via a corresponding `web_search_tool_result` or similar block.
+    #[serde(rename = "server_tool_use")]
+    ServerToolUse {
+        id: String,
+        name: String,
+        #[serde(default)]
+        input: serde_json::Value,
+    },
+    /// Result from a server-initiated tool (e.g., web search results).
+    /// Contains structured content returned by the server-side tool execution.
+    #[serde(rename = "web_search_tool_result")]
+    WebSearchToolResult {
+        tool_use_id: String,
+        #[serde(default)]
+        content: serde_json::Value,
+    },
+    /// Catch-all for unrecognized block types. Preserves the full JSON value
+    /// so that new Anthropic features don't break the endpoint and can be
+    /// round-tripped or inspected.
+    #[serde(untagged)]
+    Other(serde_json::Value),
 }
 /// Content of a `tool_result` block — either a plain string or an array of
@@ -237,9 +345,21 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock {
                let text = value
                    .get("text")
                    .and_then(|t| t.as_str())
-                    .unwrap_or("")
+                    .ok_or_else(|| serde::de::Error::missing_field("text"))?
                    .to_string();
-                Ok(AnthropicContentBlock::Text { text })
+                let citations: Option<Vec<serde_json::Value>> = value
+                    .get("citations")
+                    .cloned()
+                    .and_then(|v| serde_json::from_value(v).ok());
+                let cache_control: Option<CacheControl> = value
+                    .get("cache_control")
+                    .cloned()
+                    .and_then(|v| serde_json::from_value(v).ok());
+                Ok(AnthropicContentBlock::Text {
+                    text,
+                    citations,
+                    cache_control,
+                })
            }
            "image" => {
                let source: AnthropicImageSource =
@@ -251,55 +371,112 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock {
                let id = value
                    .get("id")
                    .and_then(|v| v.as_str())
-                    .unwrap_or("")
+                    .ok_or_else(|| serde::de::Error::missing_field("id"))?
                    .to_string();
                let name = value
                    .get("name")
                    .and_then(|v| v.as_str())
-                    .unwrap_or("")
+                    .ok_or_else(|| serde::de::Error::missing_field("name"))?
                    .to_string();
                let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
-                Ok(AnthropicContentBlock::ToolUse { id, name, input })
+                let cache_control: Option<CacheControl> = value
+                    .get("cache_control")
+                    .cloned()
+                    .and_then(|v| serde_json::from_value(v).ok());
+                Ok(AnthropicContentBlock::ToolUse {
+                    id,
+                    name,
+                    input,
+                    cache_control,
+                })
            }
            "tool_result" => {
                let tool_use_id = value
                    .get("tool_use_id")
                    .and_then(|v| v.as_str())
-                    .unwrap_or("")
+                    .ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
                    .to_string();
                let content: Option<ToolResultContent> = value
                    .get("content")
                    .cloned()
                    .and_then(|v| serde_json::from_value(v).ok());
                let is_error = value.get("is_error").and_then(|v| v.as_bool());
+                let cache_control: Option<CacheControl> = value
+                    .get("cache_control")
+                    .cloned()
+                    .and_then(|v| serde_json::from_value(v).ok());
                Ok(AnthropicContentBlock::ToolResult {
                    tool_use_id,
                    content,
                    is_error,
+                    cache_control,
                })
            }
            "thinking" => {
                let thinking = value
                    .get("thinking")
                    .and_then(|v| v.as_str())
-                    .unwrap_or("")
+                    .ok_or_else(|| serde::de::Error::missing_field("thinking"))?
                    .to_string();
                let signature = value
                    .get("signature")
                    .and_then(|v| v.as_str())
-                    .unwrap_or("")
+                    .ok_or_else(|| serde::de::Error::missing_field("signature"))?
                    .to_string();
+                let cache_control: Option<CacheControl> = value
+                    .get("cache_control")
+                    .cloned()
+                    .and_then(|v| serde_json::from_value(v).ok());
                Ok(AnthropicContentBlock::Thinking {
                    thinking,
                    signature,
+                    cache_control,
                })
            }
-            other => {
+            "redacted_thinking" => {
-                tracing::debug!("Unknown Anthropic content block type '{}', skipping", other);
+                let data = value
-                Ok(AnthropicContentBlock::Unknown {
+                    .get("data")
-                    block_type: other.to_string(),
+                    .and_then(|v| v.as_str())
+                    .ok_or_else(|| serde::de::Error::missing_field("data"))?
+                    .to_string();
+                Ok(AnthropicContentBlock::RedactedThinking { data })
+            }
+            "server_tool_use" => {
+                let id = value
+                    .get("id")
+                    .and_then(|v| v.as_str())
+                    .ok_or_else(|| serde::de::Error::missing_field("id"))?
+                    .to_string();
+                let name = value
+                    .get("name")
+                    .and_then(|v| v.as_str())
+                    .ok_or_else(|| serde::de::Error::missing_field("name"))?
+                    .to_string();
+                let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
+                Ok(AnthropicContentBlock::ServerToolUse { id, name, input })
+            }
+            "web_search_tool_result" => {
+                let tool_use_id = value
+                    .get("tool_use_id")
+                    .and_then(|v| v.as_str())
+                    .ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
+                    .to_string();
+                let content = value
+                    .get("content")
+                    .cloned()
+                    .unwrap_or(serde_json::json!([]));
+                Ok(AnthropicContentBlock::WebSearchToolResult {
+                    tool_use_id,
+                    content,
                })
            }
+            other => {
+                tracing::debug!(
+                    "Unrecognized Anthropic content block type '{}', preserving as Other",
+                    other
+                );
+                Ok(AnthropicContentBlock::Other(value))
+            }
        }
    }
 }
@@ -314,12 +491,29 @@ pub struct AnthropicImageSource {
 }
 /// A tool definition.
+///
+/// Client tools (custom) require `name` + `input_schema`. Server tools
+/// (web_search, bash, text_editor, code_execution, etc.) are discriminated
+/// by their `type` field (e.g. `"web_search_20260209"`) and may not have
+/// `input_schema`. All fields beyond `name` are optional so both kinds
+/// deserialize; tools lacking `input_schema` are skipped in OpenAI conversion.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct AnthropicTool {
+    /// Tool name (required for client tools, present on server tools too).
    pub name: String,
+    /// Tool type discriminator. Client tools use `"custom"` (or omit).
+    /// Server tools use versioned types like `"web_search_20260209"`.
+    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
+    pub tool_type: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
-    pub input_schema: serde_json::Value,
+    /// JSON Schema for the tool input. Required for client tools, absent on
+    /// server tools (which define their own input shape server-side).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub input_schema: Option<serde_json::Value>,
+    /// Cache control breakpoint on this tool definition.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_control: Option<CacheControl>,
 }
 /// Tool choice specification.
@@ -338,6 +532,10 @@ pub enum AnthropicToolChoice {
 pub struct AnthropicToolChoiceSimple {
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,
+    /// When true, the model will call tools one at a time instead of
+    /// potentially issuing multiple tool calls in a single response.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub disable_parallel_tool_use: Option<bool>,
 }
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -355,6 +553,10 @@ pub struct AnthropicToolChoiceNamed {
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,
    pub name: String,
+    /// When true, the model will call tools one at a time instead of
+    /// potentially issuing multiple tool calls in a single response.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub disable_parallel_tool_use: Option<bool>,
 }
 // ---------------------------------------------------------------------------
@@ -376,17 +578,47 @@ pub struct AnthropicMessageResponse {
 }
 /// A content block in the response.
+///
+/// The Anthropic API returns up to 12 different block types. We model the
+/// common ones explicitly and catch the rest as `Other` so the proxy can
+/// forward them without losing data.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "type")]
 pub enum AnthropicResponseContentBlock {
    #[serde(rename = "text")]
-    Text { text: String },
+    Text {
+        text: String,
+        #[serde(default, skip_serializing_if = "Option::is_none")]
+        citations: Option<Vec<serde_json::Value>>,
+    },
    #[serde(rename = "tool_use")]
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
    },
+    #[serde(rename = "thinking")]
+    Thinking { thinking: String, signature: String },
+    #[serde(rename = "redacted_thinking")]
+    RedactedThinking { data: String },
+    #[serde(rename = "server_tool_use")]
+    ServerToolUse {
+        id: String,
+        name: String,
+        #[serde(default)]
+        input: serde_json::Value,
+    },
+    #[serde(rename = "web_search_tool_result")]
+    WebSearchToolResult {
+        tool_use_id: String,
+        #[serde(default)]
+        content: serde_json::Value,
+    },
+    /// Catch-all for new/uncommon block types (web_fetch_tool_result,
+    /// code_execution_tool_result, container_upload, etc.) so the proxy
+    /// can serialize them back without data loss.
+    #[serde(untagged)]
+    Other(serde_json::Value),
 }
 /// Token usage information.
@@ -394,6 +626,12 @@ pub enum AnthropicResponseContentBlock {
 pub struct AnthropicUsage {
    pub input_tokens: u32,
    pub output_tokens: u32,
+    /// Number of input tokens used to create a new cache entry.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_creation_input_tokens: Option<u32>,
+    /// Number of input tokens read from the prompt cache (prefix cache hits).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub cache_read_input_tokens: Option<u32>,
 }
 /// Reason the model stopped generating.
@@ -404,6 +642,11 @@ pub enum AnthropicStopReason {
    MaxTokens,
    StopSequence,
    ToolUse,
+    /// The model paused to yield control in an agentic loop, intending to
+    /// continue in a subsequent turn. Used with extended thinking / tool use.
+    PauseTurn,
+    /// The model refused to generate content (safety refusal).
+    Refusal,
 }
 // ---------------------------------------------------------------------------
@@ -453,6 +696,15 @@ pub enum AnthropicDelta {
    TextDelta { text: String },
    #[serde(rename = "input_json_delta")]
    InputJsonDelta { partial_json: String },
+    /// Incremental thinking content during extended thinking streaming.
+    #[serde(rename = "thinking_delta")]
+    ThinkingDelta { thinking: String },
+    /// Incremental signature for a thinking block (sent at the end).
+    #[serde(rename = "signature_delta")]
+    SignatureDelta { signature: String },
+    /// Incremental citation attached to a text block.
+    #[serde(rename = "citations_delta")]
+    CitationsDelta { citation: serde_json::Value },
 }
 /// The delta body in a message_delta event.
@@ -529,10 +781,12 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
        let mut messages = Vec::new();
        // Prepend system message if present
-        if let Some(system_text) = &req.system {
+        if let Some(system_content) = &req.system {
            messages.push(ChatCompletionRequestMessage::System(
                ChatCompletionRequestSystemMessage {
-                    content: ChatCompletionRequestSystemMessageContent::Text(system_text.clone()),
+                    content: ChatCompletionRequestSystemMessageContent::Text(
+                        system_content.text.clone(),
+                    ),
                    name: None,
                },
            ));
@@ -610,7 +864,41 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
                top_k: req.top_k.map(|k| k as i32),
                ..Default::default()
            },
-            nvext: None,
+            nvext: {
+                // Collect per-block cache_control: use the last one found
+                let mut last_block_cc: Option<CacheControl> = None;
+                for msg in &req.messages {
+                    if let AnthropicMessageContent::Blocks { content } = &msg.content {
+                        for block in content {
+                            let block_cc = match block {
+                                AnthropicContentBlock::Text { cache_control, .. } => {
+                                    cache_control.as_ref()
+                                }
+                                AnthropicContentBlock::ToolUse { cache_control, .. } => {
+                                    cache_control.as_ref()
+                                }
+                                AnthropicContentBlock::ToolResult { cache_control, .. } => {
+                                    cache_control.as_ref()
+                                }
+                                AnthropicContentBlock::Thinking { cache_control, .. } => {
+                                    cache_control.as_ref()
+                                }
+                                _ => None,
+                            };
+                            if let Some(cc) = block_cc {
+                                last_block_cc = Some(cc.clone());
+                            }
+                        }
+                    }
+                }
+                // Merge: top-level > per-block > system block cache_control
+                let system_cc = req.system.as_ref().and_then(|s| s.cache_control.clone());
+                let effective_cc = req.cache_control.clone().or(last_block_cc).or(system_cc);
+                effective_cc.map(|cc| NvExt {
+                    cache_control: Some(cc),
+                    ..Default::default()
+                })
+            },
            chat_template_args: None,
            media_io_kwargs: None,
            unsupported_fields: Default::default(),
@@ -629,7 +917,7 @@ fn convert_user_blocks(
    for block in blocks {
        match block {
-            AnthropicContentBlock::Text { text } => {
+            AnthropicContentBlock::Text { text, .. } => {
                text_parts.push(text.clone());
            }
            AnthropicContentBlock::ToolResult {
@@ -664,8 +952,11 @@ fn convert_user_blocks(
            }
            AnthropicContentBlock::ToolUse { .. }
            | AnthropicContentBlock::Thinking { .. }
-            | AnthropicContentBlock::Unknown { .. } => {
+            | AnthropicContentBlock::RedactedThinking { .. }
-                // tool_use/thinking/unknown in a user message: skip
+            | AnthropicContentBlock::ServerToolUse { .. }
+            | AnthropicContentBlock::WebSearchToolResult { .. }
+            | AnthropicContentBlock::Other(_) => {
+                // tool_use/thinking/server-side blocks/unknown in a user message: skip
            }
        }
    }
@@ -715,7 +1006,7 @@ fn convert_assistant_blocks(
    for block in blocks {
        match block {
-            AnthropicContentBlock::Text { text } => {
+            AnthropicContentBlock::Text { text, .. } => {
                text_content.push_str(text);
            }
            AnthropicContentBlock::Thinking { thinking, .. } => {
@@ -724,8 +1015,21 @@ fn convert_assistant_blocks(
                }
                pending_reasoning.push_str(thinking);
            }
-            AnthropicContentBlock::ToolUse { id, name, input } => {
+            AnthropicContentBlock::RedactedThinking { .. } => {
+                // Redacted thinking is encrypted model reasoning that we can't
+                // read. This arm intentionally emits nothing, so the block is
+                // dropped from the converted request. The encrypted data would
+                // need to be passed back to the model for multi-turn continuity.
+            }
+            AnthropicContentBlock::ToolUse {
+                id, name, input, ..
+            }
+            | AnthropicContentBlock::ServerToolUse {
+                id, name, input, ..
+            } => {
                // Snapshot the reasoning that preceded this tool call.
+                // Server-initiated tool use (e.g. web search) is treated the
+                // same as client tool use for conversion purposes.
                segments.push(std::mem::take(&mut pending_reasoning));
                tool_calls.push(ChatCompletionMessageToolCall {
                    id: id.clone(),
@@ -798,15 +1102,28 @@ fn convert_assistant_blocks(
 fn convert_anthropic_tools(tools: &[AnthropicTool]) -> Vec<ChatCompletionTool> {
    tools
        .iter()
-        .map(|tool| ChatCompletionTool {
+        .filter_map(|tool| {
+            // Only tools that declare an input_schema can be expressed as an
+            // OpenAI function tool. Tools without one (server tools such as
+            // web_search or bash) are logged at debug level and left out.
+            let schema = match &tool.input_schema {
+                Some(schema) => schema.clone(),
+                None => {
+                    tracing::debug!(
+                        tool_name = %tool.name,
+                        tool_type = ?tool.tool_type,
+                        "Skipping server tool in OpenAI conversion (no input_schema)"
+                    );
+                    return None;
+                }
+            };
+            Some(ChatCompletionTool {
                r#type: ChatCompletionToolType::Function,
                function: FunctionObject {
                    name: tool.name.clone(),
                    description: tool.description.clone(),
-                parameters: Some(tool.input_schema.clone()),
+                    parameters: Some(schema),
                    strict: None,
                },
            })
+        })
        .collect()
 }
@@ -877,6 +1194,20 @@ pub fn chat_completion_to_anthropic_response(
            }
        }
+        // Extract reasoning content (from --dyn-reasoning-parser, e.g. qwen3).
+        // The backend strips <think>...</think> from the text and surfaces it
+        // as reasoning_content on the message. Map this to a Thinking block
+        // so clients see proper extended thinking in the Anthropic response.
+        if let Some(thinking) = choice.message.reasoning_content.filter(|t| !t.is_empty()) {
+            content.insert(
+                0,
+                AnthropicResponseContentBlock::Thinking {
+                    thinking,
+                    signature: String::new(),
+                },
+            );
+        }
        // Extract text content
        let text = match choice.message.content {
            Some(dynamo_async_openai::types::ChatCompletionMessageContent::Text(t)) => Some(t),
@@ -889,8 +1220,11 @@ pub fn chat_completion_to_anthropic_response(
            None => None,
        };
        if let Some(text) = text {
-            // Text goes first in the content array
+            // Text goes after thinking block (if any)
-            content.insert(0, AnthropicResponseContentBlock::Text { text });
+            content.push(AnthropicResponseContentBlock::Text {
+                text,
+                citations: None,
+            });
        }
    }
@@ -898,15 +1232,24 @@ pub fn chat_completion_to_anthropic_response(
    if content.is_empty() {
        content.push(AnthropicResponseContentBlock::Text {
            text: String::new(),
+            citations: None,
        });
    }
    // Map usage
    let usage = chat_resp
        .usage
-        .map(|u| AnthropicUsage {
+        .map(|u| {
+            let cache_read_input_tokens = u
+                .prompt_tokens_details
+                .and_then(|d| d.cached_tokens)
+                .filter(|&n| n > 0);
+            AnthropicUsage {
                input_tokens: u.prompt_tokens,
                output_tokens: u.completion_tokens,
+                cache_creation_input_tokens: None, // Not available from OpenAI format
+                cache_read_input_tokens,
+            }
        })
        .unwrap_or_default();
@@ -936,7 +1279,7 @@ pub struct AnthropicCountTokensRequest {
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
-    pub system: Option<String>,
+    pub system: Option<SystemContent>,
    #[serde(default)]
    pub tools: Option<Vec<AnthropicTool>>,
 }
@@ -953,7 +1296,7 @@ impl AnthropicCountTokensRequest {
        let mut total_len: usize = 0;
        if let Some(system) = &self.system {
-            total_len += system.len();
+            total_len += system.text.len();
        }
        for msg in &self.messages {
@@ -979,7 +1322,9 @@ impl AnthropicCountTokensRequest {
                if let Some(desc) = &tool.description {
                    total_len += desc.len();
                }
-                total_len += tool.input_schema.to_string().len();
+                if let Some(schema) = &tool.input_schema {
+                    total_len += schema.to_string().len();
+                }
            }
        }
@@ -994,7 +1339,7 @@ impl AnthropicCountTokensRequest {
 fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
    match block {
-        AnthropicContentBlock::Text { text } => text.len(),
+        AnthropicContentBlock::Text { text, .. } => text.len(),
        AnthropicContentBlock::ToolUse { name, input, .. } => name.len() + input.to_string().len(),
        AnthropicContentBlock::ToolResult { content, .. } => content
            .as_ref()
@@ -1010,8 +1355,13 @@ fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
            })
            .unwrap_or(0),
        AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
+        AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),
+        AnthropicContentBlock::ServerToolUse { name, input, .. } => {
+            name.len() + input.to_string().len()
+        }
+        AnthropicContentBlock::WebSearchToolResult { content, .. } => content.to_string().len(),
        AnthropicContentBlock::Image { .. } => 256, // rough estimate for image metadata
-        AnthropicContentBlock::Unknown { .. } => 0,
+        AnthropicContentBlock::Other(v) => v.to_string().len(),
    }
 }
@@ -1043,6 +1393,11 @@ mod tests {
            metadata: None,
            tools: None,
            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        };
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
@@ -1073,7 +1428,10 @@ mod tests {
                    content: "Hi".into(),
                },
            }],
-            system: Some("You are helpful.".into()),
+            system: Some(SystemContent {
+                text: "You are helpful.".into(),
+                cache_control: None,
+            }),
            temperature: None,
            top_p: None,
            top_k: None,
@@ -1082,6 +1440,11 @@ mod tests {
            metadata: None,
            tools: None,
            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        };
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
@@ -1115,6 +1478,7 @@ mod tests {
                            id: "tool_123".into(),
                            name: "get_weather".into(),
                            input: serde_json::json!({"location": "SF"}),
+                            cache_control: None,
                        }],
                    },
                },
@@ -1125,6 +1489,7 @@ mod tests {
                            tool_use_id: "tool_123".into(),
                            content: Some(ToolResultContent::Text("72F and sunny".into())),
                            is_error: None,
+                            cache_control: None,
                        }],
                    },
                },
@@ -1138,6 +1503,11 @@ mod tests {
            metadata: None,
            tools: None,
            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        };
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
@@ -1176,6 +1546,11 @@ mod tests {
            metadata: None,
            tools: None,
            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        };
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
@@ -1202,16 +1577,24 @@ mod tests {
            metadata: None,
            tools: Some(vec![AnthropicTool {
                name: "get_weather".into(),
+                tool_type: None,
                description: Some("Get weather info".into()),
-                input_schema: serde_json::json!({
+                input_schema: Some(serde_json::json!({
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"]
-                }),
+                })),
+                cache_control: None,
            }]),
            tool_choice: Some(AnthropicToolChoice::Simple(AnthropicToolChoiceSimple {
                choice_type: AnthropicToolChoiceMode::Auto,
+                disable_parallel_tool_use: None,
            })),
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        };
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
@@ -1274,7 +1657,7 @@ mod tests {
        assert_eq!(response.usage.output_tokens, 5);
        assert_eq!(response.content.len(), 1);
        match &response.content[0] {
-            AnthropicResponseContentBlock::Text { text } => {
+            AnthropicResponseContentBlock::Text { text, .. } => {
                assert_eq!(text, "Hello!");
            }
            _ => panic!("expected text block"),
@@ -1335,6 +1718,7 @@ mod tests {
                    AnthropicContentBlock::Thinking {
                        thinking,
                        signature,
+                        ..
                    } => {
                        assert_eq!(thinking, "Let me reason about this...");
                        assert_eq!(signature, "sig123");
@@ -1358,9 +1742,12 @@ mod tests {
                        AnthropicContentBlock::Thinking {
                            thinking: "I should think...".into(),
                            signature: "sig".into(),
+                            cache_control: None,
                        },
                        AnthropicContentBlock::Text {
                            text: "Answer".into(),
+                            citations: None,
+                            cache_control: None,
                        },
                    ],
                },
@@ -1374,6 +1761,11 @@ mod tests {
            metadata: None,
            tools: None,
            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        };
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
@@ -1395,7 +1787,7 @@ mod tests {
    }
    #[test]
-    fn test_unknown_block_type_does_not_fail() {
+    fn test_known_and_unknown_block_types() {
        let json = r#"{
            "model": "test",
            "max_tokens": 100,
@@ -1405,6 +1797,8 @@ mod tests {
                    {"type": "text", "text": "hello"},
                    {"type": "server_tool_use", "id": "stu_1", "name": "web_search", "input": {}},
                    {"type": "redacted_thinking", "data": "encrypted"},
+                    {"type": "web_search_tool_result", "tool_use_id": "stu_1", "content": [{"type": "web_search_result", "url": "https://example.com"}]},
+                    {"type": "future_block_type", "some_field": 42},
                    {"type": "text", "text": "world"}
                ]
            }]
@@ -1412,22 +1806,32 @@ mod tests {
        let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
        match &req.messages[0].content {
            AnthropicMessageContent::Blocks { content } => {
-                assert_eq!(content.len(), 4);
+                assert_eq!(content.len(), 6);
                assert!(matches!(&content[0], AnthropicContentBlock::Text { .. }));
                assert!(matches!(
                    &content[1],
-                    AnthropicContentBlock::Unknown { block_type } if block_type == "server_tool_use"
+                    AnthropicContentBlock::ServerToolUse { name, .. } if name == "web_search"
                ));
                assert!(matches!(
                    &content[2],
-                    AnthropicContentBlock::Unknown { block_type } if block_type == "redacted_thinking"
+                    AnthropicContentBlock::RedactedThinking { data } if data == "encrypted"
+                ));
+                assert!(matches!(
+                    &content[3],
+                    AnthropicContentBlock::WebSearchToolResult { tool_use_id, .. } if tool_use_id == "stu_1"
+                ));
+                // Truly unknown types still fall through to Other with full JSON preserved
+                assert!(matches!(
+                    &content[4],
+                    AnthropicContentBlock::Other(v) if v.get("type").and_then(|t| t.as_str()) == Some("future_block_type")
                ));
-                assert!(matches!(&content[3], AnthropicContentBlock::Text { .. }));
+                assert!(matches!(&content[5], AnthropicContentBlock::Text { .. }));
            }
            _ => panic!("expected blocks content"),
        }
-        // Conversion should succeed, skipping unknown blocks
+        // Conversion should succeed — server_tool_use becomes a tool call,
+        // redacted_thinking and web_search_tool_result are preserved gracefully
        let chat_req: NvCreateChatCompletionRequest = AnthropicCreateMessageRequest {
            model: "test".into(),
            max_tokens: 100,
@@ -1441,10 +1845,25 @@ mod tests {
            metadata: None,
            tools: None,
            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        }
        .try_into()
        .unwrap();
+        // server_tool_use becomes a tool call on the assistant message
        assert_eq!(chat_req.inner.messages.len(), 1);
+        match &chat_req.inner.messages[0] {
+            ChatCompletionRequestMessage::Assistant(a) => {
+                assert!(a.tool_calls.is_some());
+                let tc = a.tool_calls.as_ref().unwrap();
+                assert_eq!(tc.len(), 1);
+                assert_eq!(tc[0].function.name, "web_search");
+            }
+            other => panic!("expected assistant, got {other:?}"),
+        }
    }
    #[test]
@@ -1510,7 +1929,10 @@ mod tests {
                    content: "Hello, world! This is a test message.".into(),
                },
            }],
-            system: Some("You are helpful.".into()),
+            system: Some(SystemContent {
+                text: "You are helpful.".into(),
+                cache_control: None,
+            }),
            tools: None,
        };
@@ -1539,6 +1961,11 @@ mod tests {
            metadata: None,
            tools: None,
            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
        };
        let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
        match chat_req.inner.messages.into_iter().next().unwrap() {
@@ -1552,6 +1979,7 @@ mod tests {
            id: id.into(),
            name: "fn".into(),
            input: serde_json::json!({}),
+            cache_control: None,
        }
    }
@@ -1559,6 +1987,7 @@ mod tests {
        AnthropicContentBlock::Thinking {
            thinking: text.into(),
            signature: "sig".into(),
+            cache_control: None,
        }
    }
@@ -1686,6 +2115,8 @@ mod tests {
            thinking("A"),
            AnthropicContentBlock::Text {
                text: "answer".into(),
+                citations: None,
+                cache_control: None,
            },
        ]);
@@ -1776,4 +2207,387 @@ mod tests {
        assert_eq!(tools[0].id, "t1");
        assert_eq!(tools[1].id, "t2");
    }
+    #[test]
+    fn test_cache_control_passthrough() {
+        use crate::protocols::openai::nvext::{CacheControl, CacheControlType};
+        // Top-level ephemeral cache_control with no explicit TTL.
+        let top_level = CacheControl {
+            control_type: CacheControlType::Ephemeral,
+            ttl: None,
+        };
+        let user_turn = AnthropicMessage {
+            role: AnthropicRole::User,
+            content: AnthropicMessageContent::Text {
+                content: "Hello".into(),
+            },
+        };
+        let request = AnthropicCreateMessageRequest {
+            model: "test-model".into(),
+            max_tokens: 100,
+            messages: vec![user_turn],
+            system: None,
+            temperature: None,
+            top_p: None,
+            top_k: None,
+            stop_sequences: None,
+            stream: false,
+            metadata: None,
+            tools: None,
+            tool_choice: None,
+            cache_control: Some(top_level),
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
+        };
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        // The setting must surface as nvext.cache_control on the converted request.
+        let cache = converted
+            .nvext
+            .expect("nvext should be set")
+            .cache_control
+            .expect("nvext.cache_control should be set");
+        assert_eq!(cache.control_type, CacheControlType::Ephemeral);
+        assert_eq!(cache.ttl_seconds(), 300);
+    }
+    #[test]
+    fn test_cache_control_1h_ttl_passthrough() {
+        use crate::protocols::openai::nvext::CacheControlType;
+        // Same as the default-TTL case, but the request asks for a "1h" TTL.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{"role": "user", "content": "Hello"}],
+            "cache_control": {"type": "ephemeral", "ttl": "1h"}
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        assert!(request.cache_control.is_some());
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let cache = converted
+            .nvext
+            .expect("nvext should be set")
+            .cache_control
+            .expect("nvext.cache_control should be set");
+        assert_eq!(cache.control_type, CacheControlType::Ephemeral);
+        assert_eq!(cache.ttl_seconds(), 3600);
+    }
+    #[test]
+    fn test_no_cache_control_passthrough() {
+        // A request with no cache_control anywhere must not produce an nvext.
+        let user_turn = AnthropicMessage {
+            role: AnthropicRole::User,
+            content: AnthropicMessageContent::Text {
+                content: "Hello".into(),
+            },
+        };
+        let request = AnthropicCreateMessageRequest {
+            model: "test-model".into(),
+            max_tokens: 100,
+            messages: vec![user_turn],
+            system: None,
+            temperature: None,
+            top_p: None,
+            top_k: None,
+            stop_sequences: None,
+            stream: false,
+            metadata: None,
+            tools: None,
+            tool_choice: None,
+            cache_control: None,
+            thinking: None,
+            service_tier: None,
+            container: None,
+            output_config: None,
+        };
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        assert!(converted.nvext.is_none());
+    }
+    #[test]
+    fn test_per_block_cache_control_deserialization() {
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Hello", "cache_control": {"type": "ephemeral"}},
+                    {"type": "text", "text": "World"}
+                ]
+            }]
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let blocks = match &request.messages[0].content {
+            AnthropicMessageContent::Blocks { content } => content,
+            _ => panic!("expected blocks"),
+        };
+        // The first text block carries cache_control ...
+        match &blocks[0] {
+            AnthropicContentBlock::Text { cache_control, .. } => {
+                assert!(cache_control.is_some());
+            }
+            other => panic!("expected Text, got {other:?}"),
+        }
+        // ... and the second one does not.
+        match &blocks[1] {
+            AnthropicContentBlock::Text { cache_control, .. } => {
+                assert!(cache_control.is_none());
+            }
+            other => panic!("expected Text, got {other:?}"),
+        }
+    }
+    #[test]
+    fn test_per_block_cache_control_last_wins() {
+        use crate::protocols::openai::nvext::CacheControlType;
+        // Two blocks both set cache_control; only the second asks for a 1h TTL.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "system context", "cache_control": {"type": "ephemeral"}},
+                        {"type": "text", "text": "recent context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
+                    ]
+                }
+            ]
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let cache = converted
+            .nvext
+            .expect("nvext should be set")
+            .cache_control
+            .expect("cache_control should be set");
+        assert_eq!(cache.control_type, CacheControlType::Ephemeral);
+        // The later block's 1h TTL is the one expected on the converted request.
+        assert_eq!(cache.ttl_seconds(), 3600);
+    }
+    #[test]
+    fn test_top_level_cache_control_overrides_per_block() {
+        // The block asks for a 1h TTL while the top-level field sets no TTL.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
+                    ]
+                }
+            ],
+            "cache_control": {"type": "ephemeral"}
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let cache = converted
+            .nvext
+            .expect("nvext should be set")
+            .cache_control
+            .expect("cache_control should be set");
+        // The top-level setting (no TTL, i.e. 300s) is expected, not the block's 1h.
+        assert_eq!(cache.ttl_seconds(), 300);
+    }
+    #[test]
+    fn test_system_block_array_with_cache_control() {
+        use crate::protocols::openai::nvext::CacheControlType;
+        // `system` is given as an array of text blocks; only the first one
+        // carries cache_control.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{"role": "user", "content": "Hello"}],
+            "system": [
+                {"type": "text", "text": "You are a helpful assistant.", "cache_control": {"type": "ephemeral"}},
+                {"type": "text", "text": "Be concise."}
+            ]
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let merged = request.system.as_ref().unwrap();
+        // Block texts are expected joined with a newline.
+        assert_eq!(merged.text, "You are a helpful assistant.\nBe concise.");
+        // The cache_control from the first block must survive the merge.
+        assert!(merged.cache_control.is_some());
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        let cache = converted
+            .nvext
+            .expect("nvext should be set from system cache_control")
+            .cache_control
+            .expect("cache_control should be set");
+        assert_eq!(cache.control_type, CacheControlType::Ephemeral);
+    }
+    #[test]
+    fn test_system_string_no_cache_control() {
+        // A plain-string `system` keeps its text and has no cache_control.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{"role": "user", "content": "Hello"}],
+            "system": "You are helpful."
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let parsed = request.system.as_ref().unwrap();
+        assert_eq!(parsed.text, "You are helpful.");
+        assert!(parsed.cache_control.is_none());
+    }
+    #[test]
+    fn test_text_block_with_citations() {
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "According to the document...",
+                        "citations": [
+                            {"type": "char_location", "cited_text": "relevant text", "document_index": 0, "start_char_index": 0, "end_char_index": 13}
+                        ]
+                    }
+                ]
+            }]
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let blocks = match &request.messages[0].content {
+            AnthropicMessageContent::Blocks { content } => content,
+            _ => panic!("expected blocks"),
+        };
+        let citations = match &blocks[0] {
+            AnthropicContentBlock::Text { citations, .. } => citations,
+            other => panic!("expected Text, got {other:?}"),
+        };
+        // The single citation from the payload must be kept on the text block.
+        assert!(citations.is_some());
+        let entries = citations.as_ref().unwrap();
+        assert_eq!(entries.len(), 1);
+        assert_eq!(entries[0]["type"], "char_location");
+    }
+    #[test]
+    fn test_redacted_thinking_block() {
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{
+                "role": "assistant",
+                "content": [
+                    {"type": "thinking", "thinking": "visible reasoning", "signature": "sig1"},
+                    {"type": "redacted_thinking", "data": "base64-encrypted-data"},
+                    {"type": "text", "text": "Final answer"}
+                ]
+            }]
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let blocks = match &request.messages[0].content {
+            AnthropicMessageContent::Blocks { content } => content,
+            _ => panic!("expected blocks"),
+        };
+        // All three blocks deserialize, in order, into their dedicated variants.
+        assert_eq!(blocks.len(), 3);
+        assert!(matches!(
+            &blocks[0],
+            AnthropicContentBlock::Thinking { .. }
+        ));
+        match &blocks[1] {
+            AnthropicContentBlock::RedactedThinking { data } => {
+                assert_eq!(data, "base64-encrypted-data");
+            }
+            other => panic!("expected RedactedThinking, got {other:?}"),
+        }
+        assert!(matches!(&blocks[2], AnthropicContentBlock::Text { .. }));
+    }
+    #[test]
+    fn test_server_tool_use_and_web_search_result() {
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{
+                "role": "assistant",
+                "content": [
+                    {"type": "server_tool_use", "id": "stu_1", "name": "web_search", "input": {"query": "rust programming"}},
+                    {"type": "web_search_tool_result", "tool_use_id": "stu_1", "content": [{"type": "web_search_result", "url": "https://www.rust-lang.org", "title": "Rust"}]},
+                    {"type": "text", "text": "Based on my search..."}
+                ]
+            }]
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let blocks = match &request.messages[0].content {
+            AnthropicMessageContent::Blocks { content } => content,
+            _ => panic!("expected blocks"),
+        };
+        assert_eq!(blocks.len(), 3);
+        // Deserialization: the server tool call keeps its id, name and input.
+        match &blocks[0] {
+            AnthropicContentBlock::ServerToolUse { id, name, input } => {
+                assert_eq!(id, "stu_1");
+                assert_eq!(name, "web_search");
+                assert_eq!(input["query"], "rust programming");
+            }
+            other => panic!("expected ServerToolUse, got {other:?}"),
+        }
+        // Deserialization: the search result references that call and holds an array.
+        match &blocks[1] {
+            AnthropicContentBlock::WebSearchToolResult {
+                tool_use_id,
+                content: results,
+            } => {
+                assert_eq!(tool_use_id, "stu_1");
+                assert!(results.is_array());
+            }
+            other => panic!("expected WebSearchToolResult, got {other:?}"),
+        }
+        // Conversion: the server tool use must show up as one assistant tool call.
+        let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
+        match &converted.inner.messages[0] {
+            ChatCompletionRequestMessage::Assistant(assistant) => {
+                let calls = assistant.tool_calls.as_ref().expect("should have tool calls");
+                assert_eq!(calls.len(), 1);
+                assert_eq!(calls[0].id, "stu_1");
+                assert_eq!(calls[0].function.name, "web_search");
+            }
+            other => panic!("expected assistant, got {other:?}"),
+        }
+    }
+    #[test]
+    fn test_thinking_config_deserialization() {
+        // An enabled thinking config keeps both its type and its token budget.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 16000,
+            "messages": [{"role": "user", "content": "Solve this step by step"}],
+            "thinking": {"type": "enabled", "budget_tokens": 10000}
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let config = request.thinking.as_ref().expect("thinking should be set");
+        assert_eq!(config.thinking_type, "enabled");
+        assert_eq!(config.budget_tokens, Some(10000));
+    }
+    #[test]
+    fn test_thinking_config_disabled() {
+        // A disabled thinking config parses without a token budget.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{"role": "user", "content": "Hello"}],
+            "thinking": {"type": "disabled"}
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        let config = request.thinking.as_ref().expect("thinking should be set");
+        assert_eq!(config.thinking_type, "disabled");
+        assert!(config.budget_tokens.is_none());
+    }
+    #[test]
+    fn test_disable_parallel_tool_use() {
+        // `disable_parallel_tool_use` on a simple tool_choice must be parsed.
+        let payload = r#"{
+            "model": "test",
+            "max_tokens": 100,
+            "messages": [{"role": "user", "content": "Hello"}],
+            "tools": [{"name": "get_weather", "description": "Get weather", "input_schema": {"type": "object"}}],
+            "tool_choice": {"type": "auto", "disable_parallel_tool_use": true}
+        }"#;
+        let request: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
+        match &request.tool_choice {
+            Some(AnthropicToolChoice::Simple(simple)) => {
+                assert_eq!(simple.choice_type, AnthropicToolChoiceMode::Auto);
+                assert_eq!(simple.disable_parallel_tool_use, Some(true));
+            }
+            other => panic!("expected Simple tool choice, got {other:?}"),
+        }
+    }
 }
--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -92,6 +92,10 @@ impl NvExtProvider for NvCreateChatCompletionRequest {
    fn raw_prompt(&self) -> Option<String> {
        None
    }
+    /// Returns the cache control carried in this request's `nvext`, if any.
+    /// This reads the same location as the `NvExtProvider` default method.
+    fn effective_cache_control(&self) -> Option<&crate::protocols::openai::nvext::CacheControl> {
+        NvExtProvider::nvext(self)?.cache_control.as_ref()
+    }
 }
 /// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`,

--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -49,6 +49,13 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap)
 pub trait NvExtProvider {
    fn nvext(&self) -> Option<&NvExt>;
    fn raw_prompt(&self) -> Option<String>;
+    /// Returns the cache control that applies to this request, if any.
+    ///
+    /// The default implementation reads `cache_control` from the request's
+    /// `nvext` and yields `None` when `nvext` is absent or carries no cache
+    /// control. Implementations may override this to consult other fields.
+    fn effective_cache_control(&self) -> Option<&CacheControl> {
+        self.nvext()?.cache_control.as_ref()
+    }
 }
 /// Worker ID information for disaggregated serving