Unverified commit 4d3e1ae3, authored by MatejKosec, committed by GitHub
Browse files

feat: Full Anthropic Messages API cache_control support (top-level, per-block,...


feat: Full Anthropic Messages API cache_control support (top-level, per-block, system block arrays) (#6629)
Signed-off-by: Matej Kosec <mkosec@nvidia.com>
parent a3cf35c3
...@@ -272,6 +272,9 @@ impl OpenAIPreprocessor { ...@@ -272,6 +272,9 @@ impl OpenAIPreprocessor {
builder.mdc_sum(Some(self.mdcsum.clone())); builder.mdc_sum(Some(self.mdcsum.clone()));
let lora_name = self.lora_name.clone(); let lora_name = self.lora_name.clone();
// Extract cache_control TTL from either nvext or top-level field
let cache_control_ttl = request.effective_cache_control().map(|cc| cc.ttl_seconds());
// Extract routing hints from nvext if present // Extract routing hints from nvext if present
if let Some(nvext) = request.nvext() { if let Some(nvext) = request.nvext() {
// Build routing hints from nvext fields // Build routing hints from nvext fields
...@@ -289,10 +292,12 @@ impl OpenAIPreprocessor { ...@@ -289,10 +292,12 @@ impl OpenAIPreprocessor {
allowed_worker_ids: None, allowed_worker_ids: None,
}; };
builder.routing(Some(routing)); builder.routing(Some(routing));
} else if lora_name.is_some() { } else if lora_name.is_some() || cache_control_ttl.is_some() {
// Ensure LoRA-aware routing still gets hints even when nvext is absent. // Ensure routing hints exist when we have LoRA or cache_control,
// even when nvext is absent (e.g. Anthropic endpoint requests).
builder.routing(Some(RoutingHints { builder.routing(Some(RoutingHints {
lora_name, lora_name,
cache_control_ttl,
..Default::default() ..Default::default()
})); }));
} }
......
...@@ -30,6 +30,7 @@ pub struct AnthropicStreamConverter { ...@@ -30,6 +30,7 @@ pub struct AnthropicStreamConverter {
// Token usage (from engine) // Token usage (from engine)
input_token_count: u32, input_token_count: u32,
output_token_count: u32, output_token_count: u32,
cached_token_count: Option<u32>,
// Tool call tracking // Tool call tracking
tool_call_states: Vec<ToolCallState>, tool_call_states: Vec<ToolCallState>,
tool_calls_sent: HashSet<String>, tool_calls_sent: HashSet<String>,
...@@ -57,6 +58,7 @@ impl AnthropicStreamConverter { ...@@ -57,6 +58,7 @@ impl AnthropicStreamConverter {
text_block_index: 0, text_block_index: 0,
input_token_count: 0, input_token_count: 0,
output_token_count: 0, output_token_count: 0,
cached_token_count: None,
tool_call_states: Vec::new(), tool_call_states: Vec::new(),
tool_calls_sent: HashSet::new(), tool_calls_sent: HashSet::new(),
next_block_index: 0, next_block_index: 0,
...@@ -77,6 +79,8 @@ impl AnthropicStreamConverter { ...@@ -77,6 +79,8 @@ impl AnthropicStreamConverter {
usage: AnthropicUsage { usage: AnthropicUsage {
input_tokens: 0, input_tokens: 0,
output_tokens: 0, output_tokens: 0,
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
}, },
}; };
...@@ -95,6 +99,10 @@ impl AnthropicStreamConverter { ...@@ -95,6 +99,10 @@ impl AnthropicStreamConverter {
if let Some(usage) = &chunk.usage { if let Some(usage) = &chunk.usage {
self.input_token_count = usage.prompt_tokens; self.input_token_count = usage.prompt_tokens;
self.output_token_count = usage.completion_tokens; self.output_token_count = usage.completion_tokens;
self.cached_token_count = usage
.prompt_tokens_details
.as_ref()
.and_then(|d| d.cached_tokens);
} }
for choice in &chunk.choices { for choice in &chunk.choices {
...@@ -138,6 +146,7 @@ impl AnthropicStreamConverter { ...@@ -138,6 +146,7 @@ impl AnthropicStreamConverter {
index: self.text_block_index, index: self.text_block_index,
content_block: AnthropicResponseContentBlock::Text { content_block: AnthropicResponseContentBlock::Text {
text: String::new(), text: String::new(),
citations: None,
}, },
}; };
events.push(make_sse_event("content_block_start", &block_start)); events.push(make_sse_event("content_block_start", &block_start));
...@@ -271,6 +280,8 @@ impl AnthropicStreamConverter { ...@@ -271,6 +280,8 @@ impl AnthropicStreamConverter {
usage: AnthropicUsage { usage: AnthropicUsage {
input_tokens: self.input_token_count, input_tokens: self.input_token_count,
output_tokens: self.output_token_count, output_tokens: self.output_token_count,
cache_creation_input_tokens: None,
cache_read_input_tokens: self.cached_token_count,
}, },
}; };
events.push(make_sse_event("message_delta", &message_delta)); events.push(make_sse_event("message_delta", &message_delta));
...@@ -329,6 +340,10 @@ impl AnthropicStreamConverter { ...@@ -329,6 +340,10 @@ impl AnthropicStreamConverter {
if let Some(usage) = &chunk.usage { if let Some(usage) = &chunk.usage {
self.input_token_count = usage.prompt_tokens; self.input_token_count = usage.prompt_tokens;
self.output_token_count = usage.completion_tokens; self.output_token_count = usage.completion_tokens;
self.cached_token_count = usage
.prompt_tokens_details
.as_ref()
.and_then(|d| d.cached_tokens);
} }
for choice in &chunk.choices { for choice in &chunk.choices {
...@@ -369,6 +384,7 @@ impl AnthropicStreamConverter { ...@@ -369,6 +384,7 @@ impl AnthropicStreamConverter {
index: self.text_block_index, index: self.text_block_index,
content_block: AnthropicResponseContentBlock::Text { content_block: AnthropicResponseContentBlock::Text {
text: String::new(), text: String::new(),
citations: None,
}, },
}; };
events.push(make_tagged_event("content_block_start", &ev)); events.push(make_tagged_event("content_block_start", &ev));
...@@ -483,6 +499,8 @@ impl AnthropicStreamConverter { ...@@ -483,6 +499,8 @@ impl AnthropicStreamConverter {
usage: AnthropicUsage { usage: AnthropicUsage {
input_tokens: self.input_token_count, input_tokens: self.input_token_count,
output_tokens: self.output_token_count, output_tokens: self.output_token_count,
cache_creation_input_tokens: None,
cache_read_input_tokens: self.cached_token_count,
}, },
}; };
events.push(make_tagged_event("message_delta", &ev)); events.push(make_tagged_event("message_delta", &ev));
......
...@@ -22,15 +22,26 @@ use crate::protocols::openai::chat_completions::{ ...@@ -22,15 +22,26 @@ use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionResponse, NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
}; };
use crate::protocols::openai::common_ext::CommonExt; use crate::protocols::openai::common_ext::CommonExt;
use crate::protocols::openai::nvext::{CacheControl, NvExt};
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Custom deserializers // Custom deserializers
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
/// Parsed system prompt content, preserving cache_control from block arrays.
///
/// Produced by `deserialize_system_prompt` from either accepted form of the
/// request's `system` field:
/// - a plain string: `text` is that string and `cache_control` is `None`;
/// - an array of text blocks: `text` is the blocks' text joined with `"\n"`
///   and `cache_control` is taken from the last block that carries one.
///
/// NOTE(review): the derived `Serialize` writes this as an object
/// (`{"text": ..., "cache_control": ...}`), which does not look like either
/// input form accepted by `deserialize_system_prompt` — confirm whether a
/// request is ever re-serialized and parsed again.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemContent {
/// The concatenated text from all system blocks (or the plain string).
/// Multiple blocks are joined with a single newline.
pub text: String,
/// Cache control from the last system block that had one.
/// `None` for a plain-string system prompt or when no block set it;
/// omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
}
/// Deserialize `system` from either a plain string or an array of text blocks. /// Deserialize `system` from either a plain string or an array of text blocks.
/// The Anthropic API accepts both `"system": "text"` and /// The Anthropic API accepts both `"system": "text"` and
/// `"system": [{"type": "text", "text": "..."}]`. /// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<String>, D::Error> fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<SystemContent>, D::Error>
where where
D: serde::Deserializer<'de>, D: serde::Deserializer<'de>,
{ {
...@@ -44,16 +55,28 @@ where ...@@ -44,16 +55,28 @@ where
#[derive(Deserialize)] #[derive(Deserialize)]
struct SystemBlock { struct SystemBlock {
text: String, text: String,
#[serde(default)]
cache_control: Option<CacheControl>,
} }
let maybe: Option<SystemPrompt> = Option::deserialize(deserializer)?; let maybe: Option<SystemPrompt> = Option::deserialize(deserializer)?;
Ok(maybe.map(|sp| match sp { Ok(maybe.map(|sp| match sp {
SystemPrompt::Text(s) => s, SystemPrompt::Text(s) => SystemContent {
SystemPrompt::Blocks(blocks) => blocks text: s,
cache_control: None,
},
SystemPrompt::Blocks(blocks) => {
let cache_control = blocks.iter().rev().find_map(|b| b.cache_control.clone());
let text = blocks
.into_iter() .into_iter()
.map(|b| b.text) .map(|b| b.text)
.collect::<Vec<_>>() .collect::<Vec<_>>()
.join("\n"), .join("\n");
SystemContent {
text,
cache_control,
}
}
})) }))
} }
...@@ -79,7 +102,7 @@ pub struct AnthropicCreateMessageRequest { ...@@ -79,7 +102,7 @@ pub struct AnthropicCreateMessageRequest {
skip_serializing_if = "Option::is_none", skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_system_prompt" deserialize_with = "deserialize_system_prompt"
)] )]
pub system: Option<String>, pub system: Option<SystemContent>,
/// Sampling temperature (0.0 - 1.0). /// Sampling temperature (0.0 - 1.0).
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
...@@ -112,6 +135,50 @@ pub struct AnthropicCreateMessageRequest { ...@@ -112,6 +135,50 @@ pub struct AnthropicCreateMessageRequest {
/// How the model should choose which tool to call. /// How the model should choose which tool to call.
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub tool_choice: Option<AnthropicToolChoice>, pub tool_choice: Option<AnthropicToolChoice>,
/// Top-level cache control for automatic prompt prefix caching.
/// When present, the system caches all content up to the last cacheable block.
/// Matches the Anthropic Messages API automatic caching mode.
/// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
/// Extended thinking configuration. When enabled, the model produces
/// `thinking` content blocks containing its internal reasoning before
/// the final response. The `budget_tokens` field controls how many tokens
/// the model may use for thinking (must be ≥ 1024 and < max_tokens).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub thinking: Option<ThinkingConfig>,
/// Service tier selection: `"auto"` or `"standard_only"`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_tier: Option<String>,
/// Container identifier for stateful sandbox sessions.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub container: Option<String>,
/// Output configuration: effort level and optional JSON schema format.
/// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
/// `format` specifies structured JSON output constraints.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub output_config: Option<serde_json::Value>,
}
/// Extended thinking configuration for the request.
///
/// When `type` is `"enabled"`, the model will produce `thinking` content blocks
/// with its internal reasoning. `budget_tokens` controls the maximum tokens
/// available for thinking (minimum 1024, must be less than `max_tokens`).
/// When `type` is `"disabled"`, no thinking blocks are produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingConfig {
/// Either `"enabled"` or `"disabled"`.
#[serde(rename = "type")]
pub thinking_type: String,
/// Maximum tokens for internal reasoning. Only relevant when type is "enabled".
#[serde(skip_serializing_if = "Option::is_none")]
pub budget_tokens: Option<u32>,
} }
/// A single message in the conversation. /// A single message in the conversation.
...@@ -143,15 +210,23 @@ pub enum AnthropicMessageContent { ...@@ -143,15 +210,23 @@ pub enum AnthropicMessageContent {
/// A single content block within a message. /// A single content block within a message.
/// ///
/// Uses a custom deserializer so that unknown block types (e.g. `citations`, /// Uses a custom deserializer so that unknown block types (e.g. `citations`,
/// `server_tool_use`, `redacted_thinking`) are captured as `Unknown` instead /// `server_tool_use`, `redacted_thinking`) are captured as `Other(Value)` instead
/// of causing a hard deserialization failure. This is important because Claude /// of causing a hard deserialization failure. This is important because Claude
/// Code may send block types that we don't yet handle. /// Code may send block types that we don't yet handle.
#[derive(Debug, Clone, Serialize)] #[derive(Debug, Clone, Serialize)]
#[serde(tag = "type")] #[serde(tag = "type")]
pub enum AnthropicContentBlock { pub enum AnthropicContentBlock {
/// Text content block. /// Text content block. May optionally include `citations` — references to
/// source documents that support the text content. Citations are generated
/// by the model when document/PDF content is provided and citation mode is enabled.
#[serde(rename = "text")] #[serde(rename = "text")]
Text { text: String }, Text {
text: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
citations: Option<Vec<serde_json::Value>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Image content block. /// Image content block.
#[serde(rename = "image")] #[serde(rename = "image")]
Image { source: AnthropicImageSource }, Image { source: AnthropicImageSource },
...@@ -161,6 +236,8 @@ pub enum AnthropicContentBlock { ...@@ -161,6 +236,8 @@ pub enum AnthropicContentBlock {
id: String, id: String,
name: String, name: String,
input: serde_json::Value, input: serde_json::Value,
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
}, },
/// Tool result from user. /// Tool result from user.
#[serde(rename = "tool_result")] #[serde(rename = "tool_result")]
...@@ -170,14 +247,45 @@ pub enum AnthropicContentBlock { ...@@ -170,14 +247,45 @@ pub enum AnthropicContentBlock {
content: Option<ToolResultContent>, content: Option<ToolResultContent>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
is_error: Option<bool>, is_error: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
}, },
/// Thinking content block from assistant (extended thinking / reasoning). /// Thinking content block from assistant (extended thinking / reasoning).
#[serde(rename = "thinking")] #[serde(rename = "thinking")]
Thinking { thinking: String, signature: String }, Thinking {
/// Catch-all for unrecognized block types. Silently accepted and skipped thinking: String,
/// during conversion so that new Anthropic features don't break the endpoint. signature: String,
#[serde(skip)] #[serde(default, skip_serializing_if = "Option::is_none")]
Unknown { block_type: String }, cache_control: Option<CacheControl>,
},
/// Redacted thinking block from assistant. Contains encrypted reasoning data
/// that is opaque to the client but must be passed back verbatim in multi-turn
/// conversations so the model can maintain its chain of thought.
#[serde(rename = "redacted_thinking")]
RedactedThinking { data: String },
/// Server-initiated tool use block. Represents a tool call that the API
/// executes server-side (e.g., web search). The client receives the result
/// via a corresponding `web_search_tool_result` or similar block.
#[serde(rename = "server_tool_use")]
ServerToolUse {
id: String,
name: String,
#[serde(default)]
input: serde_json::Value,
},
/// Result from a server-initiated tool (e.g., web search results).
/// Contains structured content returned by the server-side tool execution.
#[serde(rename = "web_search_tool_result")]
WebSearchToolResult {
tool_use_id: String,
#[serde(default)]
content: serde_json::Value,
},
/// Catch-all for unrecognized block types. Preserves the full JSON value
/// so that new Anthropic features don't break the endpoint and can be
/// round-tripped or inspected.
#[serde(untagged)]
Other(serde_json::Value),
} }
/// Content of a `tool_result` block — either a plain string or an array of /// Content of a `tool_result` block — either a plain string or an array of
...@@ -237,9 +345,21 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock { ...@@ -237,9 +345,21 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock {
let text = value let text = value
.get("text") .get("text")
.and_then(|t| t.as_str()) .and_then(|t| t.as_str())
.unwrap_or("") .ok_or_else(|| serde::de::Error::missing_field("text"))?
.to_string(); .to_string();
Ok(AnthropicContentBlock::Text { text }) let citations: Option<Vec<serde_json::Value>> = value
.get("citations")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Text {
text,
citations,
cache_control,
})
} }
"image" => { "image" => {
let source: AnthropicImageSource = let source: AnthropicImageSource =
...@@ -251,55 +371,112 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock { ...@@ -251,55 +371,112 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock {
let id = value let id = value
.get("id") .get("id")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.unwrap_or("") .ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string(); .to_string();
let name = value let name = value
.get("name") .get("name")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.unwrap_or("") .ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string(); .to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({})); let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
Ok(AnthropicContentBlock::ToolUse { id, name, input }) let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolUse {
id,
name,
input,
cache_control,
})
} }
"tool_result" => { "tool_result" => {
let tool_use_id = value let tool_use_id = value
.get("tool_use_id") .get("tool_use_id")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.unwrap_or("") .ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string(); .to_string();
let content: Option<ToolResultContent> = value let content: Option<ToolResultContent> = value
.get("content") .get("content")
.cloned() .cloned()
.and_then(|v| serde_json::from_value(v).ok()); .and_then(|v| serde_json::from_value(v).ok());
let is_error = value.get("is_error").and_then(|v| v.as_bool()); let is_error = value.get("is_error").and_then(|v| v.as_bool());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolResult { Ok(AnthropicContentBlock::ToolResult {
tool_use_id, tool_use_id,
content, content,
is_error, is_error,
cache_control,
}) })
} }
"thinking" => { "thinking" => {
let thinking = value let thinking = value
.get("thinking") .get("thinking")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.unwrap_or("") .ok_or_else(|| serde::de::Error::missing_field("thinking"))?
.to_string(); .to_string();
let signature = value let signature = value
.get("signature") .get("signature")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.unwrap_or("") .ok_or_else(|| serde::de::Error::missing_field("signature"))?
.to_string(); .to_string();
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Thinking { Ok(AnthropicContentBlock::Thinking {
thinking, thinking,
signature, signature,
cache_control,
}) })
} }
other => { "redacted_thinking" => {
tracing::debug!("Unknown Anthropic content block type '{}', skipping", other); let data = value
Ok(AnthropicContentBlock::Unknown { .get("data")
block_type: other.to_string(), .and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("data"))?
.to_string();
Ok(AnthropicContentBlock::RedactedThinking { data })
}
"server_tool_use" => {
let id = value
.get("id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string();
let name = value
.get("name")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
Ok(AnthropicContentBlock::ServerToolUse { id, name, input })
}
"web_search_tool_result" => {
let tool_use_id = value
.get("tool_use_id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string();
let content = value
.get("content")
.cloned()
.unwrap_or(serde_json::json!([]));
Ok(AnthropicContentBlock::WebSearchToolResult {
tool_use_id,
content,
}) })
} }
other => {
tracing::debug!(
"Unrecognized Anthropic content block type '{}', preserving as Other",
other
);
Ok(AnthropicContentBlock::Other(value))
}
} }
} }
} }
...@@ -314,12 +491,29 @@ pub struct AnthropicImageSource { ...@@ -314,12 +491,29 @@ pub struct AnthropicImageSource {
} }
/// A tool definition. /// A tool definition.
///
/// Client tools (custom) require `name` + `input_schema`. Server tools
/// (web_search, bash, text_editor, code_execution, etc.) are discriminated
/// by their `type` field (e.g. `"web_search_20260209"`) and may not have
/// `input_schema`. We keep all fields optional beyond `name` so both
/// kinds deserialize successfully and pass through to the backend.
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicTool { pub struct AnthropicTool {
/// Tool name (required for client tools, present on server tools too).
pub name: String, pub name: String,
/// Tool type discriminator. Client tools use `"custom"` (or omit).
/// Server tools use versioned types like `"web_search_20260209"`.
#[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
pub tool_type: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>, pub description: Option<String>,
pub input_schema: serde_json::Value, /// JSON Schema for the tool input. Required for client tools, absent on
/// server tools (which define their own input shape server-side).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub input_schema: Option<serde_json::Value>,
/// Cache control breakpoint on this tool definition.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
} }
/// Tool choice specification. /// Tool choice specification.
...@@ -338,6 +532,10 @@ pub enum AnthropicToolChoice { ...@@ -338,6 +532,10 @@ pub enum AnthropicToolChoice {
pub struct AnthropicToolChoiceSimple { pub struct AnthropicToolChoiceSimple {
#[serde(rename = "type")] #[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode, pub choice_type: AnthropicToolChoiceMode,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
} }
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
...@@ -355,6 +553,10 @@ pub struct AnthropicToolChoiceNamed { ...@@ -355,6 +553,10 @@ pub struct AnthropicToolChoiceNamed {
#[serde(rename = "type")] #[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode, pub choice_type: AnthropicToolChoiceMode,
pub name: String, pub name: String,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
...@@ -376,17 +578,47 @@ pub struct AnthropicMessageResponse { ...@@ -376,17 +578,47 @@ pub struct AnthropicMessageResponse {
} }
/// A content block in the response. /// A content block in the response.
///
/// The Anthropic API returns up to 12 different block types. We model the
/// common ones explicitly and catch the rest as `Other` so the proxy can
/// forward them without losing data.
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")] #[serde(tag = "type")]
pub enum AnthropicResponseContentBlock { pub enum AnthropicResponseContentBlock {
#[serde(rename = "text")] #[serde(rename = "text")]
Text { text: String }, Text {
text: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
citations: Option<Vec<serde_json::Value>>,
},
#[serde(rename = "tool_use")] #[serde(rename = "tool_use")]
ToolUse { ToolUse {
id: String, id: String,
name: String, name: String,
input: serde_json::Value, input: serde_json::Value,
}, },
#[serde(rename = "thinking")]
Thinking { thinking: String, signature: String },
#[serde(rename = "redacted_thinking")]
RedactedThinking { data: String },
#[serde(rename = "server_tool_use")]
ServerToolUse {
id: String,
name: String,
#[serde(default)]
input: serde_json::Value,
},
#[serde(rename = "web_search_tool_result")]
WebSearchToolResult {
tool_use_id: String,
#[serde(default)]
content: serde_json::Value,
},
/// Catch-all for new/uncommon block types (web_fetch_tool_result,
/// code_execution_tool_result, container_upload, etc.) so the proxy
/// can serialize them back without data loss.
#[serde(untagged)]
Other(serde_json::Value),
} }
/// Token usage information. /// Token usage information.
...@@ -394,6 +626,12 @@ pub enum AnthropicResponseContentBlock { ...@@ -394,6 +626,12 @@ pub enum AnthropicResponseContentBlock {
pub struct AnthropicUsage { pub struct AnthropicUsage {
pub input_tokens: u32, pub input_tokens: u32,
pub output_tokens: u32, pub output_tokens: u32,
/// Number of input tokens used to create a new cache entry.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_creation_input_tokens: Option<u32>,
/// Number of input tokens read from the prompt cache (prefix cache hits).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_read_input_tokens: Option<u32>,
} }
/// Reason the model stopped generating. /// Reason the model stopped generating.
...@@ -404,6 +642,11 @@ pub enum AnthropicStopReason { ...@@ -404,6 +642,11 @@ pub enum AnthropicStopReason {
MaxTokens, MaxTokens,
StopSequence, StopSequence,
ToolUse, ToolUse,
/// The model paused to yield control in an agentic loop, intending to
/// continue in a subsequent turn. Used with extended thinking / tool use.
PauseTurn,
/// The model refused to generate content (safety refusal).
Refusal,
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
...@@ -453,6 +696,15 @@ pub enum AnthropicDelta { ...@@ -453,6 +696,15 @@ pub enum AnthropicDelta {
TextDelta { text: String }, TextDelta { text: String },
#[serde(rename = "input_json_delta")] #[serde(rename = "input_json_delta")]
InputJsonDelta { partial_json: String }, InputJsonDelta { partial_json: String },
/// Incremental thinking content during extended thinking streaming.
#[serde(rename = "thinking_delta")]
ThinkingDelta { thinking: String },
/// Incremental signature for a thinking block (sent at the end).
#[serde(rename = "signature_delta")]
SignatureDelta { signature: String },
/// Incremental citation attached to a text block.
#[serde(rename = "citations_delta")]
CitationsDelta { citation: serde_json::Value },
} }
/// The delta body in a message_delta event. /// The delta body in a message_delta event.
...@@ -529,10 +781,12 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest { ...@@ -529,10 +781,12 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
let mut messages = Vec::new(); let mut messages = Vec::new();
// Prepend system message if present // Prepend system message if present
if let Some(system_text) = &req.system { if let Some(system_content) = &req.system {
messages.push(ChatCompletionRequestMessage::System( messages.push(ChatCompletionRequestMessage::System(
ChatCompletionRequestSystemMessage { ChatCompletionRequestSystemMessage {
content: ChatCompletionRequestSystemMessageContent::Text(system_text.clone()), content: ChatCompletionRequestSystemMessageContent::Text(
system_content.text.clone(),
),
name: None, name: None,
}, },
)); ));
...@@ -610,7 +864,41 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest { ...@@ -610,7 +864,41 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
top_k: req.top_k.map(|k| k as i32), top_k: req.top_k.map(|k| k as i32),
..Default::default() ..Default::default()
}, },
nvext: None, nvext: {
// Collect per-block cache_control: use the last one found
let mut last_block_cc: Option<CacheControl> = None;
for msg in &req.messages {
if let AnthropicMessageContent::Blocks { content } = &msg.content {
for block in content {
let block_cc = match block {
AnthropicContentBlock::Text { cache_control, .. } => {
cache_control.as_ref()
}
AnthropicContentBlock::ToolUse { cache_control, .. } => {
cache_control.as_ref()
}
AnthropicContentBlock::ToolResult { cache_control, .. } => {
cache_control.as_ref()
}
AnthropicContentBlock::Thinking { cache_control, .. } => {
cache_control.as_ref()
}
_ => None,
};
if let Some(cc) = block_cc {
last_block_cc = Some(cc.clone());
}
}
}
}
// Merge: top-level > per-block > system block cache_control
let system_cc = req.system.as_ref().and_then(|s| s.cache_control.clone());
let effective_cc = req.cache_control.clone().or(last_block_cc).or(system_cc);
effective_cc.map(|cc| NvExt {
cache_control: Some(cc),
..Default::default()
})
},
chat_template_args: None, chat_template_args: None,
media_io_kwargs: None, media_io_kwargs: None,
unsupported_fields: Default::default(), unsupported_fields: Default::default(),
...@@ -629,7 +917,7 @@ fn convert_user_blocks( ...@@ -629,7 +917,7 @@ fn convert_user_blocks(
for block in blocks { for block in blocks {
match block { match block {
AnthropicContentBlock::Text { text } => { AnthropicContentBlock::Text { text, .. } => {
text_parts.push(text.clone()); text_parts.push(text.clone());
} }
AnthropicContentBlock::ToolResult { AnthropicContentBlock::ToolResult {
...@@ -664,8 +952,11 @@ fn convert_user_blocks( ...@@ -664,8 +952,11 @@ fn convert_user_blocks(
} }
AnthropicContentBlock::ToolUse { .. } AnthropicContentBlock::ToolUse { .. }
| AnthropicContentBlock::Thinking { .. } | AnthropicContentBlock::Thinking { .. }
| AnthropicContentBlock::Unknown { .. } => { | AnthropicContentBlock::RedactedThinking { .. }
// tool_use/thinking/unknown in a user message: skip | AnthropicContentBlock::ServerToolUse { .. }
| AnthropicContentBlock::WebSearchToolResult { .. }
| AnthropicContentBlock::Other(_) => {
// tool_use/thinking/server-side blocks/unknown in a user message: skip
} }
} }
} }
...@@ -715,7 +1006,7 @@ fn convert_assistant_blocks( ...@@ -715,7 +1006,7 @@ fn convert_assistant_blocks(
for block in blocks { for block in blocks {
match block { match block {
AnthropicContentBlock::Text { text } => { AnthropicContentBlock::Text { text, .. } => {
text_content.push_str(text); text_content.push_str(text);
} }
AnthropicContentBlock::Thinking { thinking, .. } => { AnthropicContentBlock::Thinking { thinking, .. } => {
...@@ -724,8 +1015,21 @@ fn convert_assistant_blocks( ...@@ -724,8 +1015,21 @@ fn convert_assistant_blocks(
} }
pending_reasoning.push_str(thinking); pending_reasoning.push_str(thinking);
} }
AnthropicContentBlock::ToolUse { id, name, input } => { AnthropicContentBlock::RedactedThinking { .. } => {
// Redacted thinking is encrypted model reasoning. We can't read
// it but we preserve its position so it's not silently dropped.
// The actual encrypted data would need to be passed back to the
// model in multi-turn conversations for context continuity.
}
AnthropicContentBlock::ToolUse {
id, name, input, ..
}
| AnthropicContentBlock::ServerToolUse {
id, name, input, ..
} => {
// Snapshot the reasoning that preceded this tool call. // Snapshot the reasoning that preceded this tool call.
// Server-initiated tool use (e.g. web search) is treated the
// same as client tool use for conversion purposes.
segments.push(std::mem::take(&mut pending_reasoning)); segments.push(std::mem::take(&mut pending_reasoning));
tool_calls.push(ChatCompletionMessageToolCall { tool_calls.push(ChatCompletionMessageToolCall {
id: id.clone(), id: id.clone(),
...@@ -798,15 +1102,28 @@ fn convert_assistant_blocks( ...@@ -798,15 +1102,28 @@ fn convert_assistant_blocks(
fn convert_anthropic_tools(tools: &[AnthropicTool]) -> Vec<ChatCompletionTool> { fn convert_anthropic_tools(tools: &[AnthropicTool]) -> Vec<ChatCompletionTool> {
tools tools
.iter() .iter()
.map(|tool| ChatCompletionTool { .filter_map(|tool| {
// Server tools (web_search, bash, etc.) don't have input_schema
// and can't be meaningfully converted to OpenAI function tools.
// They are backend-specific and handled separately.
let schema = tool.input_schema.clone().or_else(|| {
tracing::debug!(
tool_name = %tool.name,
tool_type = ?tool.tool_type,
"Skipping server tool in OpenAI conversion (no input_schema)"
);
None
})?;
Some(ChatCompletionTool {
r#type: ChatCompletionToolType::Function, r#type: ChatCompletionToolType::Function,
function: FunctionObject { function: FunctionObject {
name: tool.name.clone(), name: tool.name.clone(),
description: tool.description.clone(), description: tool.description.clone(),
parameters: Some(tool.input_schema.clone()), parameters: Some(schema),
strict: None, strict: None,
}, },
}) })
})
.collect() .collect()
} }
...@@ -877,6 +1194,20 @@ pub fn chat_completion_to_anthropic_response( ...@@ -877,6 +1194,20 @@ pub fn chat_completion_to_anthropic_response(
} }
} }
// Extract reasoning content (from --dyn-reasoning-parser, e.g. qwen3).
// The backend strips <think>...</think> from the text and surfaces it
// as reasoning_content on the message. Map this to a Thinking block
// so clients see proper extended thinking in the Anthropic response.
if let Some(thinking) = choice.message.reasoning_content.filter(|t| !t.is_empty()) {
content.insert(
0,
AnthropicResponseContentBlock::Thinking {
thinking,
signature: String::new(),
},
);
}
// Extract text content // Extract text content
let text = match choice.message.content { let text = match choice.message.content {
Some(dynamo_async_openai::types::ChatCompletionMessageContent::Text(t)) => Some(t), Some(dynamo_async_openai::types::ChatCompletionMessageContent::Text(t)) => Some(t),
...@@ -889,8 +1220,11 @@ pub fn chat_completion_to_anthropic_response( ...@@ -889,8 +1220,11 @@ pub fn chat_completion_to_anthropic_response(
None => None, None => None,
}; };
if let Some(text) = text { if let Some(text) = text {
// Text goes first in the content array // Text goes after thinking block (if any)
content.insert(0, AnthropicResponseContentBlock::Text { text }); content.push(AnthropicResponseContentBlock::Text {
text,
citations: None,
});
} }
} }
...@@ -898,15 +1232,24 @@ pub fn chat_completion_to_anthropic_response( ...@@ -898,15 +1232,24 @@ pub fn chat_completion_to_anthropic_response(
if content.is_empty() { if content.is_empty() {
content.push(AnthropicResponseContentBlock::Text { content.push(AnthropicResponseContentBlock::Text {
text: String::new(), text: String::new(),
citations: None,
}); });
} }
// Map usage // Map usage
let usage = chat_resp let usage = chat_resp
.usage .usage
.map(|u| AnthropicUsage { .map(|u| {
let cache_read_input_tokens = u
.prompt_tokens_details
.and_then(|d| d.cached_tokens)
.filter(|&n| n > 0);
AnthropicUsage {
input_tokens: u.prompt_tokens, input_tokens: u.prompt_tokens,
output_tokens: u.completion_tokens, output_tokens: u.completion_tokens,
cache_creation_input_tokens: None, // Not available from OpenAI format
cache_read_input_tokens,
}
}) })
.unwrap_or_default(); .unwrap_or_default();
...@@ -936,7 +1279,7 @@ pub struct AnthropicCountTokensRequest { ...@@ -936,7 +1279,7 @@ pub struct AnthropicCountTokensRequest {
skip_serializing_if = "Option::is_none", skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_system_prompt" deserialize_with = "deserialize_system_prompt"
)] )]
pub system: Option<String>, pub system: Option<SystemContent>,
#[serde(default)] #[serde(default)]
pub tools: Option<Vec<AnthropicTool>>, pub tools: Option<Vec<AnthropicTool>>,
} }
...@@ -953,7 +1296,7 @@ impl AnthropicCountTokensRequest { ...@@ -953,7 +1296,7 @@ impl AnthropicCountTokensRequest {
let mut total_len: usize = 0; let mut total_len: usize = 0;
if let Some(system) = &self.system { if let Some(system) = &self.system {
total_len += system.len(); total_len += system.text.len();
} }
for msg in &self.messages { for msg in &self.messages {
...@@ -979,7 +1322,9 @@ impl AnthropicCountTokensRequest { ...@@ -979,7 +1322,9 @@ impl AnthropicCountTokensRequest {
if let Some(desc) = &tool.description { if let Some(desc) = &tool.description {
total_len += desc.len(); total_len += desc.len();
} }
total_len += tool.input_schema.to_string().len(); if let Some(schema) = &tool.input_schema {
total_len += schema.to_string().len();
}
} }
} }
...@@ -994,7 +1339,7 @@ impl AnthropicCountTokensRequest { ...@@ -994,7 +1339,7 @@ impl AnthropicCountTokensRequest {
fn estimate_block_len(block: &AnthropicContentBlock) -> usize { fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
match block { match block {
AnthropicContentBlock::Text { text } => text.len(), AnthropicContentBlock::Text { text, .. } => text.len(),
AnthropicContentBlock::ToolUse { name, input, .. } => name.len() + input.to_string().len(), AnthropicContentBlock::ToolUse { name, input, .. } => name.len() + input.to_string().len(),
AnthropicContentBlock::ToolResult { content, .. } => content AnthropicContentBlock::ToolResult { content, .. } => content
.as_ref() .as_ref()
...@@ -1010,8 +1355,13 @@ fn estimate_block_len(block: &AnthropicContentBlock) -> usize { ...@@ -1010,8 +1355,13 @@ fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
}) })
.unwrap_or(0), .unwrap_or(0),
AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(), AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),
AnthropicContentBlock::ServerToolUse { name, input, .. } => {
name.len() + input.to_string().len()
}
AnthropicContentBlock::WebSearchToolResult { content, .. } => content.to_string().len(),
AnthropicContentBlock::Image { .. } => 256, // rough estimate for image metadata AnthropicContentBlock::Image { .. } => 256, // rough estimate for image metadata
AnthropicContentBlock::Unknown { .. } => 0, AnthropicContentBlock::Other(v) => v.to_string().len(),
} }
} }
...@@ -1043,6 +1393,11 @@ mod tests { ...@@ -1043,6 +1393,11 @@ mod tests {
metadata: None, metadata: None,
tools: None, tools: None,
tool_choice: None, tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}; };
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap(); let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
...@@ -1073,7 +1428,10 @@ mod tests { ...@@ -1073,7 +1428,10 @@ mod tests {
content: "Hi".into(), content: "Hi".into(),
}, },
}], }],
system: Some("You are helpful.".into()), system: Some(SystemContent {
text: "You are helpful.".into(),
cache_control: None,
}),
temperature: None, temperature: None,
top_p: None, top_p: None,
top_k: None, top_k: None,
...@@ -1082,6 +1440,11 @@ mod tests { ...@@ -1082,6 +1440,11 @@ mod tests {
metadata: None, metadata: None,
tools: None, tools: None,
tool_choice: None, tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}; };
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap(); let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
...@@ -1115,6 +1478,7 @@ mod tests { ...@@ -1115,6 +1478,7 @@ mod tests {
id: "tool_123".into(), id: "tool_123".into(),
name: "get_weather".into(), name: "get_weather".into(),
input: serde_json::json!({"location": "SF"}), input: serde_json::json!({"location": "SF"}),
cache_control: None,
}], }],
}, },
}, },
...@@ -1125,6 +1489,7 @@ mod tests { ...@@ -1125,6 +1489,7 @@ mod tests {
tool_use_id: "tool_123".into(), tool_use_id: "tool_123".into(),
content: Some(ToolResultContent::Text("72F and sunny".into())), content: Some(ToolResultContent::Text("72F and sunny".into())),
is_error: None, is_error: None,
cache_control: None,
}], }],
}, },
}, },
...@@ -1138,6 +1503,11 @@ mod tests { ...@@ -1138,6 +1503,11 @@ mod tests {
metadata: None, metadata: None,
tools: None, tools: None,
tool_choice: None, tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}; };
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap(); let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
...@@ -1176,6 +1546,11 @@ mod tests { ...@@ -1176,6 +1546,11 @@ mod tests {
metadata: None, metadata: None,
tools: None, tools: None,
tool_choice: None, tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}; };
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap(); let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
...@@ -1202,16 +1577,24 @@ mod tests { ...@@ -1202,16 +1577,24 @@ mod tests {
metadata: None, metadata: None,
tools: Some(vec![AnthropicTool { tools: Some(vec![AnthropicTool {
name: "get_weather".into(), name: "get_weather".into(),
tool_type: None,
description: Some("Get weather info".into()), description: Some("Get weather info".into()),
input_schema: serde_json::json!({ input_schema: Some(serde_json::json!({
"type": "object", "type": "object",
"properties": {"location": {"type": "string"}}, "properties": {"location": {"type": "string"}},
"required": ["location"] "required": ["location"]
}), })),
cache_control: None,
}]), }]),
tool_choice: Some(AnthropicToolChoice::Simple(AnthropicToolChoiceSimple { tool_choice: Some(AnthropicToolChoice::Simple(AnthropicToolChoiceSimple {
choice_type: AnthropicToolChoiceMode::Auto, choice_type: AnthropicToolChoiceMode::Auto,
disable_parallel_tool_use: None,
})), })),
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}; };
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap(); let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
...@@ -1274,7 +1657,7 @@ mod tests { ...@@ -1274,7 +1657,7 @@ mod tests {
assert_eq!(response.usage.output_tokens, 5); assert_eq!(response.usage.output_tokens, 5);
assert_eq!(response.content.len(), 1); assert_eq!(response.content.len(), 1);
match &response.content[0] { match &response.content[0] {
AnthropicResponseContentBlock::Text { text } => { AnthropicResponseContentBlock::Text { text, .. } => {
assert_eq!(text, "Hello!"); assert_eq!(text, "Hello!");
} }
_ => panic!("expected text block"), _ => panic!("expected text block"),
...@@ -1335,6 +1718,7 @@ mod tests { ...@@ -1335,6 +1718,7 @@ mod tests {
AnthropicContentBlock::Thinking { AnthropicContentBlock::Thinking {
thinking, thinking,
signature, signature,
..
} => { } => {
assert_eq!(thinking, "Let me reason about this..."); assert_eq!(thinking, "Let me reason about this...");
assert_eq!(signature, "sig123"); assert_eq!(signature, "sig123");
...@@ -1358,9 +1742,12 @@ mod tests { ...@@ -1358,9 +1742,12 @@ mod tests {
AnthropicContentBlock::Thinking { AnthropicContentBlock::Thinking {
thinking: "I should think...".into(), thinking: "I should think...".into(),
signature: "sig".into(), signature: "sig".into(),
cache_control: None,
}, },
AnthropicContentBlock::Text { AnthropicContentBlock::Text {
text: "Answer".into(), text: "Answer".into(),
citations: None,
cache_control: None,
}, },
], ],
}, },
...@@ -1374,6 +1761,11 @@ mod tests { ...@@ -1374,6 +1761,11 @@ mod tests {
metadata: None, metadata: None,
tools: None, tools: None,
tool_choice: None, tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}; };
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap(); let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
...@@ -1395,7 +1787,7 @@ mod tests { ...@@ -1395,7 +1787,7 @@ mod tests {
} }
#[test] #[test]
fn test_unknown_block_type_does_not_fail() { fn test_known_and_unknown_block_types() {
let json = r#"{ let json = r#"{
"model": "test", "model": "test",
"max_tokens": 100, "max_tokens": 100,
...@@ -1405,6 +1797,8 @@ mod tests { ...@@ -1405,6 +1797,8 @@ mod tests {
{"type": "text", "text": "hello"}, {"type": "text", "text": "hello"},
{"type": "server_tool_use", "id": "stu_1", "name": "web_search", "input": {}}, {"type": "server_tool_use", "id": "stu_1", "name": "web_search", "input": {}},
{"type": "redacted_thinking", "data": "encrypted"}, {"type": "redacted_thinking", "data": "encrypted"},
{"type": "web_search_tool_result", "tool_use_id": "stu_1", "content": [{"type": "web_search_result", "url": "https://example.com"}]},
{"type": "future_block_type", "some_field": 42},
{"type": "text", "text": "world"} {"type": "text", "text": "world"}
] ]
}] }]
...@@ -1412,22 +1806,32 @@ mod tests { ...@@ -1412,22 +1806,32 @@ mod tests {
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap(); let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
match &req.messages[0].content { match &req.messages[0].content {
AnthropicMessageContent::Blocks { content } => { AnthropicMessageContent::Blocks { content } => {
assert_eq!(content.len(), 4); assert_eq!(content.len(), 6);
assert!(matches!(&content[0], AnthropicContentBlock::Text { .. })); assert!(matches!(&content[0], AnthropicContentBlock::Text { .. }));
assert!(matches!( assert!(matches!(
&content[1], &content[1],
AnthropicContentBlock::Unknown { block_type } if block_type == "server_tool_use" AnthropicContentBlock::ServerToolUse { name, .. } if name == "web_search"
)); ));
assert!(matches!( assert!(matches!(
&content[2], &content[2],
AnthropicContentBlock::Unknown { block_type } if block_type == "redacted_thinking" AnthropicContentBlock::RedactedThinking { data } if data == "encrypted"
));
assert!(matches!(
&content[3],
AnthropicContentBlock::WebSearchToolResult { tool_use_id, .. } if tool_use_id == "stu_1"
));
// Truly unknown types still fall through to Other with full JSON preserved
assert!(matches!(
&content[4],
AnthropicContentBlock::Other(v) if v.get("type").and_then(|t| t.as_str()) == Some("future_block_type")
)); ));
assert!(matches!(&content[3], AnthropicContentBlock::Text { .. })); assert!(matches!(&content[5], AnthropicContentBlock::Text { .. }));
} }
_ => panic!("expected blocks content"), _ => panic!("expected blocks content"),
} }
// Conversion should succeed, skipping unknown blocks // Conversion should succeed — server_tool_use becomes a tool call,
// redacted_thinking and web_search_tool_result are preserved gracefully
let chat_req: NvCreateChatCompletionRequest = AnthropicCreateMessageRequest { let chat_req: NvCreateChatCompletionRequest = AnthropicCreateMessageRequest {
model: "test".into(), model: "test".into(),
max_tokens: 100, max_tokens: 100,
...@@ -1441,10 +1845,25 @@ mod tests { ...@@ -1441,10 +1845,25 @@ mod tests {
metadata: None, metadata: None,
tools: None, tools: None,
tool_choice: None, tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
} }
.try_into() .try_into()
.unwrap(); .unwrap();
// server_tool_use becomes a tool call on the assistant message
assert_eq!(chat_req.inner.messages.len(), 1); assert_eq!(chat_req.inner.messages.len(), 1);
match &chat_req.inner.messages[0] {
ChatCompletionRequestMessage::Assistant(a) => {
assert!(a.tool_calls.is_some());
let tc = a.tool_calls.as_ref().unwrap();
assert_eq!(tc.len(), 1);
assert_eq!(tc[0].function.name, "web_search");
}
other => panic!("expected assistant, got {other:?}"),
}
} }
#[test] #[test]
...@@ -1510,7 +1929,10 @@ mod tests { ...@@ -1510,7 +1929,10 @@ mod tests {
content: "Hello, world! This is a test message.".into(), content: "Hello, world! This is a test message.".into(),
}, },
}], }],
system: Some("You are helpful.".into()), system: Some(SystemContent {
text: "You are helpful.".into(),
cache_control: None,
}),
tools: None, tools: None,
}; };
...@@ -1539,6 +1961,11 @@ mod tests { ...@@ -1539,6 +1961,11 @@ mod tests {
metadata: None, metadata: None,
tools: None, tools: None,
tool_choice: None, tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}; };
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap(); let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
match chat_req.inner.messages.into_iter().next().unwrap() { match chat_req.inner.messages.into_iter().next().unwrap() {
...@@ -1552,6 +1979,7 @@ mod tests { ...@@ -1552,6 +1979,7 @@ mod tests {
id: id.into(), id: id.into(),
name: "fn".into(), name: "fn".into(),
input: serde_json::json!({}), input: serde_json::json!({}),
cache_control: None,
} }
} }
...@@ -1559,6 +1987,7 @@ mod tests { ...@@ -1559,6 +1987,7 @@ mod tests {
AnthropicContentBlock::Thinking { AnthropicContentBlock::Thinking {
thinking: text.into(), thinking: text.into(),
signature: "sig".into(), signature: "sig".into(),
cache_control: None,
} }
} }
...@@ -1686,6 +2115,8 @@ mod tests { ...@@ -1686,6 +2115,8 @@ mod tests {
thinking("A"), thinking("A"),
AnthropicContentBlock::Text { AnthropicContentBlock::Text {
text: "answer".into(), text: "answer".into(),
citations: None,
cache_control: None,
}, },
]); ]);
...@@ -1776,4 +2207,387 @@ mod tests { ...@@ -1776,4 +2207,387 @@ mod tests {
assert_eq!(tools[0].id, "t1"); assert_eq!(tools[0].id, "t1");
assert_eq!(tools[1].id, "t2"); assert_eq!(tools[1].id, "t2");
} }
#[test]
fn test_cache_control_passthrough() {
    // A top-level ephemeral `cache_control` without a TTL must be forwarded
    // into `nvext.cache_control`, where it reports a 300 second TTL.
    use crate::protocols::openai::nvext::{CacheControl, CacheControlType};

    let top_level_cc = CacheControl {
        control_type: CacheControlType::Ephemeral,
        ttl: None,
    };
    let user_turn = AnthropicMessage {
        role: AnthropicRole::User,
        content: AnthropicMessageContent::Text {
            content: "Hello".into(),
        },
    };
    let anthropic_req = AnthropicCreateMessageRequest {
        model: "test-model".into(),
        max_tokens: 100,
        messages: vec![user_turn],
        system: None,
        temperature: None,
        top_p: None,
        top_k: None,
        stop_sequences: None,
        stream: false,
        metadata: None,
        tools: None,
        tool_choice: None,
        cache_control: Some(top_level_cc),
        thinking: None,
        service_tier: None,
        container: None,
        output_config: None,
    };

    let converted: NvCreateChatCompletionRequest = anthropic_req.try_into().unwrap();
    let forwarded = converted
        .nvext
        .expect("nvext should be set")
        .cache_control
        .expect("nvext.cache_control should be set");

    assert_eq!(forwarded.control_type, CacheControlType::Ephemeral);
    assert_eq!(forwarded.ttl_seconds(), 300);
}
#[test]
fn test_cache_control_1h_ttl_passthrough() {
    // A top-level `cache_control` with `"ttl": "1h"` must survive JSON
    // deserialization and conversion, reporting 3600 seconds afterwards.
    use crate::protocols::openai::nvext::CacheControlType;

    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"cache_control": {"type": "ephemeral", "ttl": "1h"}
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();
    assert!(parsed.cache_control.is_some());

    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();
    let forwarded = converted
        .nvext
        .expect("nvext should be set")
        .cache_control
        .expect("nvext.cache_control should be set");

    assert_eq!(forwarded.control_type, CacheControlType::Ephemeral);
    assert_eq!(forwarded.ttl_seconds(), 3600);
}
#[test]
fn test_no_cache_control_passthrough() {
    // Without any cache_control on the request, conversion must leave
    // `nvext` unset entirely.
    let user_turn = AnthropicMessage {
        role: AnthropicRole::User,
        content: AnthropicMessageContent::Text {
            content: "Hello".into(),
        },
    };
    let anthropic_req = AnthropicCreateMessageRequest {
        model: "test-model".into(),
        max_tokens: 100,
        messages: vec![user_turn],
        system: None,
        temperature: None,
        top_p: None,
        top_k: None,
        stop_sequences: None,
        stream: false,
        metadata: None,
        tools: None,
        tool_choice: None,
        cache_control: None,
        thinking: None,
        service_tier: None,
        container: None,
        output_config: None,
    };

    let converted: NvCreateChatCompletionRequest = anthropic_req.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_per_block_cache_control_deserialization() {
    // `cache_control` on an individual text block is deserialized per block:
    // present on the block that carries it, absent on the one that does not.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "user",
"content": [
{"type": "text", "text": "Hello", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "World"}
]
}]
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let blocks = match &parsed.messages[0].content {
        AnthropicMessageContent::Blocks { content } => content,
        _ => panic!("expected blocks"),
    };

    // First block: annotated with cache_control.
    match &blocks[0] {
        AnthropicContentBlock::Text { cache_control, .. } => {
            assert!(cache_control.is_some());
        }
        other => panic!("expected Text, got {other:?}"),
    }

    // Second block: no cache_control in the JSON.
    match &blocks[1] {
        AnthropicContentBlock::Text { cache_control, .. } => {
            assert!(cache_control.is_none());
        }
        other => panic!("expected Text, got {other:?}"),
    }
}
#[test]
fn test_per_block_cache_control_last_wins() {
    // Two blocks carry cache_control (default TTL, then "1h"); the converted
    // request must report the TTL of the later block.
    use crate::protocols::openai::nvext::CacheControlType;

    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "system context", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "recent context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
]
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();
    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();

    let effective = converted
        .nvext
        .expect("nvext should be set")
        .cache_control
        .expect("cache_control should be set");

    assert_eq!(effective.control_type, CacheControlType::Ephemeral);
    // The last block's 1h TTL is the one that wins.
    assert_eq!(effective.ttl_seconds(), 3600);
}
#[test]
fn test_top_level_cache_control_overrides_per_block() {
    // When both a per-block cache_control (1h) and a top-level cache_control
    // (no TTL) are present, the top-level value is the one forwarded.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
],
"cache_control": {"type": "ephemeral"}
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();
    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();

    let effective = converted
        .nvext
        .expect("nvext should be set")
        .cache_control
        .expect("cache_control should be set");

    // Top-level has no TTL, so it reports 300s rather than the block's 3600s.
    assert_eq!(effective.ttl_seconds(), 300);
}
#[test]
fn test_system_block_array_with_cache_control() {
    // A `system` prompt given as an array of text blocks is joined with a
    // newline, and cache_control found on a block is carried through to
    // `nvext.cache_control` after conversion.
    use crate::protocols::openai::nvext::CacheControlType;

    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"system": [
{"type": "text", "text": "You are a helpful assistant.", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "Be concise."}
]
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let merged_system = parsed.system.as_ref().unwrap();
    assert_eq!(
        merged_system.text,
        "You are a helpful assistant.\nBe concise."
    );
    // Only the first block carries cache_control here; the merged system
    // content must retain it.
    assert!(merged_system.cache_control.is_some());

    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();
    let effective = converted
        .nvext
        .expect("nvext should be set from system cache_control")
        .cache_control
        .expect("cache_control should be set");
    assert_eq!(effective.control_type, CacheControlType::Ephemeral);
}
#[test]
fn test_system_string_no_cache_control() {
    // A plain-string `system` prompt deserializes to its text verbatim and
    // carries no cache_control.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"system": "You are helpful."
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let system_prompt = parsed.system.as_ref().unwrap();
    assert_eq!(system_prompt.text, "You are helpful.");
    assert!(system_prompt.cache_control.is_none());
}
#[test]
fn test_text_block_with_citations() {
    // A text block's `citations` array is kept on deserialization and its
    // entries remain inspectable as JSON.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "assistant",
"content": [
{
"type": "text",
"text": "According to the document...",
"citations": [
{"type": "char_location", "cited_text": "relevant text", "document_index": 0, "start_char_index": 0, "end_char_index": 13}
]
}
]
}]
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let blocks = match &parsed.messages[0].content {
        AnthropicMessageContent::Blocks { content } => content,
        _ => panic!("expected blocks"),
    };
    let citations = match &blocks[0] {
        AnthropicContentBlock::Text { citations, .. } => citations,
        other => panic!("expected Text, got {other:?}"),
    };

    assert!(citations.is_some());
    let entries = citations.as_ref().unwrap();
    assert_eq!(entries.len(), 1);
    assert_eq!(entries[0]["type"], "char_location");
}
#[test]
fn test_redacted_thinking_block() {
    // `redacted_thinking` deserializes into its own variant, keeping the
    // opaque `data` payload and its position between neighbouring blocks.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "assistant",
"content": [
{"type": "thinking", "thinking": "visible reasoning", "signature": "sig1"},
{"type": "redacted_thinking", "data": "base64-encrypted-data"},
{"type": "text", "text": "Final answer"}
]
}]
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let blocks = match &parsed.messages[0].content {
        AnthropicMessageContent::Blocks { content } => content,
        _ => panic!("expected blocks"),
    };
    assert_eq!(blocks.len(), 3);

    assert!(matches!(
        &blocks[0],
        AnthropicContentBlock::Thinking { .. }
    ));
    match &blocks[1] {
        AnthropicContentBlock::RedactedThinking { data } => {
            assert_eq!(data, "base64-encrypted-data");
        }
        other => panic!("expected RedactedThinking, got {other:?}"),
    }
    assert!(matches!(&blocks[2], AnthropicContentBlock::Text { .. }));
}
#[test]
fn test_server_tool_use_and_web_search_result() {
    // `server_tool_use` and `web_search_tool_result` deserialize into their
    // dedicated variants, and the server tool use is converted into a tool
    // call on the resulting assistant message.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "assistant",
"content": [
{"type": "server_tool_use", "id": "stu_1", "name": "web_search", "input": {"query": "rust programming"}},
{"type": "web_search_tool_result", "tool_use_id": "stu_1", "content": [{"type": "web_search_result", "url": "https://www.rust-lang.org", "title": "Rust"}]},
{"type": "text", "text": "Based on my search..."}
]
}]
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    // Deserialization checks.
    let blocks = match &parsed.messages[0].content {
        AnthropicMessageContent::Blocks { content } => content,
        _ => panic!("expected blocks"),
    };
    assert_eq!(blocks.len(), 3);

    match &blocks[0] {
        AnthropicContentBlock::ServerToolUse { id, name, input } => {
            assert_eq!(id, "stu_1");
            assert_eq!(name, "web_search");
            assert_eq!(input["query"], "rust programming");
        }
        other => panic!("expected ServerToolUse, got {other:?}"),
    }
    match &blocks[1] {
        AnthropicContentBlock::WebSearchToolResult {
            tool_use_id,
            content,
        } => {
            assert_eq!(tool_use_id, "stu_1");
            assert!(content.is_array());
        }
        other => panic!("expected WebSearchToolResult, got {other:?}"),
    }

    // Conversion check: the server tool use shows up as exactly one tool call.
    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();
    let assistant = match &converted.inner.messages[0] {
        ChatCompletionRequestMessage::Assistant(assistant) => assistant,
        other => panic!("expected assistant, got {other:?}"),
    };
    let calls = assistant
        .tool_calls
        .as_ref()
        .expect("should have tool calls");
    assert_eq!(calls.len(), 1);
    assert_eq!(calls[0].id, "stu_1");
    assert_eq!(calls[0].function.name, "web_search");
}
#[test]
fn test_thinking_config_deserialization() {
    // An enabled `thinking` config deserializes with its type string and
    // token budget intact.
    let payload = r#"{
"model": "test",
"max_tokens": 16000,
"messages": [{"role": "user", "content": "Solve this step by step"}],
"thinking": {"type": "enabled", "budget_tokens": 10000}
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let config = parsed.thinking.as_ref().expect("thinking should be set");
    assert_eq!(config.thinking_type, "enabled");
    assert_eq!(config.budget_tokens, Some(10000));
}
#[test]
fn test_thinking_config_disabled() {
    // A disabled `thinking` config deserializes with no token budget.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"thinking": {"type": "disabled"}
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let config = parsed.thinking.as_ref().expect("thinking should be set");
    assert_eq!(config.thinking_type, "disabled");
    assert!(config.budget_tokens.is_none());
}
#[test]
fn test_disable_parallel_tool_use() {
    // `disable_parallel_tool_use` on a simple (`auto`) tool choice is
    // preserved by deserialization.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"tools": [{"name": "get_weather", "description": "Get weather", "input_schema": {"type": "object"}}],
"tool_choice": {"type": "auto", "disable_parallel_tool_use": true}
}"#;
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    let simple = match &parsed.tool_choice {
        Some(AnthropicToolChoice::Simple(simple)) => simple,
        other => panic!("expected Simple tool choice, got {other:?}"),
    };
    assert_eq!(simple.choice_type, AnthropicToolChoiceMode::Auto);
    assert_eq!(simple.disable_parallel_tool_use, Some(true));
}
} }
...@@ -92,6 +92,10 @@ impl NvExtProvider for NvCreateChatCompletionRequest { ...@@ -92,6 +92,10 @@ impl NvExtProvider for NvCreateChatCompletionRequest {
fn raw_prompt(&self) -> Option<String> { fn raw_prompt(&self) -> Option<String> {
None None
} }
/// Returns the cache-control setting stored in this request's `nvext`
/// extension, or `None` when the request has no `nvext` or the extension
/// leaves `cache_control` unset.
fn effective_cache_control(&self) -> Option<&crate::protocols::openai::nvext::CacheControl> {
    let ext = NvExtProvider::nvext(self)?;
    ext.cache_control.as_ref()
}
} }
/// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`, /// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`,
......
...@@ -49,6 +49,13 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap) ...@@ -49,6 +49,13 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap)
pub trait NvExtProvider { pub trait NvExtProvider {
fn nvext(&self) -> Option<&NvExt>; fn nvext(&self) -> Option<&NvExt>;
fn raw_prompt(&self) -> Option<String>; fn raw_prompt(&self) -> Option<String>;
/// Return the effective cache control for this request.
/// Default: delegates to `nvext.cache_control`. Implementations may override
/// to also check a top-level `cache_control` field (see `NvCreateChatCompletionRequest`).
fn effective_cache_control(&self) -> Option<&CacheControl> {
self.nvext().and_then(|ext| ext.cache_control.as_ref())
}
} }
/// Worker ID information for disaggregated serving /// Worker ID information for disaggregated serving
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment