Unverified Commit 4d3e1ae3 authored by MatejKosec's avatar MatejKosec Committed by GitHub
Browse files

feat: Full Anthropic Messages API cache_control support (top-level, per-block,...


feat: Full Anthropic Messages API cache_control support (top-level, per-block, system block arrays) (#6629)
Signed-off-by: default avatarMatej Kosec <mkosec@nvidia.com>
parent a3cf35c3
......@@ -272,6 +272,9 @@ impl OpenAIPreprocessor {
builder.mdc_sum(Some(self.mdcsum.clone()));
let lora_name = self.lora_name.clone();
// Extract cache_control TTL from either nvext or top-level field
let cache_control_ttl = request.effective_cache_control().map(|cc| cc.ttl_seconds());
// Extract routing hints from nvext if present
if let Some(nvext) = request.nvext() {
// Build routing hints from nvext fields
......@@ -289,10 +292,12 @@ impl OpenAIPreprocessor {
allowed_worker_ids: None,
};
builder.routing(Some(routing));
} else if lora_name.is_some() {
// Ensure LoRA-aware routing still gets hints even when nvext is absent.
} else if lora_name.is_some() || cache_control_ttl.is_some() {
// Ensure routing hints exist when we have LoRA or cache_control,
// even when nvext is absent (e.g. Anthropic endpoint requests).
builder.routing(Some(RoutingHints {
lora_name,
cache_control_ttl,
..Default::default()
}));
}
......
......@@ -30,6 +30,7 @@ pub struct AnthropicStreamConverter {
// Token usage (from engine)
input_token_count: u32,
output_token_count: u32,
cached_token_count: Option<u32>,
// Tool call tracking
tool_call_states: Vec<ToolCallState>,
tool_calls_sent: HashSet<String>,
......@@ -57,6 +58,7 @@ impl AnthropicStreamConverter {
text_block_index: 0,
input_token_count: 0,
output_token_count: 0,
cached_token_count: None,
tool_call_states: Vec::new(),
tool_calls_sent: HashSet::new(),
next_block_index: 0,
......@@ -77,6 +79,8 @@ impl AnthropicStreamConverter {
usage: AnthropicUsage {
input_tokens: 0,
output_tokens: 0,
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
};
......@@ -95,6 +99,10 @@ impl AnthropicStreamConverter {
if let Some(usage) = &chunk.usage {
self.input_token_count = usage.prompt_tokens;
self.output_token_count = usage.completion_tokens;
self.cached_token_count = usage
.prompt_tokens_details
.as_ref()
.and_then(|d| d.cached_tokens);
}
for choice in &chunk.choices {
......@@ -138,6 +146,7 @@ impl AnthropicStreamConverter {
index: self.text_block_index,
content_block: AnthropicResponseContentBlock::Text {
text: String::new(),
citations: None,
},
};
events.push(make_sse_event("content_block_start", &block_start));
......@@ -271,6 +280,8 @@ impl AnthropicStreamConverter {
usage: AnthropicUsage {
input_tokens: self.input_token_count,
output_tokens: self.output_token_count,
cache_creation_input_tokens: None,
cache_read_input_tokens: self.cached_token_count,
},
};
events.push(make_sse_event("message_delta", &message_delta));
......@@ -329,6 +340,10 @@ impl AnthropicStreamConverter {
if let Some(usage) = &chunk.usage {
self.input_token_count = usage.prompt_tokens;
self.output_token_count = usage.completion_tokens;
self.cached_token_count = usage
.prompt_tokens_details
.as_ref()
.and_then(|d| d.cached_tokens);
}
for choice in &chunk.choices {
......@@ -369,6 +384,7 @@ impl AnthropicStreamConverter {
index: self.text_block_index,
content_block: AnthropicResponseContentBlock::Text {
text: String::new(),
citations: None,
},
};
events.push(make_tagged_event("content_block_start", &ev));
......@@ -483,6 +499,8 @@ impl AnthropicStreamConverter {
usage: AnthropicUsage {
input_tokens: self.input_token_count,
output_tokens: self.output_token_count,
cache_creation_input_tokens: None,
cache_read_input_tokens: self.cached_token_count,
},
};
events.push(make_tagged_event("message_delta", &ev));
......
......@@ -22,15 +22,26 @@ use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
};
use crate::protocols::openai::common_ext::CommonExt;
use crate::protocols::openai::nvext::{CacheControl, NvExt};
// ---------------------------------------------------------------------------
// Custom deserializers
// ---------------------------------------------------------------------------
/// Parsed system prompt content, preserving cache_control from block arrays.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemContent {
/// The concatenated text from all system blocks (or the plain string).
pub text: String,
/// Cache control from the last system block that had one.
#[serde(skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
}
/// Deserialize `system` from either a plain string or an array of text blocks.
/// The Anthropic API accepts both `"system": "text"` and
/// `"system": [{"type": "text", "text": "..."}]`.
fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
/// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<SystemContent>, D::Error>
where
D: serde::Deserializer<'de>,
{
......@@ -44,16 +55,28 @@ where
#[derive(Deserialize)]
struct SystemBlock {
text: String,
#[serde(default)]
cache_control: Option<CacheControl>,
}
let maybe: Option<SystemPrompt> = Option::deserialize(deserializer)?;
Ok(maybe.map(|sp| match sp {
SystemPrompt::Text(s) => s,
SystemPrompt::Blocks(blocks) => blocks
.into_iter()
.map(|b| b.text)
.collect::<Vec<_>>()
.join("\n"),
SystemPrompt::Text(s) => SystemContent {
text: s,
cache_control: None,
},
SystemPrompt::Blocks(blocks) => {
let cache_control = blocks.iter().rev().find_map(|b| b.cache_control.clone());
let text = blocks
.into_iter()
.map(|b| b.text)
.collect::<Vec<_>>()
.join("\n");
SystemContent {
text,
cache_control,
}
}
}))
}
......@@ -79,7 +102,7 @@ pub struct AnthropicCreateMessageRequest {
skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_system_prompt"
)]
pub system: Option<String>,
pub system: Option<SystemContent>,
/// Sampling temperature (0.0 - 1.0).
#[serde(skip_serializing_if = "Option::is_none")]
......@@ -112,6 +135,50 @@ pub struct AnthropicCreateMessageRequest {
/// How the model should choose which tool to call.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_choice: Option<AnthropicToolChoice>,
/// Top-level cache control for automatic prompt prefix caching.
/// When present, the system caches all content up to the last cacheable block.
/// Matches the Anthropic Messages API automatic caching mode.
/// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
/// Extended thinking configuration. When enabled, the model produces
/// `thinking` content blocks containing its internal reasoning before
/// the final response. The `budget_tokens` field controls how many tokens
/// the model may use for thinking (must be ≥ 1024 and < max_tokens).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub thinking: Option<ThinkingConfig>,
/// Service tier selection: `"auto"` or `"standard_only"`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_tier: Option<String>,
/// Container identifier for stateful sandbox sessions.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub container: Option<String>,
/// Output configuration: effort level and optional JSON schema format.
/// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
/// `format` specifies structured JSON output constraints.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub output_config: Option<serde_json::Value>,
}
/// Extended thinking configuration for the request.
///
/// When `type` is `"enabled"`, the model will produce `thinking` content blocks
/// with its internal reasoning. `budget_tokens` controls the maximum tokens
/// available for thinking (minimum 1024, must be less than `max_tokens`).
/// When `type` is `"disabled"`, no thinking blocks are produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingConfig {
/// Either `"enabled"` or `"disabled"`.
#[serde(rename = "type")]
pub thinking_type: String,
/// Maximum tokens for internal reasoning. Only relevant when type is "enabled".
#[serde(skip_serializing_if = "Option::is_none")]
pub budget_tokens: Option<u32>,
}
/// A single message in the conversation.
......@@ -143,15 +210,23 @@ pub enum AnthropicMessageContent {
/// A single content block within a message.
///
/// Uses a custom deserializer so that unknown block types (e.g. `citations`,
/// `server_tool_use`, `redacted_thinking`) are captured as `Unknown` instead
/// `server_tool_use`, `redacted_thinking`) are captured as `Other(Value)` instead
/// of causing a hard deserialization failure. This is important because Claude
/// Code may send block types that we don't yet handle.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type")]
pub enum AnthropicContentBlock {
/// Text content block.
/// Text content block. May optionally include `citations` — references to
/// source documents that support the text content. Citations are generated
/// by the model when document/PDF content is provided and citation mode is enabled.
#[serde(rename = "text")]
Text { text: String },
Text {
text: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
citations: Option<Vec<serde_json::Value>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Image content block.
#[serde(rename = "image")]
Image { source: AnthropicImageSource },
......@@ -161,6 +236,8 @@ pub enum AnthropicContentBlock {
id: String,
name: String,
input: serde_json::Value,
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Tool result from user.
#[serde(rename = "tool_result")]
......@@ -170,14 +247,45 @@ pub enum AnthropicContentBlock {
content: Option<ToolResultContent>,
#[serde(skip_serializing_if = "Option::is_none")]
is_error: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Thinking content block from assistant (extended thinking / reasoning).
#[serde(rename = "thinking")]
Thinking { thinking: String, signature: String },
/// Catch-all for unrecognized block types. Silently accepted and skipped
/// during conversion so that new Anthropic features don't break the endpoint.
#[serde(skip)]
Unknown { block_type: String },
Thinking {
thinking: String,
signature: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Redacted thinking block from assistant. Contains encrypted reasoning data
/// that is opaque to the client but must be passed back verbatim in multi-turn
/// conversations so the model can maintain its chain of thought.
#[serde(rename = "redacted_thinking")]
RedactedThinking { data: String },
/// Server-initiated tool use block. Represents a tool call that the API
/// executes server-side (e.g., web search). The client receives the result
/// via a corresponding `web_search_tool_result` or similar block.
#[serde(rename = "server_tool_use")]
ServerToolUse {
id: String,
name: String,
#[serde(default)]
input: serde_json::Value,
},
/// Result from a server-initiated tool (e.g., web search results).
/// Contains structured content returned by the server-side tool execution.
#[serde(rename = "web_search_tool_result")]
WebSearchToolResult {
tool_use_id: String,
#[serde(default)]
content: serde_json::Value,
},
/// Catch-all for unrecognized block types. Preserves the full JSON value
/// so that new Anthropic features don't break the endpoint and can be
/// round-tripped or inspected.
#[serde(untagged)]
Other(serde_json::Value),
}
/// Content of a `tool_result` block — either a plain string or an array of
......@@ -237,9 +345,21 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock {
let text = value
.get("text")
.and_then(|t| t.as_str())
.unwrap_or("")
.ok_or_else(|| serde::de::Error::missing_field("text"))?
.to_string();
Ok(AnthropicContentBlock::Text { text })
let citations: Option<Vec<serde_json::Value>> = value
.get("citations")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Text {
text,
citations,
cache_control,
})
}
"image" => {
let source: AnthropicImageSource =
......@@ -251,55 +371,112 @@ impl<'de> Deserialize<'de> for AnthropicContentBlock {
let id = value
.get("id")
.and_then(|v| v.as_str())
.unwrap_or("")
.ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string();
let name = value
.get("name")
.and_then(|v| v.as_str())
.unwrap_or("")
.ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
Ok(AnthropicContentBlock::ToolUse { id, name, input })
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolUse {
id,
name,
input,
cache_control,
})
}
"tool_result" => {
let tool_use_id = value
.get("tool_use_id")
.and_then(|v| v.as_str())
.unwrap_or("")
.ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string();
let content: Option<ToolResultContent> = value
.get("content")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
let is_error = value.get("is_error").and_then(|v| v.as_bool());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolResult {
tool_use_id,
content,
is_error,
cache_control,
})
}
"thinking" => {
let thinking = value
.get("thinking")
.and_then(|v| v.as_str())
.unwrap_or("")
.ok_or_else(|| serde::de::Error::missing_field("thinking"))?
.to_string();
let signature = value
.get("signature")
.and_then(|v| v.as_str())
.unwrap_or("")
.ok_or_else(|| serde::de::Error::missing_field("signature"))?
.to_string();
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Thinking {
thinking,
signature,
cache_control,
})
}
other => {
tracing::debug!("Unknown Anthropic content block type '{}', skipping", other);
Ok(AnthropicContentBlock::Unknown {
block_type: other.to_string(),
"redacted_thinking" => {
let data = value
.get("data")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("data"))?
.to_string();
Ok(AnthropicContentBlock::RedactedThinking { data })
}
"server_tool_use" => {
let id = value
.get("id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string();
let name = value
.get("name")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
Ok(AnthropicContentBlock::ServerToolUse { id, name, input })
}
"web_search_tool_result" => {
let tool_use_id = value
.get("tool_use_id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string();
let content = value
.get("content")
.cloned()
.unwrap_or(serde_json::json!([]));
Ok(AnthropicContentBlock::WebSearchToolResult {
tool_use_id,
content,
})
}
other => {
tracing::debug!(
"Unrecognized Anthropic content block type '{}', preserving as Other",
other
);
Ok(AnthropicContentBlock::Other(value))
}
}
}
}
......@@ -314,12 +491,29 @@ pub struct AnthropicImageSource {
}
/// A tool definition.
///
/// Client tools (custom) require `name` + `input_schema`. Server tools
/// (web_search, bash, text_editor, code_execution, etc.) are discriminated
/// by their `type` field (e.g. `"web_search_20260209"`) and may not have
/// `input_schema`. We keep all fields optional beyond `name` so both
/// kinds deserialize successfully and pass through to the backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicTool {
/// Tool name (required for client tools, present on server tools too).
pub name: String,
/// Tool type discriminator. Client tools use `"custom"` (or omit).
/// Server tools use versioned types like `"web_search_20260209"`.
#[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
pub tool_type: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
pub input_schema: serde_json::Value,
/// JSON Schema for the tool input. Required for client tools, absent on
/// server tools (which define their own input shape server-side).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub input_schema: Option<serde_json::Value>,
/// Cache control breakpoint on this tool definition.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
}
/// Tool choice specification.
......@@ -338,6 +532,10 @@ pub enum AnthropicToolChoice {
pub struct AnthropicToolChoiceSimple {
#[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
......@@ -355,6 +553,10 @@ pub struct AnthropicToolChoiceNamed {
#[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode,
pub name: String,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
}
// ---------------------------------------------------------------------------
......@@ -376,17 +578,47 @@ pub struct AnthropicMessageResponse {
}
/// A content block in the response.
///
/// The Anthropic API returns up to 12 different block types. We model the
/// common ones explicitly and catch the rest as `Other` so the proxy can
/// forward them without losing data.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicResponseContentBlock {
#[serde(rename = "text")]
Text { text: String },
Text {
text: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
citations: Option<Vec<serde_json::Value>>,
},
#[serde(rename = "tool_use")]
ToolUse {
id: String,
name: String,
input: serde_json::Value,
},
#[serde(rename = "thinking")]
Thinking { thinking: String, signature: String },
#[serde(rename = "redacted_thinking")]
RedactedThinking { data: String },
#[serde(rename = "server_tool_use")]
ServerToolUse {
id: String,
name: String,
#[serde(default)]
input: serde_json::Value,
},
#[serde(rename = "web_search_tool_result")]
WebSearchToolResult {
tool_use_id: String,
#[serde(default)]
content: serde_json::Value,
},
/// Catch-all for new/uncommon block types (web_fetch_tool_result,
/// code_execution_tool_result, container_upload, etc.) so the proxy
/// can serialize them back without data loss.
#[serde(untagged)]
Other(serde_json::Value),
}
/// Token usage information.
......@@ -394,6 +626,12 @@ pub enum AnthropicResponseContentBlock {
pub struct AnthropicUsage {
pub input_tokens: u32,
pub output_tokens: u32,
/// Number of input tokens used to create a new cache entry.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_creation_input_tokens: Option<u32>,
/// Number of input tokens read from the prompt cache (prefix cache hits).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_read_input_tokens: Option<u32>,
}
/// Reason the model stopped generating.
......@@ -404,6 +642,11 @@ pub enum AnthropicStopReason {
MaxTokens,
StopSequence,
ToolUse,
/// The model paused to yield control in an agentic loop, intending to
/// continue in a subsequent turn. Used with extended thinking / tool use.
PauseTurn,
/// The model refused to generate content (safety refusal).
Refusal,
}
// ---------------------------------------------------------------------------
......@@ -453,6 +696,15 @@ pub enum AnthropicDelta {
TextDelta { text: String },
#[serde(rename = "input_json_delta")]
InputJsonDelta { partial_json: String },
/// Incremental thinking content during extended thinking streaming.
#[serde(rename = "thinking_delta")]
ThinkingDelta { thinking: String },
/// Incremental signature for a thinking block (sent at the end).
#[serde(rename = "signature_delta")]
SignatureDelta { signature: String },
/// Incremental citation attached to a text block.
#[serde(rename = "citations_delta")]
CitationsDelta { citation: serde_json::Value },
}
/// The delta body in a message_delta event.
......@@ -529,10 +781,12 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
let mut messages = Vec::new();
// Prepend system message if present
if let Some(system_text) = &req.system {
if let Some(system_content) = &req.system {
messages.push(ChatCompletionRequestMessage::System(
ChatCompletionRequestSystemMessage {
content: ChatCompletionRequestSystemMessageContent::Text(system_text.clone()),
content: ChatCompletionRequestSystemMessageContent::Text(
system_content.text.clone(),
),
name: None,
},
));
......@@ -610,7 +864,41 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
top_k: req.top_k.map(|k| k as i32),
..Default::default()
},
nvext: None,
nvext: {
// Collect per-block cache_control: use the last one found
let mut last_block_cc: Option<CacheControl> = None;
for msg in &req.messages {
if let AnthropicMessageContent::Blocks { content } = &msg.content {
for block in content {
let block_cc = match block {
AnthropicContentBlock::Text { cache_control, .. } => {
cache_control.as_ref()
}
AnthropicContentBlock::ToolUse { cache_control, .. } => {
cache_control.as_ref()
}
AnthropicContentBlock::ToolResult { cache_control, .. } => {
cache_control.as_ref()
}
AnthropicContentBlock::Thinking { cache_control, .. } => {
cache_control.as_ref()
}
_ => None,
};
if let Some(cc) = block_cc {
last_block_cc = Some(cc.clone());
}
}
}
}
// Merge: top-level > per-block > system block cache_control
let system_cc = req.system.as_ref().and_then(|s| s.cache_control.clone());
let effective_cc = req.cache_control.clone().or(last_block_cc).or(system_cc);
effective_cc.map(|cc| NvExt {
cache_control: Some(cc),
..Default::default()
})
},
chat_template_args: None,
media_io_kwargs: None,
unsupported_fields: Default::default(),
......@@ -629,7 +917,7 @@ fn convert_user_blocks(
for block in blocks {
match block {
AnthropicContentBlock::Text { text } => {
AnthropicContentBlock::Text { text, .. } => {
text_parts.push(text.clone());
}
AnthropicContentBlock::ToolResult {
......@@ -664,8 +952,11 @@ fn convert_user_blocks(
}
AnthropicContentBlock::ToolUse { .. }
| AnthropicContentBlock::Thinking { .. }
| AnthropicContentBlock::Unknown { .. } => {
// tool_use/thinking/unknown in a user message: skip
| AnthropicContentBlock::RedactedThinking { .. }
| AnthropicContentBlock::ServerToolUse { .. }
| AnthropicContentBlock::WebSearchToolResult { .. }
| AnthropicContentBlock::Other(_) => {
// tool_use/thinking/server-side blocks/unknown in a user message: skip
}
}
}
......@@ -715,7 +1006,7 @@ fn convert_assistant_blocks(
for block in blocks {
match block {
AnthropicContentBlock::Text { text } => {
AnthropicContentBlock::Text { text, .. } => {
text_content.push_str(text);
}
AnthropicContentBlock::Thinking { thinking, .. } => {
......@@ -724,8 +1015,21 @@ fn convert_assistant_blocks(
}
pending_reasoning.push_str(thinking);
}
AnthropicContentBlock::ToolUse { id, name, input } => {
AnthropicContentBlock::RedactedThinking { .. } => {
// Redacted thinking is encrypted model reasoning. We can't read
// it but we preserve its position so it's not silently dropped.
// The actual encrypted data would need to be passed back to the
// model in multi-turn conversations for context continuity.
}
AnthropicContentBlock::ToolUse {
id, name, input, ..
}
| AnthropicContentBlock::ServerToolUse {
id, name, input, ..
} => {
// Snapshot the reasoning that preceded this tool call.
// Server-initiated tool use (e.g. web search) is treated the
// same as client tool use for conversion purposes.
segments.push(std::mem::take(&mut pending_reasoning));
tool_calls.push(ChatCompletionMessageToolCall {
id: id.clone(),
......@@ -798,14 +1102,27 @@ fn convert_assistant_blocks(
fn convert_anthropic_tools(tools: &[AnthropicTool]) -> Vec<ChatCompletionTool> {
tools
.iter()
.map(|tool| ChatCompletionTool {
r#type: ChatCompletionToolType::Function,
function: FunctionObject {
name: tool.name.clone(),
description: tool.description.clone(),
parameters: Some(tool.input_schema.clone()),
strict: None,
},
.filter_map(|tool| {
// Server tools (web_search, bash, etc.) don't have input_schema
// and can't be meaningfully converted to OpenAI function tools.
// They are backend-specific and handled separately.
let schema = tool.input_schema.clone().or_else(|| {
tracing::debug!(
tool_name = %tool.name,
tool_type = ?tool.tool_type,
"Skipping server tool in OpenAI conversion (no input_schema)"
);
None
})?;
Some(ChatCompletionTool {
r#type: ChatCompletionToolType::Function,
function: FunctionObject {
name: tool.name.clone(),
description: tool.description.clone(),
parameters: Some(schema),
strict: None,
},
})
})
.collect()
}
......@@ -877,6 +1194,20 @@ pub fn chat_completion_to_anthropic_response(
}
}
// Extract reasoning content (from --dyn-reasoning-parser, e.g. qwen3).
// The backend strips <think>...</think> from the text and surfaces it
// as reasoning_content on the message. Map this to a Thinking block
// so clients see proper extended thinking in the Anthropic response.
if let Some(thinking) = choice.message.reasoning_content.filter(|t| !t.is_empty()) {
content.insert(
0,
AnthropicResponseContentBlock::Thinking {
thinking,
signature: String::new(),
},
);
}
// Extract text content
let text = match choice.message.content {
Some(dynamo_async_openai::types::ChatCompletionMessageContent::Text(t)) => Some(t),
......@@ -889,8 +1220,11 @@ pub fn chat_completion_to_anthropic_response(
None => None,
};
if let Some(text) = text {
// Text goes first in the content array
content.insert(0, AnthropicResponseContentBlock::Text { text });
// Text goes after thinking block (if any)
content.push(AnthropicResponseContentBlock::Text {
text,
citations: None,
});
}
}
......@@ -898,15 +1232,24 @@ pub fn chat_completion_to_anthropic_response(
if content.is_empty() {
content.push(AnthropicResponseContentBlock::Text {
text: String::new(),
citations: None,
});
}
// Map usage
let usage = chat_resp
.usage
.map(|u| AnthropicUsage {
input_tokens: u.prompt_tokens,
output_tokens: u.completion_tokens,
.map(|u| {
let cache_read_input_tokens = u
.prompt_tokens_details
.and_then(|d| d.cached_tokens)
.filter(|&n| n > 0);
AnthropicUsage {
input_tokens: u.prompt_tokens,
output_tokens: u.completion_tokens,
cache_creation_input_tokens: None, // Not available from OpenAI format
cache_read_input_tokens,
}
})
.unwrap_or_default();
......@@ -936,7 +1279,7 @@ pub struct AnthropicCountTokensRequest {
skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_system_prompt"
)]
pub system: Option<String>,
pub system: Option<SystemContent>,
#[serde(default)]
pub tools: Option<Vec<AnthropicTool>>,
}
......@@ -953,7 +1296,7 @@ impl AnthropicCountTokensRequest {
let mut total_len: usize = 0;
if let Some(system) = &self.system {
total_len += system.len();
total_len += system.text.len();
}
for msg in &self.messages {
......@@ -979,7 +1322,9 @@ impl AnthropicCountTokensRequest {
if let Some(desc) = &tool.description {
total_len += desc.len();
}
total_len += tool.input_schema.to_string().len();
if let Some(schema) = &tool.input_schema {
total_len += schema.to_string().len();
}
}
}
......@@ -994,7 +1339,7 @@ impl AnthropicCountTokensRequest {
fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
match block {
AnthropicContentBlock::Text { text } => text.len(),
AnthropicContentBlock::Text { text, .. } => text.len(),
AnthropicContentBlock::ToolUse { name, input, .. } => name.len() + input.to_string().len(),
AnthropicContentBlock::ToolResult { content, .. } => content
.as_ref()
......@@ -1010,8 +1355,13 @@ fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
})
.unwrap_or(0),
AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),
AnthropicContentBlock::ServerToolUse { name, input, .. } => {
name.len() + input.to_string().len()
}
AnthropicContentBlock::WebSearchToolResult { content, .. } => content.to_string().len(),
AnthropicContentBlock::Image { .. } => 256, // rough estimate for image metadata
AnthropicContentBlock::Unknown { .. } => 0,
AnthropicContentBlock::Other(v) => v.to_string().len(),
}
}
......@@ -1043,6 +1393,11 @@ mod tests {
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
......@@ -1073,7 +1428,10 @@ mod tests {
content: "Hi".into(),
},
}],
system: Some("You are helpful.".into()),
system: Some(SystemContent {
text: "You are helpful.".into(),
cache_control: None,
}),
temperature: None,
top_p: None,
top_k: None,
......@@ -1082,6 +1440,11 @@ mod tests {
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
......@@ -1115,6 +1478,7 @@ mod tests {
id: "tool_123".into(),
name: "get_weather".into(),
input: serde_json::json!({"location": "SF"}),
cache_control: None,
}],
},
},
......@@ -1125,6 +1489,7 @@ mod tests {
tool_use_id: "tool_123".into(),
content: Some(ToolResultContent::Text("72F and sunny".into())),
is_error: None,
cache_control: None,
}],
},
},
......@@ -1138,6 +1503,11 @@ mod tests {
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
......@@ -1176,6 +1546,11 @@ mod tests {
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
......@@ -1202,16 +1577,24 @@ mod tests {
metadata: None,
tools: Some(vec![AnthropicTool {
name: "get_weather".into(),
tool_type: None,
description: Some("Get weather info".into()),
input_schema: serde_json::json!({
input_schema: Some(serde_json::json!({
"type": "object",
"properties": {"location": {"type": "string"}},
"required": ["location"]
}),
})),
cache_control: None,
}]),
tool_choice: Some(AnthropicToolChoice::Simple(AnthropicToolChoiceSimple {
choice_type: AnthropicToolChoiceMode::Auto,
disable_parallel_tool_use: None,
})),
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
......@@ -1274,7 +1657,7 @@ mod tests {
assert_eq!(response.usage.output_tokens, 5);
assert_eq!(response.content.len(), 1);
match &response.content[0] {
AnthropicResponseContentBlock::Text { text } => {
AnthropicResponseContentBlock::Text { text, .. } => {
assert_eq!(text, "Hello!");
}
_ => panic!("expected text block"),
......@@ -1335,6 +1718,7 @@ mod tests {
AnthropicContentBlock::Thinking {
thinking,
signature,
..
} => {
assert_eq!(thinking, "Let me reason about this...");
assert_eq!(signature, "sig123");
......@@ -1358,9 +1742,12 @@ mod tests {
AnthropicContentBlock::Thinking {
thinking: "I should think...".into(),
signature: "sig".into(),
cache_control: None,
},
AnthropicContentBlock::Text {
text: "Answer".into(),
citations: None,
cache_control: None,
},
],
},
......@@ -1374,6 +1761,11 @@ mod tests {
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
......@@ -1395,7 +1787,7 @@ mod tests {
}
#[test]
fn test_unknown_block_type_does_not_fail() {
fn test_known_and_unknown_block_types() {
let json = r#"{
"model": "test",
"max_tokens": 100,
......@@ -1405,6 +1797,8 @@ mod tests {
{"type": "text", "text": "hello"},
{"type": "server_tool_use", "id": "stu_1", "name": "web_search", "input": {}},
{"type": "redacted_thinking", "data": "encrypted"},
{"type": "web_search_tool_result", "tool_use_id": "stu_1", "content": [{"type": "web_search_result", "url": "https://example.com"}]},
{"type": "future_block_type", "some_field": 42},
{"type": "text", "text": "world"}
]
}]
......@@ -1412,22 +1806,32 @@ mod tests {
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
match &req.messages[0].content {
AnthropicMessageContent::Blocks { content } => {
assert_eq!(content.len(), 4);
assert_eq!(content.len(), 6);
assert!(matches!(&content[0], AnthropicContentBlock::Text { .. }));
assert!(matches!(
&content[1],
AnthropicContentBlock::Unknown { block_type } if block_type == "server_tool_use"
AnthropicContentBlock::ServerToolUse { name, .. } if name == "web_search"
));
assert!(matches!(
&content[2],
AnthropicContentBlock::Unknown { block_type } if block_type == "redacted_thinking"
AnthropicContentBlock::RedactedThinking { data } if data == "encrypted"
));
assert!(matches!(
&content[3],
AnthropicContentBlock::WebSearchToolResult { tool_use_id, .. } if tool_use_id == "stu_1"
));
// Truly unknown types still fall through to Other with full JSON preserved
assert!(matches!(
&content[4],
AnthropicContentBlock::Other(v) if v.get("type").and_then(|t| t.as_str()) == Some("future_block_type")
));
assert!(matches!(&content[3], AnthropicContentBlock::Text { .. }));
assert!(matches!(&content[5], AnthropicContentBlock::Text { .. }));
}
_ => panic!("expected blocks content"),
}
// Conversion should succeed, skipping unknown blocks
// Conversion should succeed — server_tool_use becomes a tool call,
// redacted_thinking and web_search_tool_result are preserved gracefully
let chat_req: NvCreateChatCompletionRequest = AnthropicCreateMessageRequest {
model: "test".into(),
max_tokens: 100,
......@@ -1441,10 +1845,25 @@ mod tests {
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
}
.try_into()
.unwrap();
// server_tool_use becomes a tool call on the assistant message
assert_eq!(chat_req.inner.messages.len(), 1);
match &chat_req.inner.messages[0] {
ChatCompletionRequestMessage::Assistant(a) => {
assert!(a.tool_calls.is_some());
let tc = a.tool_calls.as_ref().unwrap();
assert_eq!(tc.len(), 1);
assert_eq!(tc[0].function.name, "web_search");
}
other => panic!("expected assistant, got {other:?}"),
}
}
#[test]
......@@ -1510,7 +1929,10 @@ mod tests {
content: "Hello, world! This is a test message.".into(),
},
}],
system: Some("You are helpful.".into()),
system: Some(SystemContent {
text: "You are helpful.".into(),
cache_control: None,
}),
tools: None,
};
......@@ -1539,6 +1961,11 @@ mod tests {
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
match chat_req.inner.messages.into_iter().next().unwrap() {
......@@ -1552,6 +1979,7 @@ mod tests {
id: id.into(),
name: "fn".into(),
input: serde_json::json!({}),
cache_control: None,
}
}
......@@ -1559,6 +1987,7 @@ mod tests {
AnthropicContentBlock::Thinking {
thinking: text.into(),
signature: "sig".into(),
cache_control: None,
}
}
......@@ -1686,6 +2115,8 @@ mod tests {
thinking("A"),
AnthropicContentBlock::Text {
text: "answer".into(),
citations: None,
cache_control: None,
},
]);
......@@ -1776,4 +2207,387 @@ mod tests {
assert_eq!(tools[0].id, "t1");
assert_eq!(tools[1].id, "t2");
}
#[test]
fn test_cache_control_passthrough() {
use crate::protocols::openai::nvext::{CacheControl, CacheControlType};
let req = AnthropicCreateMessageRequest {
model: "test-model".into(),
max_tokens: 100,
messages: vec![AnthropicMessage {
role: AnthropicRole::User,
content: AnthropicMessageContent::Text {
content: "Hello".into(),
},
}],
system: None,
temperature: None,
top_p: None,
top_k: None,
stop_sequences: None,
stream: false,
metadata: None,
tools: None,
tool_choice: None,
cache_control: Some(CacheControl {
control_type: CacheControlType::Ephemeral,
ttl: None,
}),
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
let nvext = chat_req.nvext.expect("nvext should be set");
let cc = nvext
.cache_control
.expect("nvext.cache_control should be set");
assert_eq!(cc.control_type, CacheControlType::Ephemeral);
assert_eq!(cc.ttl_seconds(), 300);
}
#[test]
fn test_cache_control_1h_ttl_passthrough() {
use crate::protocols::openai::nvext::CacheControlType;
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"cache_control": {"type": "ephemeral", "ttl": "1h"}
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
assert!(req.cache_control.is_some());
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
let nvext = chat_req.nvext.expect("nvext should be set");
let cc = nvext
.cache_control
.expect("nvext.cache_control should be set");
assert_eq!(cc.control_type, CacheControlType::Ephemeral);
assert_eq!(cc.ttl_seconds(), 3600);
}
#[test]
fn test_no_cache_control_passthrough() {
let req = AnthropicCreateMessageRequest {
model: "test-model".into(),
max_tokens: 100,
messages: vec![AnthropicMessage {
role: AnthropicRole::User,
content: AnthropicMessageContent::Text {
content: "Hello".into(),
},
}],
system: None,
temperature: None,
top_p: None,
top_k: None,
stop_sequences: None,
stream: false,
metadata: None,
tools: None,
tool_choice: None,
cache_control: None,
thinking: None,
service_tier: None,
container: None,
output_config: None,
};
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
assert!(chat_req.nvext.is_none());
}
#[test]
fn test_per_block_cache_control_deserialization() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "user",
"content": [
{"type": "text", "text": "Hello", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "World"}
]
}]
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
match &req.messages[0].content {
AnthropicMessageContent::Blocks { content } => {
match &content[0] {
AnthropicContentBlock::Text { cache_control, .. } => {
assert!(cache_control.is_some());
}
other => panic!("expected Text, got {other:?}"),
}
match &content[1] {
AnthropicContentBlock::Text { cache_control, .. } => {
assert!(cache_control.is_none());
}
other => panic!("expected Text, got {other:?}"),
}
}
_ => panic!("expected blocks"),
}
}
#[test]
fn test_per_block_cache_control_last_wins() {
use crate::protocols::openai::nvext::CacheControlType;
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "system context", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "recent context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
]
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
let nvext = chat_req.nvext.expect("nvext should be set");
let cc = nvext.cache_control.expect("cache_control should be set");
assert_eq!(cc.control_type, CacheControlType::Ephemeral);
assert_eq!(cc.ttl_seconds(), 3600); // Last block's 1h TTL wins
}
#[test]
fn test_top_level_cache_control_overrides_per_block() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
],
"cache_control": {"type": "ephemeral"}
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
let nvext = chat_req.nvext.expect("nvext should be set");
let cc = nvext.cache_control.expect("cache_control should be set");
// Top-level (no TTL = 300s default) takes precedence over per-block (1h)
assert_eq!(cc.ttl_seconds(), 300);
}
#[test]
fn test_system_block_array_with_cache_control() {
use crate::protocols::openai::nvext::CacheControlType;
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"system": [
{"type": "text", "text": "You are a helpful assistant.", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "Be concise."}
]
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
let system = req.system.as_ref().unwrap();
assert_eq!(system.text, "You are a helpful assistant.\nBe concise.");
// The LAST block with cache_control wins (first block here)
assert!(system.cache_control.is_some());
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
let nvext = chat_req
.nvext
.expect("nvext should be set from system cache_control");
let cc = nvext.cache_control.expect("cache_control should be set");
assert_eq!(cc.control_type, CacheControlType::Ephemeral);
}
#[test]
fn test_system_string_no_cache_control() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"system": "You are helpful."
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
let system = req.system.as_ref().unwrap();
assert_eq!(system.text, "You are helpful.");
assert!(system.cache_control.is_none());
}
#[test]
fn test_text_block_with_citations() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "assistant",
"content": [
{
"type": "text",
"text": "According to the document...",
"citations": [
{"type": "char_location", "cited_text": "relevant text", "document_index": 0, "start_char_index": 0, "end_char_index": 13}
]
}
]
}]
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
match &req.messages[0].content {
AnthropicMessageContent::Blocks { content } => match &content[0] {
AnthropicContentBlock::Text { citations, .. } => {
assert!(citations.is_some());
let cites = citations.as_ref().unwrap();
assert_eq!(cites.len(), 1);
assert_eq!(cites[0]["type"], "char_location");
}
other => panic!("expected Text, got {other:?}"),
},
_ => panic!("expected blocks"),
}
}
#[test]
fn test_redacted_thinking_block() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "assistant",
"content": [
{"type": "thinking", "thinking": "visible reasoning", "signature": "sig1"},
{"type": "redacted_thinking", "data": "base64-encrypted-data"},
{"type": "text", "text": "Final answer"}
]
}]
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
match &req.messages[0].content {
AnthropicMessageContent::Blocks { content } => {
assert_eq!(content.len(), 3);
assert!(matches!(
&content[0],
AnthropicContentBlock::Thinking { .. }
));
match &content[1] {
AnthropicContentBlock::RedactedThinking { data } => {
assert_eq!(data, "base64-encrypted-data");
}
other => panic!("expected RedactedThinking, got {other:?}"),
}
assert!(matches!(&content[2], AnthropicContentBlock::Text { .. }));
}
_ => panic!("expected blocks"),
}
}
#[test]
fn test_server_tool_use_and_web_search_result() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{
"role": "assistant",
"content": [
{"type": "server_tool_use", "id": "stu_1", "name": "web_search", "input": {"query": "rust programming"}},
{"type": "web_search_tool_result", "tool_use_id": "stu_1", "content": [{"type": "web_search_result", "url": "https://www.rust-lang.org", "title": "Rust"}]},
{"type": "text", "text": "Based on my search..."}
]
}]
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
match &req.messages[0].content {
AnthropicMessageContent::Blocks { content } => {
assert_eq!(content.len(), 3);
match &content[0] {
AnthropicContentBlock::ServerToolUse { id, name, input } => {
assert_eq!(id, "stu_1");
assert_eq!(name, "web_search");
assert_eq!(input["query"], "rust programming");
}
other => panic!("expected ServerToolUse, got {other:?}"),
}
match &content[1] {
AnthropicContentBlock::WebSearchToolResult {
tool_use_id,
content,
} => {
assert_eq!(tool_use_id, "stu_1");
assert!(content.is_array());
}
other => panic!("expected WebSearchToolResult, got {other:?}"),
}
}
_ => panic!("expected blocks"),
}
// ServerToolUse should convert to a tool call
let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
match &chat_req.inner.messages[0] {
ChatCompletionRequestMessage::Assistant(a) => {
let tc = a.tool_calls.as_ref().expect("should have tool calls");
assert_eq!(tc.len(), 1);
assert_eq!(tc[0].id, "stu_1");
assert_eq!(tc[0].function.name, "web_search");
}
other => panic!("expected assistant, got {other:?}"),
}
}
#[test]
fn test_thinking_config_deserialization() {
let json = r#"{
"model": "test",
"max_tokens": 16000,
"messages": [{"role": "user", "content": "Solve this step by step"}],
"thinking": {"type": "enabled", "budget_tokens": 10000}
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
let thinking = req.thinking.as_ref().expect("thinking should be set");
assert_eq!(thinking.thinking_type, "enabled");
assert_eq!(thinking.budget_tokens, Some(10000));
}
#[test]
fn test_thinking_config_disabled() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"thinking": {"type": "disabled"}
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
let thinking = req.thinking.as_ref().expect("thinking should be set");
assert_eq!(thinking.thinking_type, "disabled");
assert!(thinking.budget_tokens.is_none());
}
#[test]
fn test_disable_parallel_tool_use() {
let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"tools": [{"name": "get_weather", "description": "Get weather", "input_schema": {"type": "object"}}],
"tool_choice": {"type": "auto", "disable_parallel_tool_use": true}
}"#;
let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();
match &req.tool_choice {
Some(AnthropicToolChoice::Simple(s)) => {
assert_eq!(s.choice_type, AnthropicToolChoiceMode::Auto);
assert_eq!(s.disable_parallel_tool_use, Some(true));
}
other => panic!("expected Simple tool choice, got {other:?}"),
}
}
}
......@@ -92,6 +92,10 @@ impl NvExtProvider for NvCreateChatCompletionRequest {
fn raw_prompt(&self) -> Option<String> {
None
}
fn effective_cache_control(&self) -> Option<&crate::protocols::openai::nvext::CacheControl> {
NvExtProvider::nvext(self).and_then(|ext| ext.cache_control.as_ref())
}
}
/// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`,
......
......@@ -49,6 +49,13 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap)
pub trait NvExtProvider {
fn nvext(&self) -> Option<&NvExt>;
fn raw_prompt(&self) -> Option<String>;
/// Return the effective cache control for this request.
/// Default: delegates to `nvext.cache_control`. Implementations may override
/// to also check a top-level `cache_control` field (see `NvCreateChatCompletionRequest`).
fn effective_cache_control(&self) -> Option<&CacheControl> {
self.nvext().and_then(|ext| ext.cache_control.as_ref())
}
}
/// Worker ID information for disaggregated serving
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment