Unverified Commit 2887cd1c authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

refactor(1/3): move `nvext` to `dynamo-llm` and move `anthropic` to `dynamo-async-openai` (#7564)

parent d6136f4a
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Anthropic Messages API types.
//!
//! Pure protocol types for the `/v1/messages` endpoint -- request, response,
//! streaming events, error shapes, and count-tokens types.
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
/// Anthropic-style cache control hint for prefix pinning with TTL.
///
/// On the wire this is `{"type": ..., "ttl": ...}`; `ttl` is left out of the
/// serialized form when it is `None`.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
pub struct CacheControl {
/// Kind of cache control, carried under the JSON key `type`.
#[serde(rename = "type")]
pub control_type: CacheControlType,
/// TTL as a string: decimal seconds (e.g. "600") or shorthand
/// ("5m" = 300s, "1h" = 3600s). [`CacheControl::ttl_seconds`] clamps the
/// parsed value to [300, 3600].
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ttl: Option<String>,
}
/// Discriminator for [`CacheControl`], written in lowercase on the wire.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum CacheControlType {
/// `"ephemeral"`; also the `Default` value.
#[default]
Ephemeral,
/// Receives any type string that is not one of the named variants, so an
/// unrecognized value does not fail deserialization.
#[serde(other)]
Unknown,
}
/// Smallest TTL (in seconds) that `ttl_seconds` will return; also its fallback.
const MIN_TTL_SECONDS: u64 = 300;
/// Largest TTL (in seconds) that `ttl_seconds` will return.
const MAX_TTL_SECONDS: u64 = 3600;

impl CacheControl {
    /// Resolve the `ttl` string into a number of seconds within [300, 3600].
    ///
    /// * `None` yields 300.
    /// * `"5m"` and `"1h"` map to 300 and 3600 respectively.
    /// * Any other string is parsed as unsigned decimal seconds and then
    ///   clamped into the allowed range.
    /// * A string that is neither shorthand nor a valid `u64` logs a warning
    ///   and yields 300.
    pub fn ttl_seconds(&self) -> u64 {
        let requested: u64 = match self.ttl.as_deref() {
            // No TTL supplied: use the minimum.
            None => return MIN_TTL_SECONDS,
            Some(ttl) => match ttl {
                "5m" => 300,
                "1h" => 3600,
                numeric => {
                    if let Ok(secs) = numeric.parse::<u64>() {
                        secs
                    } else {
                        tracing::warn!("Unrecognized TTL '{}', defaulting to 300s", numeric);
                        return MIN_TTL_SECONDS;
                    }
                }
            },
        };

        // Raise to the floor first, then cap at the ceiling.
        requested.max(MIN_TTL_SECONDS).min(MAX_TTL_SECONDS)
    }
}
/// Parsed system prompt content, preserving cache_control from block arrays.
///
/// This is the normalized form: whichever input shape was accepted, the text
/// ends up as a single `String`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemContent {
/// The concatenated text from all system blocks (or the plain string).
pub text: String,
/// Cache control from the last system block that had one.
/// Omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
}
/// Serde `deserialize_with` helper for the `system` field.
///
/// Two input shapes are accepted:
///
/// * a plain string: `"system": "text"`
/// * an array of blocks:
///   `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`
///
/// For the array form the block texts are joined with `"\n"`, and the
/// resulting `cache_control` is taken from the last block that carries one.
/// Only `text` and `cache_control` are read from each block. A JSON `null`
/// produces `Ok(None)`.
fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<SystemContent>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    /// One element of the array form.
    #[derive(Deserialize)]
    struct SystemBlock {
        text: String,
        #[serde(default)]
        cache_control: Option<CacheControl>,
    }

    /// The two accepted wire shapes, tried in declaration order.
    #[derive(Deserialize)]
    #[serde(untagged)]
    enum SystemPrompt {
        Text(String),
        Blocks(Vec<SystemBlock>),
    }

    let prompt = match Option::<SystemPrompt>::deserialize(deserializer)? {
        Some(prompt) => prompt,
        None => return Ok(None),
    };

    let content = match prompt {
        SystemPrompt::Text(text) => SystemContent {
            text,
            cache_control: None,
        },
        SystemPrompt::Blocks(blocks) => {
            let mut cache_control = None;
            let mut pieces = Vec::with_capacity(blocks.len());
            for block in blocks {
                // Later blocks override earlier ones, so the last hint wins.
                if block.cache_control.is_some() {
                    cache_control = block.cache_control;
                }
                pieces.push(block.text);
            }
            SystemContent {
                text: pieces.join("\n"),
                cache_control,
            }
        }
    };

    Ok(Some(content))
}
/// Top-level request body for `POST /v1/messages`.
///
/// `model`, `max_tokens` and `messages` are required. Every `Option` field is
/// left out of serialized output when `None`, and `stream` falls back to
/// `false` when absent from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicCreateMessageRequest {
/// The model to use (e.g. "claude-sonnet-4-20250514").
pub model: String,
/// The maximum number of tokens to generate.
pub max_tokens: u32,
/// The conversation messages.
pub messages: Vec<AnthropicMessage>,
/// Optional system prompt (string or array of `{"type":"text","text":"..."}` blocks).
/// Both shapes are normalized into [`SystemContent`] by
/// `deserialize_system_prompt`.
#[serde(
default,
skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_system_prompt"
)]
pub system: Option<SystemContent>,
/// Sampling temperature (0.0 - 1.0).
#[serde(skip_serializing_if = "Option::is_none")]
pub temperature: Option<f32>,
/// Nucleus sampling parameter.
#[serde(skip_serializing_if = "Option::is_none")]
pub top_p: Option<f32>,
/// Top-K sampling parameter.
#[serde(skip_serializing_if = "Option::is_none")]
pub top_k: Option<u32>,
/// Custom stop sequences.
#[serde(skip_serializing_if = "Option::is_none")]
pub stop_sequences: Option<Vec<String>>,
/// Whether to stream the response. Defaults to `false` when omitted.
#[serde(default)]
pub stream: bool,
/// Optional metadata (e.g. user_id). Kept as raw JSON.
#[serde(skip_serializing_if = "Option::is_none")]
pub metadata: Option<serde_json::Value>,
/// Tools the model may call.
#[serde(skip_serializing_if = "Option::is_none")]
pub tools: Option<Vec<AnthropicTool>>,
/// How the model should choose which tool to call.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_choice: Option<AnthropicToolChoice>,
/// Top-level cache control for automatic prompt prefix caching.
/// When present, the system caches all content up to the last cacheable block.
/// Matches the Anthropic Messages API automatic caching mode.
/// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
/// Extended thinking configuration. When enabled, the model produces
/// `thinking` content blocks containing its internal reasoning before
/// the final response. The `budget_tokens` field controls how many tokens
/// the model may use for thinking (must be >= 1024 and < max_tokens).
/// That bound is not checked by this type.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub thinking: Option<ThinkingConfig>,
/// Service tier selection: `"auto"` or `"standard_only"`.
/// Stored as a free-form string; the value is not validated here.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_tier: Option<String>,
/// Container identifier for stateful sandbox sessions.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub container: Option<String>,
/// Output configuration: effort level and optional JSON schema format.
/// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
/// `format` specifies structured JSON output constraints.
/// Kept as raw JSON, so neither key is validated by this type.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub output_config: Option<serde_json::Value>,
}
/// Extended thinking configuration for the request.
///
/// When `type` is `"enabled"`, the model will produce `thinking` content blocks
/// with its internal reasoning. `budget_tokens` controls the maximum tokens
/// available for thinking (minimum 1024, must be less than `max_tokens`).
/// When `type` is `"disabled"`, no thinking blocks are produced.
///
/// Both the `type` string and the budget bounds are accepted as-is; this type
/// performs no validation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingConfig {
/// Either `"enabled"` or `"disabled"`. Carried under the JSON key `type`.
#[serde(rename = "type")]
pub thinking_type: String,
/// Maximum tokens for internal reasoning. Only relevant when type is "enabled".
/// Omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub budget_tokens: Option<u32>,
}
/// A single message in the conversation.
///
/// Serialized as `{"role": ..., "content": ...}`. The `content` key is not a
/// field of this struct: it comes from the flattened
/// [`AnthropicMessageContent`] variant, each of which has exactly one field
/// named `content`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessage {
/// Who sent the message.
pub role: AnthropicRole,
/// Message payload; flattened so its inner `content` key sits next to `role`.
#[serde(flatten)]
pub content: AnthropicMessageContent,
}
/// The role of a message sender.
///
/// Written in lowercase on the wire (`"user"`, `"assistant"`). Any other
/// string fails deserialization because there is no catch-all variant.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicRole {
User,
Assistant,
}
/// Message content -- either a plain string or an array of content blocks.
///
/// The enum is untagged, so variants are tried in declaration order: a string
/// `content` matches `Text`, an array matches `Blocks`. Both variants wrap a
/// field named `content`, which lets [`AnthropicMessage`] flatten this enum.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicMessageContent {
/// Plain text content.
Text { content: String },
/// Array of structured content blocks.
Blocks { content: Vec<AnthropicContentBlock> },
}
/// A single content block within a message.
///
/// `Serialize` is derived (internally tagged on `type`), while `Deserialize`
/// is implemented by hand so that block types with no dedicated variant here
/// are captured as `Other(Value)` instead of causing a hard deserialization
/// failure. This is important because Claude Code may send block types that
/// we don't yet handle.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type")]
pub enum AnthropicContentBlock {
/// Text content block. May optionally include `citations` -- references to
/// source documents that support the text content. Citations are generated
/// by the model when document/PDF content is provided and citation mode is enabled.
#[serde(rename = "text")]
Text {
text: String,
/// Raw citation objects, kept as untyped JSON.
#[serde(default, skip_serializing_if = "Option::is_none")]
citations: Option<Vec<serde_json::Value>>,
/// Optional cache breakpoint on this block.
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Image content block.
#[serde(rename = "image")]
Image { source: AnthropicImageSource },
/// Tool use request from assistant.
#[serde(rename = "tool_use")]
ToolUse {
id: String,
name: String,
/// Tool arguments as raw JSON.
input: serde_json::Value,
/// Optional cache breakpoint on this block.
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Tool result from user.
#[serde(rename = "tool_result")]
ToolResult {
/// Identifier of the `tool_use` block this result answers.
tool_use_id: String,
/// Result payload: a plain string or an array of blocks.
#[serde(default, skip_serializing_if = "Option::is_none")]
content: Option<ToolResultContent>,
/// Marks the result as an error when `Some(true)`.
#[serde(skip_serializing_if = "Option::is_none")]
is_error: Option<bool>,
/// Optional cache breakpoint on this block.
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Thinking content block from assistant (extended thinking / reasoning).
#[serde(rename = "thinking")]
Thinking {
thinking: String,
signature: String,
/// Optional cache breakpoint on this block.
#[serde(default, skip_serializing_if = "Option::is_none")]
cache_control: Option<CacheControl>,
},
/// Redacted thinking block from assistant. Contains encrypted reasoning data
/// that is opaque to the client but must be passed back verbatim in multi-turn
/// conversations so the model can maintain its chain of thought.
#[serde(rename = "redacted_thinking")]
RedactedThinking { data: String },
/// Server-initiated tool use block. Represents a tool call that the API
/// executes server-side (e.g., web search). The client receives the result
/// via a corresponding `web_search_tool_result` or similar block.
#[serde(rename = "server_tool_use")]
ServerToolUse {
id: String,
name: String,
#[serde(default)]
input: serde_json::Value,
},
/// Result from a server-initiated tool (e.g., web search results).
/// Contains structured content returned by the server-side tool execution.
#[serde(rename = "web_search_tool_result")]
WebSearchToolResult {
tool_use_id: String,
#[serde(default)]
content: serde_json::Value,
},
/// Catch-all for unrecognized block types. Preserves the full JSON value
/// so that new Anthropic features don't break the endpoint and can be
/// round-tripped or inspected.
#[serde(untagged)]
Other(serde_json::Value),
}
/// Content of a `tool_result` block -- either a plain string or an array of
/// content blocks (the Anthropic API accepts both).
///
/// Untagged: a JSON string matches `Text`, a JSON array matches `Blocks`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContent {
/// Result given as a single string.
Text(String),
/// Result given as an array of blocks.
Blocks(Vec<ToolResultContentBlock>),
}
impl ToolResultContent {
    /// Consume the content and return its text.
    ///
    /// A `Text` value is returned unchanged. For `Blocks`, the text of every
    /// `Text` block is appended in order with no separator, and non-text
    /// blocks are skipped.
    pub fn into_text(self) -> String {
        let blocks = match self {
            ToolResultContent::Text(text) => return text,
            ToolResultContent::Blocks(blocks) => blocks,
        };

        let mut combined = String::new();
        for block in blocks {
            match block {
                ToolResultContentBlock::Text { text } => combined.push_str(&text),
                // Non-text payloads contribute nothing to the text view.
                ToolResultContentBlock::Other(_) => {}
            }
        }
        combined
    }
}
/// A content block within a `tool_result.content` array.
///
/// Untagged, so variants are tried in declaration order: `Text` first, then
/// `Other` for anything that did not match.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContentBlock {
/// A block carrying a string `text` field.
Text {
text: String,
},
/// Catch-all for non-text blocks (images, etc.) in tool results.
Other(serde_json::Value),
}
/// Custom deserializer for `AnthropicContentBlock` that handles unknown types
/// gracefully. Since serde's `#[serde(other)]` is not supported on internally
/// tagged enums, we deserialize as `Value` first and dispatch manually.
impl<'de> Deserialize<'de> for AnthropicContentBlock {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let value = serde_json::Value::deserialize(deserializer)?;
let block_type = value
.get("type")
.and_then(|t| t.as_str())
.unwrap_or("")
.to_string();
match block_type.as_str() {
"text" => {
let text = value
.get("text")
.and_then(|t| t.as_str())
.ok_or_else(|| serde::de::Error::missing_field("text"))?
.to_string();
let citations: Option<Vec<serde_json::Value>> = value
.get("citations")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Text {
text,
citations,
cache_control,
})
}
"image" => {
let source: AnthropicImageSource =
serde_json::from_value(value.get("source").cloned().unwrap_or_default())
.map_err(serde::de::Error::custom)?;
Ok(AnthropicContentBlock::Image { source })
}
"tool_use" => {
let id = value
.get("id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string();
let name = value
.get("name")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolUse {
id,
name,
input,
cache_control,
})
}
"tool_result" => {
let tool_use_id = value
.get("tool_use_id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string();
let content: Option<ToolResultContent> = value
.get("content")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
let is_error = value.get("is_error").and_then(|v| v.as_bool());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolResult {
tool_use_id,
content,
is_error,
cache_control,
})
}
"thinking" => {
let thinking = value
.get("thinking")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("thinking"))?
.to_string();
let signature = value
.get("signature")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("signature"))?
.to_string();
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Thinking {
thinking,
signature,
cache_control,
})
}
"redacted_thinking" => {
let data = value
.get("data")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("data"))?
.to_string();
Ok(AnthropicContentBlock::RedactedThinking { data })
}
"server_tool_use" => {
let id = value
.get("id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string();
let name = value
.get("name")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
Ok(AnthropicContentBlock::ServerToolUse { id, name, input })
}
"web_search_tool_result" => {
let tool_use_id = value
.get("tool_use_id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string();
let content = value
.get("content")
.cloned()
.unwrap_or(serde_json::json!([]));
Ok(AnthropicContentBlock::WebSearchToolResult {
tool_use_id,
content,
})
}
other => {
tracing::debug!(
"Unrecognized Anthropic content block type '{}', preserving as Other",
other
);
Ok(AnthropicContentBlock::Other(value))
}
}
}
}
/// Image source for image content blocks.
///
/// All three fields are required, so a source object lacking `media_type` or
/// `data` fails to deserialize.
/// NOTE(review): presumably `data` holds base64-encoded bytes and
/// `source_type` is `"base64"` -- confirm against callers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicImageSource {
/// Source kind, carried under the JSON key `type`.
#[serde(rename = "type")]
pub source_type: String,
/// Media type string for the image.
pub media_type: String,
/// Image payload as a string.
pub data: String,
}
/// A tool definition.
///
/// Client tools (custom) require `name` + `input_schema`. Server tools
/// (web_search, bash, text_editor, code_execution, etc.) are discriminated
/// by their `type` field (e.g. `"web_search_20260209"`) and may not have
/// `input_schema`. We keep all fields optional beyond `name` so both
/// kinds deserialize successfully and pass through to the backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicTool {
/// Tool name (required for client tools, present on server tools too).
pub name: String,
/// Tool type discriminator. Client tools use `"custom"` (or omit).
/// Server tools use versioned types like `"web_search_20260209"`.
#[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
pub tool_type: Option<String>,
/// Human-readable description of the tool. Omitted from serialized output
/// when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// JSON Schema for the tool input. Required for client tools, absent on
/// server tools (which define their own input shape server-side).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub input_schema: Option<serde_json::Value>,
/// Cache control breakpoint on this tool definition.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_control: Option<CacheControl>,
}
/// Tool choice specification.
///
/// Untagged: serde tries `Named` first and falls back to `Simple`. The two
/// shapes differ only in that `Named` requires a `name` field.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicToolChoice {
/// Named tool: `{type: "tool", name: "..."}`
/// Must be listed before Simple so serde tries the stricter shape first.
Named(AnthropicToolChoiceNamed),
/// Simple mode: "auto", "any", or "none".
Simple(AnthropicToolChoiceSimple),
}
/// Simple tool choice modes.
///
/// Shape: `{"type": <mode>}` with an optional `disable_parallel_tool_use` flag.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceSimple {
/// Selected mode, carried under the JSON key `type`.
#[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
}
/// Value of the `type` key in a tool choice, written in lowercase on the wire
/// (`"auto"`, `"any"`, `"none"`, `"tool"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicToolChoiceMode {
Auto,
Any,
None,
Tool,
}
/// Named tool choice.
///
/// Shape: `{"type": <mode>, "name": "..."}`. The mode is not restricted to
/// `tool` by this type; any [`AnthropicToolChoiceMode`] value deserializes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceNamed {
/// Selected mode, carried under the JSON key `type`.
#[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode,
/// Name of the tool to use.
pub name: String,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
}
/// Response body for `POST /v1/messages` (non-streaming).
///
/// `stop_reason` and `stop_sequence` carry no skip attribute, so they are
/// written as JSON `null` when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageResponse {
/// Message identifier.
pub id: String,
/// Object kind, carried under the JSON key `type`.
#[serde(rename = "type")]
pub object_type: String,
/// Role of the responder, as a free-form string.
pub role: String,
/// Generated content blocks, in order.
pub content: Vec<AnthropicResponseContentBlock>,
/// Model that produced the response.
pub model: String,
/// Why generation stopped, if known.
pub stop_reason: Option<AnthropicStopReason>,
/// The stop sequence that was hit, if any.
pub stop_sequence: Option<String>,
/// Token accounting for the request.
pub usage: AnthropicUsage,
}
/// A content block in the response.
///
/// The Anthropic API returns up to 12 different block types. We model the
/// common ones explicitly and catch the rest as `Other` so the proxy can
/// forward them without losing data.
///
/// Internally tagged on `type`; the untagged `Other` variant holds the whole
/// JSON value of a block that matched none of the named variants.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicResponseContentBlock {
/// Reasoning text together with its signature.
#[serde(rename = "thinking")]
Thinking { thinking: String, signature: String },
/// Plain text output, optionally with raw citation objects.
#[serde(rename = "text")]
Text {
text: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
citations: Option<Vec<serde_json::Value>>,
},
/// Tool call requested by the model; `input` is raw JSON.
#[serde(rename = "tool_use")]
ToolUse {
id: String,
name: String,
input: serde_json::Value,
},
/// Opaque redacted reasoning payload.
#[serde(rename = "redacted_thinking")]
RedactedThinking { data: String },
/// Tool call executed on the server side; `input` defaults to JSON null
/// when absent.
#[serde(rename = "server_tool_use")]
ServerToolUse {
id: String,
name: String,
#[serde(default)]
input: serde_json::Value,
},
/// Result of a server-side web search; `content` defaults to JSON null
/// when absent.
#[serde(rename = "web_search_tool_result")]
WebSearchToolResult {
tool_use_id: String,
#[serde(default)]
content: serde_json::Value,
},
/// Catch-all for new/uncommon block types (web_fetch_tool_result,
/// code_execution_tool_result, container_upload, etc.) so the proxy
/// can serialize them back without data loss.
#[serde(untagged)]
Other(serde_json::Value),
}
/// Token usage information.
///
/// Both token counts default to `0` when absent from the input. Streaming
/// `message_delta` events can carry a partial usage object (for example only
/// `output_tokens`), and such an event would otherwise fail to deserialize.
/// Serialized output is unaffected: both counts are always written.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AnthropicUsage {
    /// Number of input (prompt) tokens; `0` when absent from the input.
    #[serde(default)]
    pub input_tokens: u32,
    /// Number of generated tokens; `0` when absent from the input.
    #[serde(default)]
    pub output_tokens: u32,
    /// Number of input tokens used to create a new cache entry.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_creation_input_tokens: Option<u32>,
    /// Number of input tokens read from the prompt cache (prefix cache hits).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_read_input_tokens: Option<u32>,
}
/// Reason the model stopped generating.
///
/// Written in snake_case on the wire (e.g. `"end_turn"`, `"max_tokens"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum AnthropicStopReason {
/// `"end_turn"`.
EndTurn,
/// `"max_tokens"`.
MaxTokens,
/// `"stop_sequence"`.
StopSequence,
/// `"tool_use"`.
ToolUse,
/// The model paused to yield control in an agentic loop, intending to
/// continue in a subsequent turn. Used with extended thinking / tool use.
PauseTurn,
/// The model refused to generate content (safety refusal).
Refusal,
}
/// SSE event types for the Anthropic streaming API.
///
/// Internally tagged on `type`; each variant's rename is the event name that
/// appears in the JSON payload.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicStreamEvent {
/// Carries a full message object.
#[serde(rename = "message_start")]
MessageStart { message: AnthropicMessageResponse },
/// Announces the content block at `index`.
#[serde(rename = "content_block_start")]
ContentBlockStart {
index: u32,
content_block: AnthropicResponseContentBlock,
},
/// Incremental update for the content block at `index`.
#[serde(rename = "content_block_delta")]
ContentBlockDelta { index: u32, delta: AnthropicDelta },
/// Marks the content block at `index` as finished.
#[serde(rename = "content_block_stop")]
ContentBlockStop { index: u32 },
/// Message-level update carrying stop information and usage.
#[serde(rename = "message_delta")]
MessageDelta {
delta: AnthropicMessageDeltaBody,
usage: AnthropicUsage,
},
/// Event with no payload besides its type.
#[serde(rename = "message_stop")]
MessageStop {},
/// Event with no payload besides its type.
#[serde(rename = "ping")]
Ping {},
/// Error reported inside the stream.
#[serde(rename = "error")]
Error { error: AnthropicErrorBody },
}
/// Delta content in a streaming content_block_delta event.
///
/// Internally tagged on `type`. There is no catch-all variant, so a delta
/// type not listed here fails to deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicDelta {
/// Fragment of reasoning text.
#[serde(rename = "thinking_delta")]
ThinkingDelta { thinking: String },
/// Fragment of output text.
#[serde(rename = "text_delta")]
TextDelta { text: String },
/// Fragment of a JSON string, carried as `partial_json`.
#[serde(rename = "input_json_delta")]
InputJsonDelta { partial_json: String },
/// Incremental signature for a thinking block (sent at the end).
#[serde(rename = "signature_delta")]
SignatureDelta { signature: String },
/// Incremental citation attached to a text block.
#[serde(rename = "citations_delta")]
CitationsDelta { citation: serde_json::Value },
}
/// The delta body in a message_delta event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageDeltaBody {
/// Why generation stopped. Has no skip attribute, so `None` is written as
/// JSON `null`.
pub stop_reason: Option<AnthropicStopReason>,
/// The stop sequence that was hit, if any. Omitted from serialized output
/// when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub stop_sequence: Option<String>,
}
/// Anthropic API error response wrapper.
///
/// Shape: `{"type": ..., "error": {"type": ..., "message": ...}}`. The
/// constructors on this type always set the outer `type` to `"error"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorResponse {
/// Outer object kind, carried under the JSON key `type`.
#[serde(rename = "type")]
pub object_type: String,
/// The error details.
pub error: AnthropicErrorBody,
}
/// Error body within an error response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorBody {
/// Error category (e.g. `"invalid_request_error"`), carried under the JSON
/// key `type`.
#[serde(rename = "type")]
pub error_type: String,
/// Human-readable error description.
pub message: String,
}
impl AnthropicErrorResponse {
/// Create an `invalid_request_error` response.
pub fn invalid_request(message: impl Into<String>) -> Self {
Self {
object_type: "error".to_string(),
error: AnthropicErrorBody {
error_type: "invalid_request_error".to_string(),
message: message.into(),
},
}
}
/// Create an `api_error` (internal server error) response.
pub fn api_error(message: impl Into<String>) -> Self {
Self {
object_type: "error".to_string(),
error: AnthropicErrorBody {
error_type: "api_error".to_string(),
message: message.into(),
},
}
}
/// Create a `not_found_error` response.
pub fn not_found(message: impl Into<String>) -> Self {
Self {
object_type: "error".to_string(),
error: AnthropicErrorBody {
error_type: "not_found_error".to_string(),
message: message.into(),
},
}
}
}
/// Request body for `POST /v1/messages/count_tokens`.
///
/// Deserialize-only: this type does not derive `Serialize`.
#[derive(Debug, Clone, Deserialize)]
pub struct AnthropicCountTokensRequest {
/// The model to count tokens for.
pub model: String,
/// The conversation messages.
pub messages: Vec<AnthropicMessage>,
/// Optional system prompt (string or array of text blocks), parsed by
/// `deserialize_system_prompt`.
// NOTE(review): `skip_serializing_if` has no effect here because the struct
// only derives `Deserialize`.
#[serde(
default,
skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_system_prompt"
)]
pub system: Option<SystemContent>,
/// Tool definitions to include in the count; `None` when absent.
#[serde(default)]
pub tools: Option<Vec<AnthropicTool>>,
}
/// Response body for `POST /v1/messages/count_tokens`.
///
/// Serialize-only: this type does not derive `Deserialize`.
#[derive(Debug, Clone, Serialize)]
pub struct AnthropicCountTokensResponse {
/// Number of input tokens counted for the request.
pub input_tokens: u32,
}
impl AnthropicCountTokensRequest {
/// Estimate input token count using a `len/3` heuristic.
pub fn estimate_tokens(&self) -> u32 {
let mut total_len: usize = 0;
if let Some(system) = &self.system {
total_len += system.text.len();
}
for msg in &self.messages {
// Count role
total_len += match msg.role {
AnthropicRole::User => 4,
AnthropicRole::Assistant => 9,
};
// Count content
match &msg.content {
AnthropicMessageContent::Text { content } => total_len += content.len(),
AnthropicMessageContent::Blocks { content } => {
for block in content {
total_len += estimate_block_len(block);
}
}
}
}
if let Some(tools) = &self.tools {
for tool in tools {
total_len += tool.name.len();
if let Some(desc) = &tool.description {
total_len += desc.len();
}
if let Some(schema) = &tool.input_schema {
total_len += schema.to_string().len();
}
}
}
let tokens = total_len / 3;
if tokens == 0 && total_len > 0 {
1
} else {
tokens as u32
}
}
}
fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
match block {
AnthropicContentBlock::Text { text, .. } => text.len(),
AnthropicContentBlock::ToolUse { name, input, .. } => name.len() + input.to_string().len(),
AnthropicContentBlock::ToolResult { content, .. } => content
.as_ref()
.map(|c| match c {
ToolResultContent::Text(s) => s.len(),
ToolResultContent::Blocks(blocks) => blocks
.iter()
.map(|b| match b {
ToolResultContentBlock::Text { text } => text.len(),
ToolResultContentBlock::Other(v) => v.to_string().len(),
})
.sum(),
})
.unwrap_or(0),
AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),
AnthropicContentBlock::ServerToolUse { name, input, .. } => {
name.len() + input.to_string().len()
}
AnthropicContentBlock::WebSearchToolResult { content, .. } => content.to_string().len(),
AnthropicContentBlock::Image { .. } => 256, // rough estimate for image metadata
AnthropicContentBlock::Other(v) => v.to_string().len(),
}
}
......@@ -1182,10 +1182,6 @@ pub struct CreateChatCompletionResponse {
/// The object type, which is always `chat.completion`.
pub object: String,
pub usage: Option<CompletionUsage>,
/// NVIDIA extension field for response metadata (worker IDs, etc.)
#[serde(skip_serializing_if = "Option::is_none")]
pub nvext: Option<serde_json::Value>,
}
/// Parsed server side events stream until an \[DONE\] is received from server.
......@@ -1281,10 +1277,6 @@ pub struct CreateChatCompletionStreamResponse {
/// An optional field that will only be present when you set `stream_options: {"include_usage": true}` in your request.
/// When present, it contains a null value except for the last chunk which contains the token usage statistics for the entire request.
pub usage: Option<CompletionUsage>,
/// NVIDIA extension field for response metadata
#[serde(skip_serializing_if = "Option::is_none")]
pub nvext: Option<serde_json::Value>,
}
#[cfg(test)]
......
......@@ -224,10 +224,6 @@ pub struct CreateCompletionResponse {
/// The object type, which is always "text_completion"
pub object: String,
pub usage: Option<CompletionUsage>,
/// NVIDIA extension field for response metadata (worker IDs, etc.)
#[serde(skip_serializing_if = "Option::is_none")]
pub nvext: Option<serde_json::Value>,
}
/// Parsed server side events stream until an \[DONE\] is received from server.
......
......@@ -10,6 +10,7 @@
//! Types used in OpenAI API requests and responses.
//! These types are created from component schemas in the [OpenAPI spec](https://github.com/openai/openai-openapi)
pub mod anthropic;
mod assistant;
mod assistant_impls;
mod assistant_stream;
......
......@@ -90,14 +90,16 @@ where
tracing::warn!("audit: aggregation future canceled/failed");
// Return minimal response if aggregation failed
NvCreateChatCompletionResponse {
id: String::new(),
created: 0,
usage: None,
model: String::new(),
object: "chat.completion".to_string(),
system_fingerprint: None,
choices: vec![],
service_tier: None,
inner: dynamo_async_openai::types::CreateChatCompletionResponse {
id: String::new(),
created: 0,
usage: None,
model: String::new(),
object: "chat.completion".to_string(),
system_fingerprint: None,
choices: vec![],
service_tier: None,
},
nvext: None,
}
})
......@@ -125,14 +127,16 @@ where
Err(e) => {
tracing::warn!("fold aggregation failed: {e}");
let fallback = NvCreateChatCompletionResponse {
id: String::new(),
created: 0,
usage: None,
model: String::new(),
object: "chat.completion".to_string(),
system_fingerprint: None,
choices: vec![],
service_tier: None,
inner: dynamo_async_openai::types::CreateChatCompletionResponse {
id: String::new(),
created: 0,
usage: None,
model: String::new(),
object: "chat.completion".to_string(),
system_fingerprint: None,
choices: vec![],
service_tier: None,
},
nvext: None,
};
let _ = tx.send(fallback.clone());
......@@ -145,14 +149,16 @@ where
rx.await.unwrap_or_else(|_| {
tracing::warn!("fold aggregation future canceled");
NvCreateChatCompletionResponse {
id: String::new(),
created: 0,
usage: None,
model: String::new(),
object: "chat.completion".to_string(),
system_fingerprint: None,
choices: vec![],
service_tier: None,
inner: dynamo_async_openai::types::CreateChatCompletionResponse {
id: String::new(),
created: 0,
usage: None,
model: String::new(),
object: "chat.completion".to_string(),
system_fingerprint: None,
choices: vec![],
service_tier: None,
},
nvext: None,
}
})
......@@ -171,8 +177,8 @@ pub fn final_response_to_one_chunk_stream(
) -> std::pin::Pin<
Box<dyn futures::Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>> + Send>,
> {
let mut choices: Vec<ChatChoiceStream> = Vec::with_capacity(resp.choices.len());
for (idx, ch) in resp.choices.iter().enumerate() {
let mut choices: Vec<ChatChoiceStream> = Vec::with_capacity(resp.inner.choices.len());
for (idx, ch) in resp.inner.choices.iter().enumerate() {
// Convert FunctionCall to FunctionCallStream if present
#[allow(deprecated)]
let function_call = ch.message.function_call.as_ref().map(|fc| {
......@@ -222,14 +228,16 @@ pub fn final_response_to_one_chunk_stream(
}
let chunk = NvCreateChatCompletionStreamResponse {
id: resp.id.clone(),
object: "chat.completion.chunk".to_string(),
created: resp.created,
model: resp.model.clone(),
system_fingerprint: resp.system_fingerprint.clone(),
service_tier: resp.service_tier.clone(),
choices,
usage: resp.usage.clone(),
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: resp.inner.id.clone(),
object: "chat.completion.chunk".to_string(),
created: resp.inner.created,
model: resp.inner.model.clone(),
system_fingerprint: resp.inner.system_fingerprint.clone(),
service_tier: resp.inner.service_tier.clone(),
choices,
usage: resp.inner.usage.clone(),
},
nvext: resp.nvext.clone(),
};
......@@ -275,14 +283,16 @@ mod tests {
};
let response = NvCreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![choice],
created: 1234567890,
model: "test-model".to_string(),
system_fingerprint: Some("test-fingerprint".to_string()),
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![choice],
created: 1234567890,
model: "test-model".to_string(),
system_fingerprint: Some("test-fingerprint".to_string()),
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
},
nvext: None,
};
......@@ -314,14 +324,16 @@ mod tests {
};
let response = NvCreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![choice],
created: 1234567890,
model: "test-model".to_string(),
system_fingerprint: Some("test-fingerprint".to_string()),
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![choice],
created: 1234567890,
model: "test-model".to_string(),
system_fingerprint: Some("test-fingerprint".to_string()),
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
},
nvext: None,
};
......@@ -339,7 +351,7 @@ mod tests {
chunk
.data
.as_ref()
.and_then(|d| d.choices.first())
.and_then(|d| d.inner.choices.first())
.and_then(|c| c.delta.content.as_ref())
.and_then(|content| match content {
ChatCompletionMessageContent::Text(text) => Some(text.clone()),
......@@ -396,7 +408,7 @@ mod tests {
assert_eq!(results.len(), 0, "Empty stream should produce no chunks");
// Verify fallback response (aggregation will fail on empty stream)
assert_eq!(final_resp.object, "chat.completion");
assert_eq!(final_resp.inner.object, "chat.completion");
// Should get fallback response, not panic
}
......@@ -415,7 +427,7 @@ mod tests {
assert_eq!(extract_content(&results[0]), "Single chunk");
// Verify aggregation
assert_eq!(final_resp.object, "chat.completion");
assert_eq!(final_resp.inner.object, "chat.completion");
}
#[tokio::test]
......@@ -423,32 +435,34 @@ mod tests {
// Test that metadata (id, event, comment) is preserved through passthrough
let chunk_with_metadata = Annotated {
data: Some(NvCreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![{
#[allow(deprecated)]
ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
role: Some(Role::Assistant),
content: Some(ChatCompletionMessageContent::Text(
"Content".to_string(),
)),
tool_calls: None,
function_call: None,
refusal: None,
reasoning_content: None,
},
finish_reason: None,
stop_reason: None,
logprobs: None,
}
}],
created: 1234567890,
model: "test-model".to_string(),
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![{
#[allow(deprecated)]
ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
role: Some(Role::Assistant),
content: Some(ChatCompletionMessageContent::Text(
"Content".to_string(),
)),
tool_calls: None,
function_call: None,
refusal: None,
reasoning_content: None,
},
finish_reason: None,
stop_reason: None,
logprobs: None,
}
}],
created: 1234567890,
model: "test-model".to_string(),
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
},
nvext: None,
}),
id: Some("correlation-123".to_string()),
......@@ -481,7 +495,7 @@ mod tests {
let (resp1, resp2) = tokio::join!(future1, future2);
// Both should complete successfully
assert_eq!(resp1.object, "chat.completion");
assert_eq!(resp2.object, "chat.completion");
assert_eq!(resp1.inner.object, "chat.completion");
assert_eq!(resp2.inner.object, "chat.completion");
}
}
......@@ -238,8 +238,9 @@ async fn evaluate(
match (item.data.as_ref(), item.event.as_deref()) {
(Some(data), _) => {
// Normal case
let choice = data.choices.first();
let chat_comp = choice.as_ref().unwrap();
let Some(chat_comp) = data.inner.choices.first() else {
continue;
};
if let Some(c) = &chat_comp.delta.content {
match c {
ChatCompletionMessageContent::Text(text) => {
......
......@@ -138,8 +138,9 @@ async fn main_loop(
match (item.data.as_ref(), item.event.as_deref()) {
(Some(data), _) => {
// Normal case
let entry = data.choices.first();
let chat_comp = entry.as_ref().unwrap();
let Some(chat_comp) = data.inner.choices.first() else {
continue;
};
if let Some(c) = &chat_comp.delta.content {
match c {
ChatCompletionMessageContent::Text(text) => {
......
......@@ -991,7 +991,7 @@ fn streaming_tool_dispatch_events(
};
let mut events = vec![];
for choice in &data.choices {
for choice in &data.inner.choices {
let Some(tool_calls) = &choice.delta.tool_calls else {
continue;
};
......@@ -1034,7 +1034,7 @@ fn accumulate_reasoning_dispatch(
};
let mut events = vec![];
for choice in &data.choices {
for choice in &data.inner.choices {
let buffer = buffers.entry(choice.index).or_default();
let has_reasoning = choice
.delta
......@@ -2892,15 +2892,17 @@ mod tests {
// Create a normal data event
let normal_event = Annotated::<NvCreateChatCompletionStreamResponse> {
data: Some(CreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![],
created: 0,
model: "test-model".to_string(),
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
service_tier: None,
usage: None,
data: Some(NvCreateChatCompletionStreamResponse {
inner: CreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices: vec![],
created: 0,
model: "test-model".to_string(),
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
service_tier: None,
usage: None,
},
nvext: None,
}),
id: Some("msg-1".to_string()),
......@@ -3162,15 +3164,17 @@ mod tests {
fn make_stream_response(
choices: Vec<ChatChoiceStream>,
) -> Annotated<NvCreateChatCompletionStreamResponse> {
let response = CreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices,
created: 0,
model: "test-model".to_string(),
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
let response = NvCreateChatCompletionStreamResponse {
inner: CreateChatCompletionStreamResponse {
id: "test-id".to_string(),
choices,
created: 0,
model: "test-model".to_string(),
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
service_tier: None,
},
nvext: None,
};
Annotated {
......
......@@ -128,7 +128,7 @@ impl LogprobExtractor for NvCreateChatCompletionStreamResponse {
fn extract_logprobs_by_choice(&self) -> HashMap<u32, Vec<TokenLogProbs>> {
let mut result = HashMap::new();
for choice in &self.choices {
for choice in &self.inner.choices {
let choice_index = choice.index;
let choice_logprobs = choice
......@@ -949,34 +949,36 @@ mod tests {
) -> NvCreateChatCompletionStreamResponse {
#[expect(deprecated)]
NvCreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices: vec![ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
content: Some(
dynamo_async_openai::types::ChatCompletionMessageContent::Text(
"test".to_string(),
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices: vec![ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
content: Some(
dynamo_async_openai::types::ChatCompletionMessageContent::Text(
"test".to_string(),
),
),
),
function_call: None,
tool_calls: None,
role: Some(Role::Assistant),
refusal: None,
reasoning_content: None,
},
finish_reason: Some(FinishReason::Stop),
stop_reason: None,
logprobs: Some(ChatChoiceLogprobs {
content: Some(token_logprobs),
refusal: None,
}),
}],
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
function_call: None,
tool_calls: None,
role: Some(Role::Assistant),
refusal: None,
reasoning_content: None,
},
finish_reason: Some(FinishReason::Stop),
stop_reason: None,
logprobs: Some(ChatChoiceLogprobs {
content: Some(token_logprobs),
refusal: None,
}),
}],
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
},
nvext: None,
}
}
......@@ -1012,14 +1014,16 @@ mod tests {
.collect();
NvCreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices,
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices,
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
},
nvext: None,
}
}
......@@ -1341,31 +1345,33 @@ mod tests {
// Test with choice that has no logprobs
#[expect(deprecated)]
let response = NvCreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices: vec![ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
content: Some(
dynamo_async_openai::types::ChatCompletionMessageContent::Text(
"test".to_string(),
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices: vec![ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
content: Some(
dynamo_async_openai::types::ChatCompletionMessageContent::Text(
"test".to_string(),
),
),
),
function_call: None,
tool_calls: None,
role: Some(Role::Assistant),
refusal: None,
reasoning_content: None,
},
finish_reason: Some(FinishReason::Stop),
stop_reason: None,
logprobs: None, // No logprobs
}],
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
function_call: None,
tool_calls: None,
role: Some(Role::Assistant),
refusal: None,
reasoning_content: None,
},
finish_reason: Some(FinishReason::Stop),
stop_reason: None,
logprobs: None, // No logprobs
}],
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
},
nvext: None,
};
......@@ -1573,14 +1579,16 @@ mod tests {
// In practice, this would have real logprobs data
NvCreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices: vec![],
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test_id".to_string(),
choices: vec![],
created: 1234567890,
model: "test-model".to_string(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".to_string(),
usage: None,
},
nvext: None,
}
}
......
......@@ -1217,7 +1217,7 @@ impl OpenAIPreprocessor {
let processed_response = if let Some(ref mut parser) = state.reasoning_parser {
response.map_data(|mut data| {
// Process all choices, not just the first one
for choice in data.choices.iter_mut() {
for choice in data.inner.choices.iter_mut() {
// Reasoning parsing only applies to text content
if let Some(
dynamo_async_openai::types::ChatCompletionMessageContent::Text(
......
......@@ -111,7 +111,7 @@ pub fn maybe_wrap_stream(
let mut prefill_tx = Some(tx);
Box::pin(stream.map(move |item| {
if let Some(ref resp) = item.data {
for choice in &resp.choices {
for choice in &resp.inner.choices {
if let Some(ChatCompletionMessageContent::Text(ref text)) = choice.delta.content {
accumulated_text.push_str(text);
}
......
......@@ -106,7 +106,7 @@ impl AnthropicStreamConverter {
let mut events = Vec::new();
// Capture real token usage from engine when available (typically on the final chunk).
if let Some(usage) = &chunk.usage {
if let Some(usage) = &chunk.inner.usage {
self.input_token_count = usage.prompt_tokens;
self.output_token_count = usage.completion_tokens;
self.cached_token_count = usage
......@@ -115,7 +115,7 @@ impl AnthropicStreamConverter {
.and_then(|d| d.cached_tokens);
}
for choice in &chunk.choices {
for choice in &chunk.inner.choices {
let delta = &choice.delta;
// Track finish reason
......@@ -444,7 +444,7 @@ impl AnthropicStreamConverter {
) -> Vec<TaggedEvent> {
let mut events = Vec::new();
if let Some(usage) = &chunk.usage {
if let Some(usage) = &chunk.inner.usage {
self.input_token_count = usage.prompt_tokens;
self.output_token_count = usage.completion_tokens;
self.cached_token_count = usage
......@@ -453,7 +453,7 @@ impl AnthropicStreamConverter {
.and_then(|d| d.cached_tokens);
}
for choice in &chunk.choices {
for choice in &chunk.inner.choices {
let delta = &choice.delta;
if let Some(ref fr) = choice.finish_reason {
......@@ -722,27 +722,29 @@ mod tests {
/// Test helper: builds a streaming chunk with a single choice (index 0) whose
/// delta carries `text` as plain text content; every other delta field is empty.
///
/// The OpenAI-shaped payload lives under `inner`; `nvext` is left unset.
fn text_chunk(text: &str) -> NvCreateChatCompletionStreamResponse {
    // NOTE(review): presumably required because the literal names the deprecated
    // `function_call` field — confirm.
    #[allow(deprecated)]
    NvCreateChatCompletionStreamResponse {
        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
            id: "chat-1".into(),
            choices: vec![ChatChoiceStream {
                index: 0,
                delta: ChatCompletionStreamResponseDelta {
                    content: Some(ChatCompletionMessageContent::Text(text.into())),
                    function_call: None,
                    tool_calls: None,
                    role: None,
                    refusal: None,
                    reasoning_content: None,
                },
                finish_reason: None,
                stop_reason: None,
                logprobs: None,
            }],
            created: 0,
            model: "test".into(),
            service_tier: None,
            system_fingerprint: None,
            object: "chat.completion.chunk".into(),
            usage: None,
        },
        nvext: None,
    }
}
......@@ -755,35 +757,37 @@ mod tests {
) -> NvCreateChatCompletionStreamResponse {
#[allow(deprecated)]
NvCreateChatCompletionStreamResponse {
id: "chat-1".into(),
choices: vec![ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
content: None,
function_call: None,
tool_calls: Some(vec![ChatCompletionMessageToolCallChunk {
index: tc_index,
id: id.map(String::from),
r#type: Some(ChatCompletionToolType::Function),
function: Some(FunctionCallStream {
name: name.map(String::from),
arguments: args.map(String::from),
}),
}]),
role: None,
refusal: None,
reasoning_content: None,
},
finish_reason: None,
stop_reason: None,
logprobs: None,
}],
created: 0,
model: "test".into(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".into(),
usage: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "chat-1".into(),
choices: vec![ChatChoiceStream {
index: 0,
delta: ChatCompletionStreamResponseDelta {
content: None,
function_call: None,
tool_calls: Some(vec![ChatCompletionMessageToolCallChunk {
index: tc_index,
id: id.map(String::from),
r#type: Some(ChatCompletionToolType::Function),
function: Some(FunctionCallStream {
name: name.map(String::from),
arguments: args.map(String::from),
}),
}]),
role: None,
refusal: None,
reasoning_content: None,
},
finish_reason: None,
stop_reason: None,
logprobs: None,
}],
created: 0,
model: "test".into(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion.chunk".into(),
usage: None,
},
nvext: None,
}
}
......@@ -908,27 +912,29 @@ mod tests {
/// Test helper: builds a streaming chunk with a single choice (index 0) whose
/// delta carries `text` in `reasoning_content` and has no regular content.
///
/// The OpenAI-shaped payload lives under `inner`; `nvext` is left unset.
fn reasoning_chunk(text: &str) -> NvCreateChatCompletionStreamResponse {
    // NOTE(review): presumably required because the literal names the deprecated
    // `function_call` field — confirm.
    #[allow(deprecated)]
    NvCreateChatCompletionStreamResponse {
        inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
            id: "chat-1".into(),
            choices: vec![ChatChoiceStream {
                index: 0,
                delta: ChatCompletionStreamResponseDelta {
                    content: None,
                    function_call: None,
                    tool_calls: None,
                    role: None,
                    refusal: None,
                    reasoning_content: Some(text.into()),
                },
                finish_reason: None,
                stop_reason: None,
                logprobs: None,
            }],
            created: 0,
            model: "test".into(),
            service_tier: None,
            system_fingerprint: None,
            object: "chat.completion.chunk".into(),
            usage: None,
        },
        nvext: None,
    }
}
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Anthropic Messages API types and conversion logic.
//! Anthropic Messages API conversion logic.
//!
//! All request/response types for the `/v1/messages` endpoint, plus
//! bidirectional conversion to/from the internal chat completions format.
//! Pure protocol types live in `dynamo_async_openai::types::anthropic`.
//! This module provides bidirectional conversion to/from the internal
//! chat completions format used by the Dynamo engine.
// Re-export all pure Anthropic protocol types so existing `use crate::protocols::anthropic::*`
// continues to work throughout dynamo-llm.
pub use dynamo_async_openai::types::anthropic::*;
use dynamo_async_openai::types::{
ChatCompletionMessageToolCall, ChatCompletionNamedToolChoice,
......@@ -17,764 +22,13 @@ use dynamo_async_openai::types::{
ChatCompletionTool, ChatCompletionToolChoiceOption, ChatCompletionToolType, FunctionName,
FunctionObject, ImageUrl, ReasoningContent,
};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
};
use crate::protocols::openai::common_ext::CommonExt;
use crate::protocols::openai::nvext::{CacheControl, NvExt};
// ---------------------------------------------------------------------------
// Custom deserializers
// ---------------------------------------------------------------------------
/// System prompt after normalization to a single string.
///
/// Produced by `deserialize_system_prompt` from either a plain string or an
/// array of text blocks; a `cache_control` hint found on the blocks is kept.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct SystemContent {
    /// Complete prompt text: the plain string, or every block's text joined with `\n`.
    pub text: String,
    /// Hint taken from the last system block that carried one; omitted from
    /// serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// Serde `deserialize_with` helper for the request's `system` field.
///
/// Two wire shapes are accepted:
/// - a plain string: `"system": "text"`
/// - an array of blocks: `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`
///
/// Returns `Ok(None)` for a null value. For the array form, block texts are
/// joined with `\n` and the `cache_control` of the last block that has one wins.
fn deserialize_system_prompt<'de, D>(deserializer: D) -> Result<Option<SystemContent>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    #[derive(Deserialize)]
    struct SystemBlock {
        text: String,
        #[serde(default)]
        cache_control: Option<CacheControl>,
    }

    #[derive(Deserialize)]
    #[serde(untagged)]
    enum SystemPrompt {
        Text(String),
        Blocks(Vec<SystemBlock>),
    }

    let Some(prompt) = Option::<SystemPrompt>::deserialize(deserializer)? else {
        return Ok(None);
    };

    let content = match prompt {
        SystemPrompt::Text(text) => SystemContent {
            text,
            cache_control: None,
        },
        SystemPrompt::Blocks(blocks) => {
            // Single forward pass: later hints overwrite earlier ones, so the
            // surviving value belongs to the last block that carried one.
            let mut cache_control = None;
            let mut pieces = Vec::with_capacity(blocks.len());
            for block in blocks {
                if block.cache_control.is_some() {
                    cache_control = block.cache_control;
                }
                pieces.push(block.text);
            }
            SystemContent {
                text: pieces.join("\n"),
                cache_control,
            }
        }
    };

    Ok(Some(content))
}
// ---------------------------------------------------------------------------
// Request types
// ---------------------------------------------------------------------------
/// Request body accepted by `POST /v1/messages`.
///
/// Optional fields are omitted from serialized output when unset.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct AnthropicCreateMessageRequest {
    /// Model identifier (e.g. "claude-sonnet-4-20250514").
    pub model: String,

    /// Upper bound on the number of generated tokens.
    pub max_tokens: u32,

    /// Conversation turns, in order.
    pub messages: Vec<AnthropicMessage>,

    /// System prompt. On the wire this is either a string or an array of
    /// `{"type":"text","text":"..."}` blocks; both are normalized by
    /// `deserialize_system_prompt`.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,

    /// Sampling temperature (0.0 - 1.0).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// Nucleus (top-p) sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,

    /// Top-K sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_k: Option<u32>,

    /// Caller-supplied stop sequences.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequences: Option<Vec<String>>,

    /// Request a streamed response; defaults to `false` when absent.
    #[serde(default)]
    pub stream: bool,

    /// Free-form request metadata (e.g. user_id).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,

    /// Tool definitions the model is allowed to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<AnthropicTool>>,

    /// Policy for selecting which tool (if any) the model calls.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<AnthropicToolChoice>,

    /// Request-level cache control enabling automatic prompt prefix caching:
    /// when present, everything up to the last cacheable block is cached.
    /// Mirrors the automatic caching mode of the Anthropic Messages API.
    /// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,

    /// Extended thinking settings. When enabled, the model emits `thinking`
    /// content blocks with its internal reasoning ahead of the final answer.
    /// `budget_tokens` caps the tokens spent on thinking (must be >= 1024 and
    /// less than `max_tokens`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingConfig>,

    /// Service tier: `"auto"` or `"standard_only"`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub service_tier: Option<String>,

    /// Identifier of the container used for stateful sandbox sessions.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub container: Option<String>,

    /// Output settings, kept as raw JSON.
    /// `effort` may be `"low"`, `"medium"`, `"high"`, or `"max"`;
    /// `format` describes structured JSON output constraints.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_config: Option<serde_json::Value>,
}
/// Request-side switch for extended thinking.
///
/// With `type` set to `"enabled"` the model produces `thinking` content blocks
/// holding its internal reasoning, bounded by `budget_tokens` (minimum 1024,
/// and below `max_tokens`). With `"disabled"` no thinking blocks are produced.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct ThinkingConfig {
    /// `"enabled"` or `"disabled"`; carried on the wire as `type`.
    #[serde(rename = "type")]
    pub thinking_type: String,

    /// Token ceiling for internal reasoning; only meaningful when enabled.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub budget_tokens: Option<u32>,
}
/// One turn of the conversation.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct AnthropicMessage {
    /// Who authored the turn.
    pub role: AnthropicRole,

    /// Message payload. Flattened, so the variant's own `content` field sits
    /// directly next to `role` in the JSON object.
    #[serde(flatten)]
    pub content: AnthropicMessageContent,
}
/// Author of a message; serialized in lowercase (`"user"` / `"assistant"`).
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicRole {
    User,
    Assistant,
}
/// Payload of a message: a bare string or a list of structured blocks.
///
/// Untagged: variants are tried in declaration order, so a string `content`
/// matches `Text` before the array form is attempted.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(untagged)]
pub enum AnthropicMessageContent {
    /// `content` given as plain text.
    Text { content: String },
    /// `content` given as an array of content blocks.
    Blocks { content: Vec<AnthropicContentBlock> },
}
/// One content block inside a message.
///
/// Serialization is derived (internally tagged by `type`); deserialization is
/// hand-written so that a block whose `type` is not modelled here is captured
/// as `Other(Value)` rather than failing the whole request. This matters
/// because Claude Code may send block types that we don't yet handle.
#[derive(Clone, Debug, Serialize)]
#[serde(tag = "type")]
pub enum AnthropicContentBlock {
    /// Plain text. May carry `citations` — references to source documents that
    /// support the text, generated by the model when document/PDF content is
    /// supplied and citation mode is enabled.
    #[serde(rename = "text")]
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Image input.
    #[serde(rename = "image")]
    Image { source: AnthropicImageSource },

    /// Tool invocation requested by the assistant.
    #[serde(rename = "tool_use")]
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Tool output supplied by the user.
    #[serde(rename = "tool_result")]
    ToolResult {
        tool_use_id: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        content: Option<ToolResultContent>,
        #[serde(skip_serializing_if = "Option::is_none")]
        is_error: Option<bool>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Assistant reasoning produced by extended thinking.
    #[serde(rename = "thinking")]
    Thinking {
        thinking: String,
        signature: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Redacted assistant reasoning. The encrypted payload is opaque to the
    /// client but has to be sent back verbatim in multi-turn conversations so
    /// the model can keep its chain of thought.
    #[serde(rename = "redacted_thinking")]
    RedactedThinking { data: String },

    /// Tool call executed server-side by the API (e.g., web search). The
    /// outcome arrives in a matching `web_search_tool_result` or similar block.
    #[serde(rename = "server_tool_use")]
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },

    /// Structured output of a server-side tool (e.g., web search results).
    #[serde(rename = "web_search_tool_result")]
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Fallback for block types not modelled above. Keeps the complete JSON
    /// value so new Anthropic features neither break the endpoint nor get
    /// lost; they can be round-tripped or inspected.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Body of a `tool_result` block. The Anthropic API accepts either a bare
/// string or an array of content blocks, so both shapes are modelled.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(untagged)]
pub enum ToolResultContent {
    /// Result given as a plain string.
    Text(String),
    /// Result given as an array of blocks.
    Blocks(Vec<ToolResultContentBlock>),
}
impl ToolResultContent {
    /// Consumes the content and returns its text.
    ///
    /// A plain string is returned unchanged. For the block form, the text of
    /// every `Text` block is concatenated in order with no separator;
    /// non-text blocks are dropped.
    pub fn into_text(self) -> String {
        let blocks = match self {
            Self::Text(plain) => return plain,
            Self::Blocks(blocks) => blocks,
        };

        let mut joined = String::new();
        for block in blocks {
            if let ToolResultContentBlock::Text { text } = block {
                joined.push_str(&text);
            }
        }
        joined
    }
}
/// One element of a `tool_result.content` array.
///
/// Untagged: an object with a string `text` field matches `Text`; anything
/// else falls through to `Other`.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(untagged)]
pub enum ToolResultContentBlock {
    /// Block carrying text.
    Text {
        text: String,
    },
    /// Fallback for non-text blocks (images, etc.), kept as raw JSON.
    Other(serde_json::Value),
}
/// Custom deserializer for `AnthropicContentBlock` that handles unknown types
/// gracefully. Since serde's `#[serde(other)]` is not supported on internally
/// tagged enums, we deserialize as `Value` first and dispatch manually.
impl<'de> Deserialize<'de> for AnthropicContentBlock {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let value = serde_json::Value::deserialize(deserializer)?;
let block_type = value
.get("type")
.and_then(|t| t.as_str())
.unwrap_or("")
.to_string();
match block_type.as_str() {
"text" => {
let text = value
.get("text")
.and_then(|t| t.as_str())
.ok_or_else(|| serde::de::Error::missing_field("text"))?
.to_string();
let citations: Option<Vec<serde_json::Value>> = value
.get("citations")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Text {
text,
citations,
cache_control,
})
}
"image" => {
let source: AnthropicImageSource =
serde_json::from_value(value.get("source").cloned().unwrap_or_default())
.map_err(serde::de::Error::custom)?;
Ok(AnthropicContentBlock::Image { source })
}
"tool_use" => {
let id = value
.get("id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string();
let name = value
.get("name")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolUse {
id,
name,
input,
cache_control,
})
}
"tool_result" => {
let tool_use_id = value
.get("tool_use_id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string();
let content: Option<ToolResultContent> = value
.get("content")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
let is_error = value.get("is_error").and_then(|v| v.as_bool());
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::ToolResult {
tool_use_id,
content,
is_error,
cache_control,
})
}
"thinking" => {
let thinking = value
.get("thinking")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("thinking"))?
.to_string();
let signature = value
.get("signature")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("signature"))?
.to_string();
let cache_control: Option<CacheControl> = value
.get("cache_control")
.cloned()
.and_then(|v| serde_json::from_value(v).ok());
Ok(AnthropicContentBlock::Thinking {
thinking,
signature,
cache_control,
})
}
"redacted_thinking" => {
let data = value
.get("data")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("data"))?
.to_string();
Ok(AnthropicContentBlock::RedactedThinking { data })
}
"server_tool_use" => {
let id = value
.get("id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("id"))?
.to_string();
let name = value
.get("name")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("name"))?
.to_string();
let input = value.get("input").cloned().unwrap_or(serde_json::json!({}));
Ok(AnthropicContentBlock::ServerToolUse { id, name, input })
}
"web_search_tool_result" => {
let tool_use_id = value
.get("tool_use_id")
.and_then(|v| v.as_str())
.ok_or_else(|| serde::de::Error::missing_field("tool_use_id"))?
.to_string();
let content = value
.get("content")
.cloned()
.unwrap_or(serde_json::json!([]));
Ok(AnthropicContentBlock::WebSearchToolResult {
tool_use_id,
content,
})
}
other => {
tracing::debug!(
"Unrecognized Anthropic content block type '{}', preserving as Other",
other
);
Ok(AnthropicContentBlock::Other(value))
}
}
}
}
/// Where the bytes of an image content block come from.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct AnthropicImageSource {
    /// Source kind; carried on the wire as `type`.
    #[serde(rename = "type")]
    pub source_type: String,
    /// Media type string of the image.
    pub media_type: String,
    /// Image payload as a string.
    pub data: String,
}
/// Definition of a tool offered to the model.
///
/// Client (custom) tools need `name` plus `input_schema`. Server tools
/// (web_search, bash, text_editor, code_execution, etc.) are told apart by
/// their `type` (e.g. `"web_search_20260209"`) and may come without an
/// `input_schema`. Everything except `name` is therefore optional, so both
/// kinds deserialize and can be forwarded to the backend.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct AnthropicTool {
    /// Name of the tool (required for client tools, also present on server tools).
    pub name: String,

    /// Type discriminator, carried on the wire as `type`. Client tools use
    /// `"custom"` or leave it out; server tools use versioned identifiers
    /// such as `"web_search_20260209"`.
    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
    pub tool_type: Option<String>,

    /// Optional human-readable description.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// JSON Schema of the tool input. Required for client tools; absent on
    /// server tools, whose input shape is defined server-side.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_schema: Option<serde_json::Value>,

    /// Cache control breakpoint attached to this tool definition.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// Tool choice specification.
///
/// The enum is `untagged`: serde tries the variants in declaration order and
/// keeps the first one that deserializes, so the order of `Named` and
/// `Simple` below is significant.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicToolChoice {
/// Named tool: `{type: "tool", name: "..."}`
/// Must be listed before Simple so serde tries the stricter shape first.
/// (`Named` requires a `name` key; an object without one falls through
/// to `Simple`.)
Named(AnthropicToolChoiceNamed),
/// Simple mode: "auto", "any", or "none".
/// Note that `AnthropicToolChoiceMode` also contains `Tool`, so this
/// variant does not by itself reject `type: "tool"`.
Simple(AnthropicToolChoiceSimple),
}
/// Simple tool choice modes.
///
/// An object carrying only a mode (under the JSON key `type`) and an
/// optional parallel-tool-use switch; it has no tool name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceSimple {
/// Selected mode, carried under the JSON key `type`.
#[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
/// Treated as `None` when absent and omitted from output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
}
/// Mode discriminator used by both tool-choice shapes.
///
/// Variant names are (de)serialized in lowercase via `rename_all`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicToolChoiceMode {
/// Serialized as `"auto"`.
Auto,
/// Serialized as `"any"`.
Any,
/// Serialized as `"none"`.
None,
/// Serialized as `"tool"`.
Tool,
}
/// Named tool choice.
///
/// Same shape as the simple form plus a required `name` key.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceNamed {
/// Selected mode, carried under the JSON key `type`.
/// NOTE(review): expected to be `tool` for this shape, but any mode value
/// deserializes here -- nothing in this type enforces it.
#[serde(rename = "type")]
pub choice_type: AnthropicToolChoiceMode,
/// Name of the tool being selected (required).
pub name: String,
/// When true, the model will call tools one at a time instead of
/// potentially issuing multiple tool calls in a single response.
/// Treated as `None` when absent and omitted from output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disable_parallel_tool_use: Option<bool>,
}
// ---------------------------------------------------------------------------
// Response types
// ---------------------------------------------------------------------------
/// Response body for `POST /v1/messages` (non-streaming).
///
/// Also embedded in the streaming `message_start` event. None of the
/// `Option` fields carry `skip_serializing_if`, so a `None` is serialized
/// as JSON `null` rather than omitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageResponse {
/// Message identifier.
pub id: String,
/// Object type, carried under the JSON key `type`.
/// NOTE(review): presumably always `"message"` -- confirm against the producer.
#[serde(rename = "type")]
pub object_type: String,
/// Role of the message author, as a free-form string.
/// NOTE(review): presumably always `"assistant"` for responses -- confirm.
pub role: String,
/// Ordered content blocks making up the message.
pub content: Vec<AnthropicResponseContentBlock>,
/// Model name reported for this response.
pub model: String,
/// Why generation stopped; `None` serializes as `null`.
pub stop_reason: Option<AnthropicStopReason>,
/// Stop sequence value, if any; `None` serializes as `null`.
/// NOTE(review): presumably set only when `stop_reason` is `stop_sequence` -- confirm.
pub stop_sequence: Option<String>,
/// Token usage for this message.
pub usage: AnthropicUsage,
}
/// A content block in the response.
///
/// The Anthropic API returns up to 12 different block types. We model the
/// common ones explicitly and catch the rest as `Other` so the proxy can
/// forward them without losing data.
///
/// The enum is internally tagged on the JSON `type` field; only `Other` is
/// marked `untagged` and therefore is not selected by a tag value.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicResponseContentBlock {
/// `type: "thinking"` -- thinking text together with its signature string.
/// Both fields are required.
#[serde(rename = "thinking")]
Thinking { thinking: String, signature: String },
/// `type: "text"` -- text output. `citations` holds untyped JSON values,
/// is treated as `None` when absent, and is omitted from output when `None`.
#[serde(rename = "text")]
Text {
text: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
citations: Option<Vec<serde_json::Value>>,
},
/// `type: "tool_use"` -- a tool call with its id, tool name and untyped
/// JSON `input`. All three keys are required.
#[serde(rename = "tool_use")]
ToolUse {
id: String,
name: String,
input: serde_json::Value,
},
/// `type: "redacted_thinking"` -- opaque `data` string.
#[serde(rename = "redacted_thinking")]
RedactedThinking { data: String },
/// `type: "server_tool_use"` -- like `ToolUse`, but a missing `input`
/// falls back to the `Default` of `serde_json::Value` (JSON `null`).
#[serde(rename = "server_tool_use")]
ServerToolUse {
id: String,
name: String,
#[serde(default)]
input: serde_json::Value,
},
/// `type: "web_search_tool_result"` -- result attached to the tool call
/// identified by `tool_use_id`. A missing `content` falls back to the
/// `Default` of `serde_json::Value` (JSON `null`).
#[serde(rename = "web_search_tool_result")]
WebSearchToolResult {
tool_use_id: String,
#[serde(default)]
content: serde_json::Value,
},
/// Catch-all for new/uncommon block types (web_fetch_tool_result,
/// code_execution_tool_result, container_upload, etc.) so the proxy
/// can serialize them back without data loss.
#[serde(untagged)]
Other(serde_json::Value),
}
/// Token usage information.
///
/// Derives `Default`, which yields zero for both counters and `None` for
/// both cache fields.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AnthropicUsage {
/// Number of input tokens (required).
pub input_tokens: u32,
/// Number of output tokens (required).
pub output_tokens: u32,
/// Number of input tokens used to create a new cache entry.
/// Treated as `None` when absent and omitted from output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_creation_input_tokens: Option<u32>,
/// Number of input tokens read from the prompt cache (prefix cache hits).
/// Treated as `None` when absent and omitted from output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cache_read_input_tokens: Option<u32>,
}
/// Reason the model stopped generating.
///
/// Variant names are (de)serialized in `snake_case` via `rename_all`.
/// There is no catch-all variant, so an unlisted value fails to deserialize.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum AnthropicStopReason {
/// Serialized as `"end_turn"`.
EndTurn,
/// Serialized as `"max_tokens"`.
MaxTokens,
/// Serialized as `"stop_sequence"`.
StopSequence,
/// Serialized as `"tool_use"`.
ToolUse,
/// The model paused to yield control in an agentic loop, intending to
/// continue in a subsequent turn. Used with extended thinking / tool use.
/// Serialized as `"pause_turn"`.
PauseTurn,
/// The model refused to generate content (safety refusal).
/// Serialized as `"refusal"`.
Refusal,
}
// ---------------------------------------------------------------------------
// Streaming types
// ---------------------------------------------------------------------------
/// SSE event types for the Anthropic streaming API.
///
/// Internally tagged on the JSON `type` field; each variant's tag value is
/// given by its `rename` attribute.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicStreamEvent {
/// `message_start` -- carries a complete `AnthropicMessageResponse` object.
#[serde(rename = "message_start")]
MessageStart { message: AnthropicMessageResponse },
/// `content_block_start` -- announces a content block at `index`.
/// NOTE(review): `index` is presumably the position in the message's
/// `content` array -- confirm against the stream producer.
#[serde(rename = "content_block_start")]
ContentBlockStart {
index: u32,
content_block: AnthropicResponseContentBlock,
},
/// `content_block_delta` -- an incremental update for the block at `index`.
#[serde(rename = "content_block_delta")]
ContentBlockDelta { index: u32, delta: AnthropicDelta },
/// `content_block_stop` -- marks the block at `index` as finished.
#[serde(rename = "content_block_stop")]
ContentBlockStop { index: u32 },
/// `message_delta` -- stop information plus a usage record; both keys
/// are required.
#[serde(rename = "message_delta")]
MessageDelta {
delta: AnthropicMessageDeltaBody,
usage: AnthropicUsage,
},
/// `message_stop` -- carries no fields beyond the tag.
#[serde(rename = "message_stop")]
MessageStop {},
/// `ping` -- carries no fields beyond the tag.
#[serde(rename = "ping")]
Ping {},
/// `error` -- carries an error body (type + message).
#[serde(rename = "error")]
Error { error: AnthropicErrorBody },
}
/// Delta content in a streaming content_block_delta event.
///
/// Internally tagged on the JSON `type` field; each variant's tag value is
/// given by its `rename` attribute.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicDelta {
/// `thinking_delta` -- a fragment of thinking text.
#[serde(rename = "thinking_delta")]
ThinkingDelta { thinking: String },
/// `text_delta` -- a fragment of output text.
#[serde(rename = "text_delta")]
TextDelta { text: String },
/// `input_json_delta` -- a string fragment in `partial_json`.
/// NOTE(review): presumably a piece of a tool call's JSON `input` that
/// is only valid JSON once all fragments are concatenated -- confirm.
#[serde(rename = "input_json_delta")]
InputJsonDelta { partial_json: String },
/// Incremental signature for a thinking block (sent at the end).
#[serde(rename = "signature_delta")]
SignatureDelta { signature: String },
/// Incremental citation attached to a text block.
/// The citation is kept as an untyped JSON value.
#[serde(rename = "citations_delta")]
CitationsDelta { citation: serde_json::Value },
}
/// The delta body in a message_delta event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageDeltaBody {
/// Why generation stopped. Has no `skip_serializing_if`, so `None` is
/// written as JSON `null`.
pub stop_reason: Option<AnthropicStopReason>,
/// Stop sequence value, if any; omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub stop_sequence: Option<String>,
}
// ---------------------------------------------------------------------------
// Error types
// ---------------------------------------------------------------------------
/// Anthropic API error response wrapper.
///
/// The constructors on this type always set `object_type` to `"error"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorResponse {
/// Top-level object type, carried under the JSON key `type`.
#[serde(rename = "type")]
pub object_type: String,
/// The error details.
pub error: AnthropicErrorBody,
}
/// Error body within an error response.
///
/// Also used as the payload of the streaming `error` event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorBody {
/// Error category, carried under the JSON key `type`. The constructors
/// on `AnthropicErrorResponse` use `invalid_request_error`, `api_error`
/// and `not_found_error`; the field itself accepts any string.
#[serde(rename = "type")]
pub error_type: String,
/// Error message text.
pub message: String,
}
impl AnthropicErrorResponse {
/// Create an `invalid_request_error` response.
pub fn invalid_request(message: impl Into<String>) -> Self {
Self {
object_type: "error".to_string(),
error: AnthropicErrorBody {
error_type: "invalid_request_error".to_string(),
message: message.into(),
},
}
}
/// Create an `api_error` (internal server error) response.
pub fn api_error(message: impl Into<String>) -> Self {
Self {
object_type: "error".to_string(),
error: AnthropicErrorBody {
error_type: "api_error".to_string(),
message: message.into(),
},
}
}
/// Create a `not_found_error` response.
pub fn not_found(message: impl Into<String>) -> Self {
Self {
object_type: "error".to_string(),
error: AnthropicErrorBody {
error_type: "not_found_error".to_string(),
message: message.into(),
},
}
}
}
// ---------------------------------------------------------------------------
// Conversion: AnthropicCreateMessageRequest -> NvCreateChatCompletionRequest
// ---------------------------------------------------------------------------
use crate::protocols::openai::nvext::NvExt;
impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
type Error = anyhow::Error;
......@@ -1199,11 +453,6 @@ fn convert_anthropic_tool_choice(tc: &AnthropicToolChoice) -> ChatCompletionTool
}
}
}
// ---------------------------------------------------------------------------
// Conversion: NvCreateChatCompletionResponse -> AnthropicMessageResponse
// ---------------------------------------------------------------------------
/// Convert a completed chat completion response into an Anthropic Messages response.
pub fn chat_completion_to_anthropic_response(
chat_resp: NvCreateChatCompletionResponse,
......@@ -1211,7 +460,7 @@ pub fn chat_completion_to_anthropic_response(
) -> AnthropicMessageResponse {
let msg_id = format!("msg_{}", Uuid::new_v4().simple());
let choice = chat_resp.choices.into_iter().next();
let choice = chat_resp.inner.choices.into_iter().next();
let mut content = Vec::new();
let mut stop_reason = None;
......@@ -1282,6 +531,7 @@ pub fn chat_completion_to_anthropic_response(
// Map usage
let usage = chat_resp
.inner
.usage
.map(|u| {
let cache_read_input_tokens = u
......@@ -1308,111 +558,6 @@ pub fn chat_completion_to_anthropic_response(
usage,
}
}
// ---------------------------------------------------------------------------
// Count tokens
// ---------------------------------------------------------------------------
/// Request body for `POST /v1/messages/count_tokens`.
///
/// Deserialize-only: the type does not derive `Serialize`.
#[derive(Debug, Clone, Deserialize)]
pub struct AnthropicCountTokensRequest {
/// Model name (required).
pub model: String,
/// Conversation messages to count (required).
pub messages: Vec<AnthropicMessage>,
/// Optional system prompt, parsed by `deserialize_system_prompt`
/// (defined elsewhere in this file); `None` when the key is absent.
/// NOTE(review): `skip_serializing_if` looks inert here because this
/// type only derives `Deserialize` -- confirm before removing it.
#[serde(
default,
skip_serializing_if = "Option::is_none",
deserialize_with = "deserialize_system_prompt"
)]
pub system: Option<SystemContent>,
/// Optional tool definitions; `None` when the key is absent.
#[serde(default)]
pub tools: Option<Vec<AnthropicTool>>,
}
/// Response body for `POST /v1/messages/count_tokens`.
///
/// Serialize-only: the type does not derive `Deserialize`.
#[derive(Debug, Clone, Serialize)]
pub struct AnthropicCountTokensResponse {
/// Number of input tokens reported to the client.
pub input_tokens: u32,
}
impl AnthropicCountTokensRequest {
/// Estimate input token count using a `len/3` heuristic.
pub fn estimate_tokens(&self) -> u32 {
let mut total_len: usize = 0;
if let Some(system) = &self.system {
total_len += system.text.len();
}
for msg in &self.messages {
// Count role
total_len += match msg.role {
AnthropicRole::User => 4,
AnthropicRole::Assistant => 9,
};
// Count content
match &msg.content {
AnthropicMessageContent::Text { content } => total_len += content.len(),
AnthropicMessageContent::Blocks { content } => {
for block in content {
total_len += estimate_block_len(block);
}
}
}
}
if let Some(tools) = &self.tools {
for tool in tools {
total_len += tool.name.len();
if let Some(desc) = &tool.description {
total_len += desc.len();
}
if let Some(schema) = &tool.input_schema {
total_len += schema.to_string().len();
}
}
}
let tokens = total_len / 3;
if tokens == 0 && total_len > 0 {
1
} else {
tokens as u32
}
}
}
/// Approximate the size of one request content block, in bytes, for the
/// token-count heuristic.
///
/// Text-like blocks contribute the length of their string; blocks carrying
/// JSON contribute the length of the serialized JSON text; images
/// contribute a fixed 256.
fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
    match block {
        // Plain string payloads: measure the string itself.
        AnthropicContentBlock::Text { text, .. } => text.len(),
        AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
        AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),

        // Tool invocations (client or server side): tool name plus the
        // serialized JSON input.
        AnthropicContentBlock::ToolUse { name, input, .. }
        | AnthropicContentBlock::ServerToolUse { name, input, .. } => {
            let input_json = input.to_string();
            name.len() + input_json.len()
        }

        // Tool results: nothing when there is no content, otherwise either
        // the plain text or the sum over the nested result blocks.
        AnthropicContentBlock::ToolResult { content, .. } => {
            content.as_ref().map_or(0, |result| match result {
                ToolResultContent::Text(s) => s.len(),
                ToolResultContent::Blocks(blocks) => blocks
                    .iter()
                    .map(|inner| match inner {
                        ToolResultContentBlock::Text { text } => text.len(),
                        ToolResultContentBlock::Other(v) => v.to_string().len(),
                    })
                    .sum::<usize>(),
            })
        }

        // Untyped JSON payloads: measure the serialized form.
        AnthropicContentBlock::WebSearchToolResult { content, .. } => content.to_string().len(),
        AnthropicContentBlock::Other(v) => v.to_string().len(),

        // rough estimate for image metadata
        AnthropicContentBlock::Image { .. } => 256,
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
......@@ -1656,38 +801,40 @@ mod tests {
#[test]
fn test_chat_completion_to_anthropic_response() {
let chat_resp = NvCreateChatCompletionResponse {
id: "chatcmpl-xyz".into(),
choices: vec![dynamo_async_openai::types::ChatChoice {
index: 0,
message: dynamo_async_openai::types::ChatCompletionResponseMessage {
content: Some(
dynamo_async_openai::types::ChatCompletionMessageContent::Text(
"Hello!".to_string(),
inner: dynamo_async_openai::types::CreateChatCompletionResponse {
id: "chatcmpl-xyz".into(),
choices: vec![dynamo_async_openai::types::ChatChoice {
index: 0,
message: dynamo_async_openai::types::ChatCompletionResponseMessage {
content: Some(
dynamo_async_openai::types::ChatCompletionMessageContent::Text(
"Hello!".to_string(),
),
),
),
refusal: None,
tool_calls: None,
role: dynamo_async_openai::types::Role::Assistant,
function_call: None,
audio: None,
reasoning_content: None,
},
finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
stop_reason: None,
logprobs: None,
}],
created: 1726000000,
model: "test-model".into(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion".to_string(),
usage: Some(dynamo_async_openai::types::CompletionUsage {
prompt_tokens: 10,
completion_tokens: 5,
total_tokens: 15,
prompt_tokens_details: None,
completion_tokens_details: None,
}),
refusal: None,
tool_calls: None,
role: dynamo_async_openai::types::Role::Assistant,
function_call: None,
audio: None,
reasoning_content: None,
},
finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
stop_reason: None,
logprobs: None,
}],
created: 1726000000,
model: "test-model".into(),
service_tier: None,
system_fingerprint: None,
object: "chat.completion".to_string(),
usage: Some(dynamo_async_openai::types::CompletionUsage {
prompt_tokens: 10,
completion_tokens: 5,
total_tokens: 15,
prompt_tokens_details: None,
completion_tokens_details: None,
}),
},
nvext: None,
};
......
......@@ -64,21 +64,24 @@ pub struct NvCreateChatCompletionRequest {
}
/// A response structure for unary chat completion responses, embedding OpenAI's
/// `CreateChatCompletionResponse`.
///
/// # Fields
/// - `inner`: The base OpenAI unary chat completion response, embedded
/// using `serde(flatten)`.
pub type NvCreateChatCompletionResponse = dynamo_async_openai::types::CreateChatCompletionResponse;
/// `CreateChatCompletionResponse` with optional NVIDIA extension metadata.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct NvCreateChatCompletionResponse {
#[serde(flatten)]
pub inner: dynamo_async_openai::types::CreateChatCompletionResponse,
#[serde(skip_serializing_if = "Option::is_none")]
pub nvext: Option<serde_json::Value>,
}
/// A response structure for streamed chat completions, embedding OpenAI's
/// `CreateChatCompletionStreamResponse`.
///
/// # Fields
/// - `inner`: The base OpenAI streaming chat completion response, embedded
/// using `serde(flatten)`.
pub type NvCreateChatCompletionStreamResponse =
dynamo_async_openai::types::CreateChatCompletionStreamResponse;
/// `CreateChatCompletionStreamResponse` with optional NVIDIA extension metadata.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct NvCreateChatCompletionStreamResponse {
#[serde(flatten)]
pub inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse,
#[serde(skip_serializing_if = "Option::is_none")]
pub nvext: Option<serde_json::Value>,
}
/// Implements `NvExtProvider` for `NvCreateChatCompletionRequest`,
/// providing access to NVIDIA-specific extensions.
......
......@@ -136,16 +136,16 @@ impl DeltaAggregator {
if aggregator.error.is_none()
&& let Some(delta) = delta.data
{
aggregator.id = delta.id;
aggregator.model = delta.model;
aggregator.created = delta.created;
aggregator.service_tier = delta.service_tier;
aggregator.id = delta.inner.id;
aggregator.model = delta.inner.model;
aggregator.created = delta.inner.created;
aggregator.service_tier = delta.inner.service_tier;
// Aggregate usage statistics if available.
if let Some(usage) = delta.usage {
if let Some(usage) = delta.inner.usage {
aggregator.usage = Some(usage);
}
if let Some(system_fingerprint) = delta.system_fingerprint {
if let Some(system_fingerprint) = delta.inner.system_fingerprint {
aggregator.system_fingerprint = Some(system_fingerprint);
}
......@@ -155,7 +155,7 @@ impl DeltaAggregator {
}
// Aggregate choices incrementally.
for choice in delta.choices {
for choice in delta.inner.choices {
let state_choice =
aggregator
.choices
......@@ -267,14 +267,16 @@ impl DeltaAggregator {
// Construct the final response object.
let response = NvCreateChatCompletionResponse {
id: aggregator.id,
created: aggregator.created,
usage: aggregator.usage,
model: aggregator.model,
object: "chat.completion".to_string(),
system_fingerprint: aggregator.system_fingerprint,
choices,
service_tier: aggregator.service_tier,
inner: dynamo_async_openai::types::CreateChatCompletionResponse {
id: aggregator.id,
created: aggregator.created,
usage: aggregator.usage,
model: aggregator.model,
object: "chat.completion".to_string(),
system_fingerprint: aggregator.system_fingerprint,
choices,
service_tier: aggregator.service_tier,
},
nvext: aggregator.nvext,
};
......@@ -360,7 +362,7 @@ pub trait ChatCompletionAggregator {
) -> Result<NvCreateChatCompletionResponse, String>;
}
impl ChatCompletionAggregator for dynamo_async_openai::types::CreateChatCompletionResponse {
impl ChatCompletionAggregator for NvCreateChatCompletionResponse {
async fn from_annotated_stream(
stream: impl Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>>,
parsing_options: ParsingOptions,
......@@ -445,14 +447,16 @@ mod tests {
};
let data = NvCreateChatCompletionStreamResponse {
id: "test_id".to_string(),
model: "meta/llama-3.1-8b-instruct".to_string(),
created: 1234567890,
service_tier: None,
usage: None,
system_fingerprint: None,
choices: vec![choice],
object: "chat.completion".to_string(),
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test_id".to_string(),
model: "meta/llama-3.1-8b-instruct".to_string(),
created: 1234567890,
service_tier: None,
usage: None,
system_fingerprint: None,
choices: vec![choice],
object: "chat.completion".to_string(),
},
nvext: None,
};
......@@ -479,13 +483,13 @@ mod tests {
let response = result.unwrap();
// Verify that the response is empty and has default values
assert_eq!(response.id, "");
assert_eq!(response.model, "");
assert_eq!(response.created, 0);
assert!(response.usage.is_none());
assert!(response.system_fingerprint.is_none());
assert_eq!(response.choices.len(), 0);
assert!(response.service_tier.is_none());
assert_eq!(response.inner.id, "");
assert_eq!(response.inner.model, "");
assert_eq!(response.inner.created, 0);
assert!(response.inner.usage.is_none());
assert!(response.inner.system_fingerprint.is_none());
assert_eq!(response.inner.choices.len(), 0);
assert!(response.inner.service_tier.is_none());
}
#[tokio::test]
......@@ -511,13 +515,13 @@ mod tests {
let response = result.unwrap();
// Verify the response fields
assert_eq!(response.id, "test_id");
assert_eq!(response.model, "meta/llama-3.1-8b-instruct");
assert_eq!(response.created, 1234567890);
assert!(response.usage.is_none());
assert!(response.system_fingerprint.is_none());
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.id, "test_id");
assert_eq!(response.inner.model, "meta/llama-3.1-8b-instruct");
assert_eq!(response.inner.created, 1234567890);
assert!(response.inner.usage.is_none());
assert!(response.inner.system_fingerprint.is_none());
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
assert_eq!(choice.index, 0);
assert_eq!(
choice.message.content.as_ref().unwrap(),
......@@ -525,7 +529,7 @@ mod tests {
);
assert!(choice.finish_reason.is_none());
assert_eq!(choice.message.role, dynamo_async_openai::types::Role::User);
assert!(response.service_tier.is_none());
assert!(response.inner.service_tier.is_none());
}
#[tokio::test]
......@@ -562,8 +566,8 @@ mod tests {
let response = result.unwrap();
// Verify the response fields
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
assert_eq!(choice.index, 0);
assert_eq!(
choice.message.content.as_ref().unwrap(),
......@@ -630,8 +634,8 @@ mod tests {
assert!(result.is_ok());
let response = result.unwrap();
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
assert_eq!(choice.index, 0);
assert_eq!(
......@@ -653,43 +657,49 @@ mod tests {
// Create a delta with multiple choices
// ALLOW: function_call is deprecated
let data = NvCreateChatCompletionStreamResponse {
id: "test_id".to_string(),
model: "test_model".to_string(),
created: 1234567890,
service_tier: None,
usage: None,
system_fingerprint: None,
choices: vec![
dynamo_async_openai::types::ChatChoiceStream {
index: 0,
delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
role: Some(dynamo_async_openai::types::Role::Assistant),
content: Some(ChatCompletionMessageContent::Text("Choice 0".to_string())),
function_call: None,
tool_calls: None,
refusal: None,
reasoning_content: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: "test_id".to_string(),
model: "test_model".to_string(),
created: 1234567890,
service_tier: None,
usage: None,
system_fingerprint: None,
choices: vec![
dynamo_async_openai::types::ChatChoiceStream {
index: 0,
delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
role: Some(dynamo_async_openai::types::Role::Assistant),
content: Some(ChatCompletionMessageContent::Text(
"Choice 0".to_string(),
)),
function_call: None,
tool_calls: None,
refusal: None,
reasoning_content: None,
},
finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
stop_reason: None,
logprobs: None,
},
finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
stop_reason: None,
logprobs: None,
},
dynamo_async_openai::types::ChatChoiceStream {
index: 1,
delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
role: Some(dynamo_async_openai::types::Role::Assistant),
content: Some(ChatCompletionMessageContent::Text("Choice 1".to_string())),
function_call: None,
tool_calls: None,
refusal: None,
reasoning_content: None,
dynamo_async_openai::types::ChatChoiceStream {
index: 1,
delta: dynamo_async_openai::types::ChatCompletionStreamResponseDelta {
role: Some(dynamo_async_openai::types::Role::Assistant),
content: Some(ChatCompletionMessageContent::Text(
"Choice 1".to_string(),
)),
function_call: None,
tool_calls: None,
refusal: None,
reasoning_content: None,
},
finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
stop_reason: None,
logprobs: None,
},
finish_reason: Some(dynamo_async_openai::types::FinishReason::Stop),
stop_reason: None,
logprobs: None,
},
],
object: "chat.completion".to_string(),
],
object: "chat.completion".to_string(),
},
nvext: None,
};
......@@ -711,9 +721,9 @@ mod tests {
let mut response = result.unwrap();
// Verify the response fields
assert_eq!(response.choices.len(), 2);
response.choices.sort_by(|a, b| a.index.cmp(&b.index)); // Ensure the choices are ordered
let choice0 = &response.choices[0];
assert_eq!(response.inner.choices.len(), 2);
response.inner.choices.sort_by(|a, b| a.index.cmp(&b.index)); // Ensure the choices are ordered
let choice0 = &response.inner.choices[0];
assert_eq!(choice0.index, 0);
assert_eq!(
choice0.message.content.as_ref().unwrap(),
......@@ -728,7 +738,7 @@ mod tests {
dynamo_async_openai::types::Role::Assistant
);
let choice1 = &response.choices[1];
let choice1 = &response.inner.choices[1];
assert_eq!(choice1.index, 1);
assert_eq!(
choice1.message.content.as_ref().unwrap(),
......@@ -773,8 +783,8 @@ mod tests {
assert!(result.is_ok());
let response = result.unwrap();
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
// Verify tool calls are present
assert!(choice.message.tool_calls.is_some());
......@@ -816,8 +826,8 @@ mod tests {
assert!(result.is_ok());
let response = result.unwrap();
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
// Verify tool calls are present
assert!(choice.message.tool_calls.is_some());
......@@ -859,8 +869,8 @@ mod tests {
assert!(result.is_ok());
let response = result.unwrap();
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
// Verify tool calls are present
assert!(choice.message.tool_calls.is_some());
......@@ -900,8 +910,8 @@ mod tests {
assert!(result.is_ok());
let response = result.unwrap();
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
// Verify no tool calls are present
assert!(choice.message.tool_calls.is_none());
......@@ -928,7 +938,7 @@ mod tests {
// Manually set empty tool calls array
if let Some(ref mut data) = annotated_delta.data {
data.choices[0].delta.tool_calls = Some(vec![]); // Empty tool calls array
data.inner.choices[0].delta.tool_calls = Some(vec![]); // Empty tool calls array
}
let data = annotated_delta.data.unwrap();
......@@ -945,8 +955,8 @@ mod tests {
assert!(result.is_ok());
let response = result.unwrap();
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
// Verify tool calls array is empty
assert!(choice.message.tool_calls.is_none());
......@@ -992,8 +1002,8 @@ mod tests {
let response = result.unwrap();
// There should be one choice
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
// The tool_calls field should be present and parsed
assert!(choice.message.tool_calls.is_some());
......@@ -1050,8 +1060,8 @@ mod tests {
let response = result.unwrap();
// There should be one choice
assert_eq!(response.choices.len(), 1);
let choice = &response.choices[0];
assert_eq!(response.inner.choices.len(), 1);
let choice = &response.inner.choices[0];
// The finish_reason should be ToolCalls, not Stop, because tool calls are present
assert_eq!(
......
......@@ -278,19 +278,21 @@ impl DeltaGenerator {
// According to OpenAI spec: when stream_options.include_usage is true,
// all intermediate chunks should have usage: null
// The final usage chunk will be sent separately with empty choices
dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: self.id.clone(),
object: self.object.clone(),
created: self.created,
model: self.model.clone(),
system_fingerprint: self.system_fingerprint.clone(),
choices,
usage: if self.options.enable_usage && self.options.continuous_usage_stats {
Some(self.get_usage())
} else {
None
NvCreateChatCompletionStreamResponse {
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: self.id.clone(),
object: self.object.clone(),
created: self.created,
model: self.model.clone(),
system_fingerprint: self.system_fingerprint.clone(),
choices,
usage: if self.options.enable_usage && self.options.continuous_usage_stats {
Some(self.get_usage())
} else {
None
},
service_tier: self.service_tier.clone(),
},
service_tier: self.service_tier.clone(),
nvext: None, // Will be populated by router layer if needed
}
}
......@@ -303,15 +305,17 @@ impl DeltaGenerator {
pub fn create_usage_chunk(&self) -> NvCreateChatCompletionStreamResponse {
let usage = self.get_usage();
dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: self.id.clone(),
object: self.object.clone(),
created: self.created,
model: self.model.clone(),
system_fingerprint: self.system_fingerprint.clone(),
choices: vec![], // Empty choices for usage-only chunk
usage: Some(usage),
service_tier: self.service_tier.clone(),
NvCreateChatCompletionStreamResponse {
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: self.id.clone(),
object: self.object.clone(),
created: self.created,
model: self.model.clone(),
system_fingerprint: self.system_fingerprint.clone(),
choices: vec![], // Empty choices for usage-only chunk
usage: Some(usage),
service_tier: self.service_tier.clone(),
},
nvext: None,
}
}
......
......@@ -525,13 +525,13 @@ impl JailedStream {
// Process each item in the stream
while let Some(response) = stream.next().await {
if let Some(chat_response) = response.data.as_ref() {
last_stream_id.clone_from(&chat_response.id);
last_stream_model.clone_from(&chat_response.model);
last_stream_created = chat_response.created;
last_stream_id.clone_from(&chat_response.inner.id);
last_stream_model.clone_from(&chat_response.inner.model);
last_stream_created = chat_response.inner.created;
let mut all_emissions = Vec::new();
if chat_response.choices.is_empty() {
if chat_response.inner.choices.is_empty() {
// No choices processed (e.g., usage-only chunk)
// Pass through as-is to preserve usage and other metadata
yield response;
......@@ -539,7 +539,7 @@ impl JailedStream {
}
// Process each choice independently using the new architecture
for choice in &chat_response.choices {
for choice in &chat_response.inner.choices {
if let Some(ref content) = choice.delta.content {
// Jailing only applies to text content
let text_content = match content {
......@@ -676,14 +676,16 @@ impl JailedStream {
tracing::debug!("Stream ended while jailed, releasing accumulated content");
// Create a finalization response carrying forward real stream metadata
let dummy_response = NvCreateChatCompletionStreamResponse {
id: last_stream_id,
object: "chat.completion.chunk".to_string(),
created: last_stream_created,
model: last_stream_model,
choices: Vec::new(),
usage: None,
service_tier: None,
system_fingerprint: None,
inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse {
id: last_stream_id,
object: "chat.completion.chunk".to_string(),
created: last_stream_created,
model: last_stream_model,
choices: Vec::new(),
usage: None,
service_tier: None,
system_fingerprint: None,
},
nvext: None,
};
......@@ -713,7 +715,7 @@ impl JailedStream {
EmissionMode::Packed => {
// Pack all choices into a single response
let mut response = base_response.clone();
response.choices = emissions.into_iter().map(|e| e.into_choice()).collect();
response.inner.choices = emissions.into_iter().map(|e| e.into_choice()).collect();
vec![Annotated {
data: Some(response),
......@@ -729,7 +731,7 @@ impl JailedStream {
.into_iter()
.map(|emission| {
let mut response = base_response.clone();
response.choices = vec![emission.into_choice()];
response.inner.choices = vec![emission.into_choice()];
Annotated {
data: Some(response),
......@@ -1013,7 +1015,7 @@ impl JailedStream {
while let Some(mut response) = input_stream.next().await {
// Track if any choice emitted tool calls
if let Some(ref data) = response.data {
for choice in &data.choices {
for choice in &data.inner.choices {
if choice.delta.tool_calls.is_some() {
has_tool_calls_per_choice.insert(choice.index, true);
}
......@@ -1022,7 +1024,7 @@ impl JailedStream {
// Fix finish_reason based on jail mode and whether tool calls were emitted
if let Some(ref mut data) = response.data {
for choice in &mut data.choices {
for choice in &mut data.inner.choices {
if let Some(finish) = choice.finish_reason {
// Only modify Stop finish reason, preserve Length/ContentFilter
if finish == FinishReason::Stop {
......
......@@ -48,6 +48,8 @@ pub struct NvCreateCompletionRequest {
/// Completion response wrapper: the upstream `CreateCompletionResponse` plus an
/// optional `nvext` extension payload carried alongside it.
pub struct NvCreateCompletionResponse {
/// Upstream completion response. `#[serde(flatten)]` places its fields at the
/// same JSON level as this wrapper's own fields rather than under an `inner` key.
#[serde(flatten)]
pub inner: dynamo_async_openai::types::CreateCompletionResponse,
/// Free-form JSON extension data. Left out of the serialized output entirely
/// when it is `None`, so responses without extensions carry no `nvext` key.
#[serde(skip_serializing_if = "Option::is_none")]
pub nvext: Option<serde_json::Value>,
}
impl ContentProvider for dynamo_async_openai::types::Choice {
......@@ -296,9 +298,8 @@ impl ResponseFactory {
choices: vec![choice],
system_fingerprint: self.system_fingerprint.clone(),
usage,
nvext: None, // Will be populated by router layer if needed
};
NvCreateCompletionResponse { inner }
NvCreateCompletionResponse { inner, nvext: None }
}
}
......
......@@ -86,8 +86,8 @@ impl DeltaAggregator {
aggregator.system_fingerprint = Some(system_fingerprint);
}
// Aggregate nvext field (take the last non-None value)
if delta.inner.nvext.is_some() {
aggregator.nvext = delta.inner.nvext;
if delta.nvext.is_some() {
aggregator.nvext = delta.nvext;
}
// handle the choices
......@@ -168,10 +168,12 @@ impl DeltaAggregator {
object: "text_completion".to_string(),
system_fingerprint: aggregator.system_fingerprint,
choices,
nvext: aggregator.nvext,
};
let response = NvCreateCompletionResponse { inner };
let response = NvCreateCompletionResponse {
inner,
nvext: aggregator.nvext,
};
Ok(response)
}
......@@ -256,10 +258,9 @@ mod tests {
logprobs,
}],
object: "text_completion".to_string(),
nvext: None,
};
let response = NvCreateCompletionResponse { inner };
let response = NvCreateCompletionResponse { inner, nvext: None };
Annotated {
data: Some(response),
......@@ -387,10 +388,9 @@ mod tests {
},
],
object: "text_completion".to_string(),
nvext: None,
};
let response = NvCreateCompletionResponse { inner };
let response = NvCreateCompletionResponse { inner, nvext: None };
let annotated_delta = Annotated {
data: Some(response),
......
......@@ -218,10 +218,9 @@ impl DeltaGenerator {
} else {
None
},
nvext: None, // Will be populated by router layer if needed
};
NvCreateCompletionResponse { inner }
NvCreateCompletionResponse { inner, nvext: None }
}
/// Creates a final usage-only chunk for OpenAI compliance.
......@@ -240,10 +239,9 @@ impl DeltaGenerator {
system_fingerprint: self.system_fingerprint.clone(),
choices: vec![], // Empty choices for usage-only chunk
usage: Some(usage),
nvext: None, // Will be populated by router layer if needed
};
NvCreateCompletionResponse { inner }
NvCreateCompletionResponse { inner, nvext: None }
}
/// Check if usage tracking is enabled
......@@ -343,7 +341,7 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
};
if let Ok(nvext_json) = serde_json::to_value(&nvext_response) {
response.inner.nvext = Some(nvext_json);
response.nvext = Some(nvext_json);
if let Some(ref info) = worker_id_info {
tracing::debug!(
"Injected worker_id into completions nvext: prefill={:?}, decode={:?}",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment