Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2887cd1c
Unverified
Commit
2887cd1c
authored
Mar 30, 2026
by
ishandhanani
Committed by
GitHub
Mar 30, 2026
Browse files
refactor(1/3): move `nvext` to `dynamo-llm` and move `anthropic` to `dynamo-async-openai` (#7564)
parent
d6136f4a
Changes
32
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1379 additions
and
1322 deletions
+1379
-1322
lib/async-openai/src/types/anthropic.rs
lib/async-openai/src/types/anthropic.rs
+869
-0
lib/async-openai/src/types/chat.rs
lib/async-openai/src/types/chat.rs
+0
-8
lib/async-openai/src/types/completion.rs
lib/async-openai/src/types/completion.rs
+0
-4
lib/async-openai/src/types/mod.rs
lib/async-openai/src/types/mod.rs
+1
-0
lib/llm/src/audit/stream.rs
lib/llm/src/audit/stream.rs
+95
-81
lib/llm/src/entrypoint/input/batch.rs
lib/llm/src/entrypoint/input/batch.rs
+3
-2
lib/llm/src/entrypoint/input/text.rs
lib/llm/src/entrypoint/input/text.rs
+3
-2
lib/llm/src/http/service/openai.rs
lib/llm/src/http/service/openai.rs
+24
-20
lib/llm/src/perf/logprobs.rs
lib/llm/src/perf/logprobs.rs
+76
-68
lib/llm/src/preprocessor.rs
lib/llm/src/preprocessor.rs
+1
-1
lib/llm/src/preprocessor/speculative_prefill.rs
lib/llm/src/preprocessor/speculative_prefill.rs
+1
-1
lib/llm/src/protocols/anthropic/stream_converter.rs
lib/llm/src/protocols/anthropic/stream_converter.rs
+81
-75
lib/llm/src/protocols/anthropic/types.rs
lib/llm/src/protocols/anthropic/types.rs
+44
-897
lib/llm/src/protocols/openai/chat_completions.rs
lib/llm/src/protocols/openai/chat_completions.rs
+16
-13
lib/llm/src/protocols/openai/chat_completions/aggregator.rs
lib/llm/src/protocols/openai/chat_completions/aggregator.rs
+107
-97
lib/llm/src/protocols/openai/chat_completions/delta.rs
lib/llm/src/protocols/openai/chat_completions/delta.rs
+25
-21
lib/llm/src/protocols/openai/chat_completions/jail.rs
lib/llm/src/protocols/openai/chat_completions/jail.rs
+19
-17
lib/llm/src/protocols/openai/completions.rs
lib/llm/src/protocols/openai/completions.rs
+3
-2
lib/llm/src/protocols/openai/completions/aggregator.rs
lib/llm/src/protocols/openai/completions/aggregator.rs
+8
-8
lib/llm/src/protocols/openai/completions/delta.rs
lib/llm/src/protocols/openai/completions/delta.rs
+3
-5
No files found.
lib/async-openai/src/types/anthropic.rs
0 → 100644
View file @
2887cd1c
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Anthropic Messages API types.
//!
//! Pure protocol types for the `/v1/messages` endpoint -- request, response,
//! streaming events, error shapes, and count-tokens types.
use
serde
::{
Deserialize
,
Serialize
};
use
utoipa
::
ToSchema
;
/// Cache-control hint in the Anthropic style, used for prefix pinning with a TTL.
///
/// The raw `ttl` string is interpreted by [`CacheControl::ttl_seconds`].
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
pub struct CacheControl {
    /// Kind of cache entry; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub control_type: CacheControlType,

    /// Requested time-to-live, stored as the string the client sent: either
    /// integer seconds (e.g. `"600"`) or a shorthand (`"5m"` = 300s,
    /// `"1h"` = 3600s). The effective value is clamped to [300, 3600].
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ttl: Option<String>,
}
/// The `type` discriminator of a [`CacheControl`] hint, written in lowercase.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum CacheControlType {
    /// `"ephemeral"` -- also the value used when none is specified.
    #[default]
    Ephemeral,

    /// Any other tag value; lets unrecognized types deserialize instead of failing.
    #[serde(other)]
    Unknown,
}
/// Lower bound (and fallback) for a cache TTL: five minutes.
const MIN_TTL_SECONDS: u64 = 5 * 60;

/// Upper bound for a cache TTL: one hour.
const MAX_TTL_SECONDS: u64 = 60 * 60;
impl CacheControl {
    /// Resolve the `ttl` string into a number of seconds within [300, 3600].
    ///
    /// * no `ttl` at all -> 300
    /// * `"5m"` -> 300, `"1h"` -> 3600
    /// * a decimal integer (`"120"`, `"600"`) -> that value, clamped into range
    /// * anything else -> 300, after logging a warning
    pub fn ttl_seconds(&self) -> u64 {
        let seconds = match self.ttl.as_deref() {
            None => MIN_TTL_SECONDS,
            Some("5m") => 300,
            Some("1h") => 3600,
            Some(other) => other.parse::<u64>().unwrap_or_else(|_| {
                tracing::warn!("Unrecognized TTL '{}', defaulting to 300s", other);
                MIN_TTL_SECONDS
            }),
        };

        // The fallback value is already the minimum, so clamping it is a no-op.
        seconds.clamp(MIN_TTL_SECONDS, MAX_TTL_SECONDS)
    }
}
/// System prompt after parsing, with any `cache_control` from block arrays retained.
///
/// Note that the derived `Serialize` emits this struct's own shape (`text`
/// plus optional `cache_control`), not the string-or-blocks wire form that
/// `deserialize_system_prompt` accepts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemContent {
    /// The plain string, or the text of every system block concatenated together.
    pub text: String,

    /// The cache control taken from the last system block that carried one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// Deserialize `system` from either a plain string or an array of text blocks.
/// The Anthropic API accepts both `"system": "text"` and
/// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
fn
deserialize_system_prompt
<
'de
,
D
>
(
deserializer
:
D
)
->
Result
<
Option
<
SystemContent
>
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
#[derive(Deserialize)]
#[serde(untagged)]
enum
SystemPrompt
{
Text
(
String
),
Blocks
(
Vec
<
SystemBlock
>
),
}
#[derive(Deserialize)]
struct
SystemBlock
{
text
:
String
,
#[serde(default)]
cache_control
:
Option
<
CacheControl
>
,
}
let
maybe
:
Option
<
SystemPrompt
>
=
Option
::
deserialize
(
deserializer
)
?
;
Ok
(
maybe
.map
(|
sp
|
match
sp
{
SystemPrompt
::
Text
(
s
)
=>
SystemContent
{
text
:
s
,
cache_control
:
None
,
},
SystemPrompt
::
Blocks
(
blocks
)
=>
{
let
cache_control
=
blocks
.iter
()
.rev
()
.find_map
(|
b
|
b
.cache_control
.clone
());
let
text
=
blocks
.into_iter
()
.map
(|
b
|
b
.text
)
.collect
::
<
Vec
<
_
>>
()
.join
(
"
\n
"
);
SystemContent
{
text
,
cache_control
,
}
}
}))
}
/// Body of a `POST /v1/messages` request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicCreateMessageRequest {
    /// Model identifier (e.g. "claude-sonnet-4-20250514").
    pub model: String,

    /// Upper limit on the number of generated tokens.
    pub max_tokens: u32,

    /// Conversation turns.
    pub messages: Vec<AnthropicMessage>,

    /// Optional system prompt; accepted as a string or as an array of
    /// `{"type":"text","text":"..."}` blocks.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,

    /// Sampling temperature (0.0 - 1.0).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// Nucleus (top-p) sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,

    /// Top-K sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_k: Option<u32>,

    /// Custom sequences that stop generation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequences: Option<Vec<String>>,

    /// Whether the response should be streamed; `false` when omitted.
    #[serde(default)]
    pub stream: bool,

    /// Free-form request metadata (e.g. user_id).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,

    /// Tools the model is allowed to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<AnthropicTool>>,

    /// Policy for how the model picks a tool to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<AnthropicToolChoice>,

    /// Top-level cache control for automatic prompt prefix caching.
    /// When present, the system caches all content up to the last cacheable block.
    /// Matches the Anthropic Messages API automatic caching mode.
    /// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,

    /// Extended thinking configuration. When enabled, the model emits
    /// `thinking` content blocks with its internal reasoning ahead of the
    /// final response. `budget_tokens` bounds the tokens spent on thinking
    /// (must be >= 1024 and < max_tokens).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingConfig>,

    /// Service tier selection: `"auto"` or `"standard_only"`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub service_tier: Option<String>,

    /// Container identifier for stateful sandbox sessions.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub container: Option<String>,

    /// Output configuration: effort level and optional JSON schema format.
    /// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
    /// `format` specifies structured JSON output constraints.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_config: Option<serde_json::Value>,
}
/// Request-level configuration of extended thinking.
///
/// With `type` set to `"enabled"` the model produces `thinking` content blocks
/// holding its internal reasoning, and `budget_tokens` caps the tokens it may
/// spend on them (minimum 1024, must be less than `max_tokens`). With `type`
/// set to `"disabled"` no thinking blocks are produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingConfig {
    /// Either `"enabled"` or `"disabled"`; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub thinking_type: String,

    /// Token budget for internal reasoning. Only relevant when type is "enabled".
    #[serde(skip_serializing_if = "Option::is_none")]
    pub budget_tokens: Option<u32>,
}
/// One turn of the conversation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessage {
    /// Who sent the message.
    pub role: AnthropicRole,

    /// The message body. Flattened so that the variant's `content` field sits
    /// directly beside `role` in the JSON object.
    #[serde(flatten)]
    pub content: AnthropicMessageContent,
}
/// Sender of a message, serialized in lowercase (`"user"` / `"assistant"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicRole {
    User,
    Assistant,
}
/// Body of a message: a bare string or a list of content blocks.
///
/// Untagged, so the variant is picked by whichever shape the `content` value
/// matches, trying `Text` first.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicMessageContent {
    /// `content` is a plain string.
    Text { content: String },

    /// `content` is an array of structured blocks.
    Blocks { content: Vec<AnthropicContentBlock> },
}
/// One content block inside a message.
///
/// Serialization is derived (internally tagged on `type`, snake_case variant
/// names). Deserialization is hand-written so that block types not modeled
/// here are kept as `Other(Value)` rather than failing the whole request.
/// This matters because Claude Code may send block types we don't yet handle.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicContentBlock {
    /// Text block (`"text"`). May carry `citations` -- references to source
    /// documents backing the text, generated by the model when document/PDF
    /// content is provided and citation mode is enabled.
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Image block (`"image"`).
    Image { source: AnthropicImageSource },

    /// Tool call requested by the assistant (`"tool_use"`).
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Tool output supplied by the user (`"tool_result"`).
    ToolResult {
        tool_use_id: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        content: Option<ToolResultContent>,
        #[serde(skip_serializing_if = "Option::is_none")]
        is_error: Option<bool>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Assistant reasoning block (`"thinking"`, extended thinking).
    Thinking {
        thinking: String,
        signature: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Redacted reasoning (`"redacted_thinking"`). Holds encrypted reasoning
    /// data that is opaque to the client but must be passed back verbatim in
    /// multi-turn conversations so the model can keep its chain of thought.
    RedactedThinking { data: String },

    /// Server-initiated tool call (`"server_tool_use"`): a tool the API runs
    /// server-side (e.g., web search). The result arrives in a matching
    /// `web_search_tool_result` or similar block.
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },

    /// Output of a server-initiated tool (`"web_search_tool_result"`), holding
    /// the structured content returned by the server-side execution.
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Fallback for unrecognized block types. Keeps the complete JSON value so
    /// new Anthropic features don't break the endpoint and can be
    /// round-tripped or inspected.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Payload of a `tool_result` block. The Anthropic API accepts either a plain
/// string or an array of content blocks, so this enum is untagged.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContent {
    /// The result given as a single string.
    Text(String),

    /// The result given as an array of blocks.
    Blocks(Vec<ToolResultContentBlock>),
}
impl ToolResultContent {
    /// Consume the content and return its text.
    ///
    /// A plain string is returned unchanged. For the array form, the text of
    /// every `Text` block is concatenated with no separator and non-text
    /// blocks are skipped.
    pub fn into_text(self) -> String {
        let blocks = match self {
            ToolResultContent::Text(text) => return text,
            ToolResultContent::Blocks(blocks) => blocks,
        };

        let mut joined = String::new();
        for block in blocks {
            if let ToolResultContentBlock::Text { text } = block {
                joined.push_str(&text);
            }
        }
        joined
    }
}
/// One element of a `tool_result.content` array.
///
/// Untagged: an object with a string `text` field becomes `Text`; everything
/// else is preserved as raw JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContentBlock {
    /// A block carrying text.
    Text { text: String },

    /// Catch-all for non-text blocks (images, etc.) in tool results.
    Other(serde_json::Value),
}
/// Custom deserializer for `AnthropicContentBlock` that handles unknown types
/// gracefully. Since serde's `#[serde(other)]` is not supported on internally
/// tagged enums, we deserialize as `Value` first and dispatch manually.
impl
<
'de
>
Deserialize
<
'de
>
for
AnthropicContentBlock
{
fn
deserialize
<
D
>
(
deserializer
:
D
)
->
Result
<
Self
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
let
value
=
serde_json
::
Value
::
deserialize
(
deserializer
)
?
;
let
block_type
=
value
.get
(
"type"
)
.and_then
(|
t
|
t
.as_str
())
.unwrap_or
(
""
)
.to_string
();
match
block_type
.as_str
()
{
"text"
=>
{
let
text
=
value
.get
(
"text"
)
.and_then
(|
t
|
t
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"text"
))
?
.to_string
();
let
citations
:
Option
<
Vec
<
serde_json
::
Value
>>
=
value
.get
(
"citations"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Text
{
text
,
citations
,
cache_control
,
})
}
"image"
=>
{
let
source
:
AnthropicImageSource
=
serde_json
::
from_value
(
value
.get
(
"source"
)
.cloned
()
.unwrap_or_default
())
.map_err
(
serde
::
de
::
Error
::
custom
)
?
;
Ok
(
AnthropicContentBlock
::
Image
{
source
})
}
"tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolUse
{
id
,
name
,
input
,
cache_control
,
})
}
"tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
:
Option
<
ToolResultContent
>
=
value
.get
(
"content"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
is_error
=
value
.get
(
"is_error"
)
.and_then
(|
v
|
v
.as_bool
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolResult
{
tool_use_id
,
content
,
is_error
,
cache_control
,
})
}
"thinking"
=>
{
let
thinking
=
value
.get
(
"thinking"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"thinking"
))
?
.to_string
();
let
signature
=
value
.get
(
"signature"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"signature"
))
?
.to_string
();
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Thinking
{
thinking
,
signature
,
cache_control
,
})
}
"redacted_thinking"
=>
{
let
data
=
value
.get
(
"data"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"data"
))
?
.to_string
();
Ok
(
AnthropicContentBlock
::
RedactedThinking
{
data
})
}
"server_tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
Ok
(
AnthropicContentBlock
::
ServerToolUse
{
id
,
name
,
input
})
}
"web_search_tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
=
value
.get
(
"content"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
([]));
Ok
(
AnthropicContentBlock
::
WebSearchToolResult
{
tool_use_id
,
content
,
})
}
other
=>
{
tracing
::
debug!
(
"Unrecognized Anthropic content block type '{}', preserving as Other"
,
other
);
Ok
(
AnthropicContentBlock
::
Other
(
value
))
}
}
}
}
/// Where the bytes of an image content block come from.
///
/// All three fields are required when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicImageSource {
    /// Source kind; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub source_type: String,

    /// The `media_type` string of the image.
    pub media_type: String,

    /// The image payload as a string.
    pub data: String,
}
/// Definition of a tool offered to the model.
///
/// Client (custom) tools need `name` + `input_schema`. Server tools
/// (web_search, bash, text_editor, code_execution, etc.) are identified by
/// their `type` field (e.g. `"web_search_20260209"`) and may lack
/// `input_schema`. Everything except `name` is optional so that both kinds
/// deserialize and pass through to the backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicTool {
    /// Tool name (required for client tools, present on server tools too).
    pub name: String,

    /// Type discriminator. Client tools use `"custom"` (or omit it); server
    /// tools use versioned types such as `"web_search_20260209"`.
    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
    pub tool_type: Option<String>,

    /// Optional description of the tool.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// JSON Schema of the tool input. Required for client tools, absent on
    /// server tools (which define their own input shape server-side).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_schema: Option<serde_json::Value>,

    /// Cache control breakpoint placed on this tool definition.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// How the model should select a tool.
///
/// Untagged, so variant order matters: `Named` requires a `name` field and is
/// listed first so serde tries the stricter shape before `Simple`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicToolChoice {
    /// A specific tool: `{type: "tool", name: "..."}`.
    Named(AnthropicToolChoiceNamed),

    /// A mode without a tool name: "auto", "any", or "none".
    Simple(AnthropicToolChoiceSimple),
}
/// Tool choice given only as a mode, without naming a tool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceSimple {
    /// The selected mode; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,

    /// When true, the model calls tools one at a time rather than possibly
    /// issuing several tool calls in a single response.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
/// Values of the `type` field in a tool choice, serialized in lowercase
/// (`"auto"`, `"any"`, `"none"`, `"tool"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicToolChoiceMode {
    Auto,
    Any,
    None,
    Tool,
}
/// Tool choice that names the tool to use.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceNamed {
    /// The selected mode; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,

    /// Name of the tool.
    pub name: String,

    /// When true, the model calls tools one at a time rather than possibly
    /// issuing several tool calls in a single response.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
/// Body of a non-streaming `POST /v1/messages` response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageResponse {
    /// Identifier of the message.
    pub id: String,

    /// Object type; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub object_type: String,

    /// Role of the message author.
    pub role: String,

    /// Content blocks making up the response.
    pub content: Vec<AnthropicResponseContentBlock>,

    /// Model that produced the response.
    pub model: String,

    /// Why generation stopped, if it has.
    pub stop_reason: Option<AnthropicStopReason>,

    /// The stop sequence, if any.
    pub stop_sequence: Option<String>,

    /// Token accounting for the request.
    pub usage: AnthropicUsage,
}
/// One content block of a response.
///
/// The Anthropic API returns up to 12 different block types. The common ones
/// are modeled explicitly (internally tagged on `type`, snake_case names) and
/// the remainder fall into `Other` so the proxy can forward them without
/// losing data.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicResponseContentBlock {
    /// `"thinking"` block.
    Thinking { thinking: String, signature: String },

    /// `"text"` block, optionally with citations.
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
    },

    /// `"tool_use"` block.
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
    },

    /// `"redacted_thinking"` block.
    RedactedThinking { data: String },

    /// `"server_tool_use"` block; `input` defaults when absent.
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },

    /// `"web_search_tool_result"` block; `content` defaults when absent.
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Fallback for new/uncommon block types (web_fetch_tool_result,
    /// code_execution_tool_result, container_upload, etc.) so the proxy can
    /// serialize them back without data loss.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Token accounting reported with a response.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AnthropicUsage {
    /// Count of input tokens.
    pub input_tokens: u32,

    /// Count of output tokens.
    pub output_tokens: u32,

    /// Input tokens spent creating a new cache entry.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_creation_input_tokens: Option<u32>,

    /// Input tokens served from the prompt cache (prefix cache hits).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_read_input_tokens: Option<u32>,
}
/// Why the model stopped generating; serialized in snake_case
/// (e.g. `"end_turn"`, `"max_tokens"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum AnthropicStopReason {
    EndTurn,
    MaxTokens,
    StopSequence,
    ToolUse,

    /// The model paused to hand back control in an agentic loop and intends
    /// to continue in a later turn. Used with extended thinking / tool use.
    PauseTurn,

    /// The model declined to generate content (safety refusal).
    Refusal,
}
/// SSE event types for the Anthropic streaming API.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
#[serde(tag
=
"type"
)]
pub
enum
AnthropicStreamEvent
{
#[serde(rename
=
"message_start"
)]
MessageStart
{
message
:
AnthropicMessageResponse
},
#[serde(rename
=
"content_block_start"
)]
ContentBlockStart
{
index
:
u32
,
content_block
:
AnthropicResponseContentBlock
,
},
#[serde(rename
=
"content_block_delta"
)]
ContentBlockDelta
{
index
:
u32
,
delta
:
AnthropicDelta
},
#[serde(rename
=
"content_block_stop"
)]
ContentBlockStop
{
index
:
u32
},
#[serde(rename
=
"message_delta"
)]
MessageDelta
{
delta
:
AnthropicMessageDeltaBody
,
usage
:
AnthropicUsage
,
},
#[serde(rename
=
"message_stop"
)]
MessageStop
{},
#[serde(rename
=
"ping"
)]
Ping
{},
#[serde(rename
=
"error"
)]
Error
{
error
:
AnthropicErrorBody
},
}
/// Payload of a streaming `content_block_delta` event.
///
/// Internally tagged on `type`; each variant's tag is its name in snake_case.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicDelta {
    /// `thinking_delta`: a piece of thinking text.
    ThinkingDelta { thinking: String },

    /// `text_delta`: a piece of response text.
    TextDelta { text: String },

    /// `input_json_delta`: a fragment of JSON.
    InputJsonDelta { partial_json: String },

    /// `signature_delta`: incremental signature for a thinking block (sent at the end).
    SignatureDelta { signature: String },

    /// `citations_delta`: incremental citation attached to a text block.
    CitationsDelta { citation: serde_json::Value },
}
/// The `delta` object of a `message_delta` event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageDeltaBody {
    /// Why generation stopped; always serialized, as `null` when unset.
    pub stop_reason: Option<AnthropicStopReason>,

    /// The stop sequence, omitted from the output when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequence: Option<String>,
}
/// Anthropic API error response wrapper.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
pub
struct
AnthropicErrorResponse
{
#[serde(rename
=
"type"
)]
pub
object_type
:
String
,
pub
error
:
AnthropicErrorBody
,
}
/// The `error` object inside an error response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorBody {
    /// Error category (e.g. `"invalid_request_error"`); carried under the
    /// JSON key `type`.
    #[serde(rename = "type")]
    pub error_type: String,

    /// Human-readable description of the error.
    pub message: String,
}
impl
AnthropicErrorResponse
{
/// Create an `invalid_request_error` response.
pub
fn
invalid_request
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"invalid_request_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create an `api_error` (internal server error) response.
pub
fn
api_error
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"api_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create a `not_found_error` response.
pub
fn
not_found
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"not_found_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
}
/// Request body for `POST /v1/messages/count_tokens`.
///
/// This type is deserialize-only, so its field attributes carry no
/// serialization directives.
#[derive(Debug, Clone, Deserialize)]
pub struct AnthropicCountTokensRequest {
    /// Model identifier.
    pub model: String,

    /// Conversation turns to be counted.
    pub messages: Vec<AnthropicMessage>,

    /// Optional system prompt; accepted as a string or an array of text blocks.
    #[serde(default, deserialize_with = "deserialize_system_prompt")]
    pub system: Option<SystemContent>,

    /// Optional tool definitions; `None` when omitted.
    #[serde(default)]
    pub tools: Option<Vec<AnthropicTool>>,
}
/// Body of a `POST /v1/messages/count_tokens` response.
#[derive(Debug, Clone, Serialize)]
pub struct AnthropicCountTokensResponse {
    /// The input token count being reported.
    pub input_tokens: u32,
}
impl
AnthropicCountTokensRequest
{
/// Estimate input token count using a `len/3` heuristic.
pub
fn
estimate_tokens
(
&
self
)
->
u32
{
let
mut
total_len
:
usize
=
0
;
if
let
Some
(
system
)
=
&
self
.system
{
total_len
+=
system
.text
.len
();
}
for
msg
in
&
self
.messages
{
// Count role
total_len
+=
match
msg
.role
{
AnthropicRole
::
User
=>
4
,
AnthropicRole
::
Assistant
=>
9
,
};
// Count content
match
&
msg
.content
{
AnthropicMessageContent
::
Text
{
content
}
=>
total_len
+=
content
.len
(),
AnthropicMessageContent
::
Blocks
{
content
}
=>
{
for
block
in
content
{
total_len
+=
estimate_block_len
(
block
);
}
}
}
}
if
let
Some
(
tools
)
=
&
self
.tools
{
for
tool
in
tools
{
total_len
+=
tool
.name
.len
();
if
let
Some
(
desc
)
=
&
tool
.description
{
total_len
+=
desc
.len
();
}
if
let
Some
(
schema
)
=
&
tool
.input_schema
{
total_len
+=
schema
.to_string
()
.len
();
}
}
}
let
tokens
=
total_len
/
3
;
if
tokens
==
0
&&
total_len
>
0
{
1
}
else
{
tokens
as
u32
}
}
}
fn
estimate_block_len
(
block
:
&
AnthropicContentBlock
)
->
usize
{
match
block
{
AnthropicContentBlock
::
Text
{
text
,
..
}
=>
text
.len
(),
AnthropicContentBlock
::
ToolUse
{
name
,
input
,
..
}
=>
name
.len
()
+
input
.to_string
()
.len
(),
AnthropicContentBlock
::
ToolResult
{
content
,
..
}
=>
content
.as_ref
()
.map
(|
c
|
match
c
{
ToolResultContent
::
Text
(
s
)
=>
s
.len
(),
ToolResultContent
::
Blocks
(
blocks
)
=>
blocks
.iter
()
.map
(|
b
|
match
b
{
ToolResultContentBlock
::
Text
{
text
}
=>
text
.len
(),
ToolResultContentBlock
::
Other
(
v
)
=>
v
.to_string
()
.len
(),
})
.sum
(),
})
.unwrap_or
(
0
),
AnthropicContentBlock
::
Thinking
{
thinking
,
..
}
=>
thinking
.len
(),
AnthropicContentBlock
::
RedactedThinking
{
data
,
..
}
=>
data
.len
(),
AnthropicContentBlock
::
ServerToolUse
{
name
,
input
,
..
}
=>
{
name
.len
()
+
input
.to_string
()
.len
()
}
AnthropicContentBlock
::
WebSearchToolResult
{
content
,
..
}
=>
content
.to_string
()
.len
(),
AnthropicContentBlock
::
Image
{
..
}
=>
256
,
// rough estimate for image metadata
AnthropicContentBlock
::
Other
(
v
)
=>
v
.to_string
()
.len
(),
}
}
lib/async-openai/src/types/chat.rs
View file @
2887cd1c
...
...
@@ -1182,10 +1182,6 @@ pub struct CreateChatCompletionResponse {
/// The object type, which is always `chat.completion`.
pub
object
:
String
,
pub
usage
:
Option
<
CompletionUsage
>
,
/// NVIDIA extension field for response metadata (worker IDs, etc.)
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
/// Parsed server side events stream until an \[DONE\] is received from server.
...
...
@@ -1281,10 +1277,6 @@ pub struct CreateChatCompletionStreamResponse {
/// An optional field that will only be present when you set `stream_options: {"include_usage": true}` in your request.
/// When present, it contains a null value except for the last chunk which contains the token usage statistics for the entire request.
pub
usage
:
Option
<
CompletionUsage
>
,
/// NVIDIA extension field for response metadata
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
#[cfg(test)]
...
...
lib/async-openai/src/types/completion.rs
View file @
2887cd1c
...
...
@@ -224,10 +224,6 @@ pub struct CreateCompletionResponse {
/// The object type, which is always "text_completion"
pub
object
:
String
,
pub
usage
:
Option
<
CompletionUsage
>
,
/// NVIDIA extension field for response metadata (worker IDs, etc.)
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
/// Parsed server side events stream until an \[DONE\] is received from server.
...
...
lib/async-openai/src/types/mod.rs
View file @
2887cd1c
...
...
@@ -10,6 +10,7 @@
//! Types used in OpenAI API requests and responses.
//! These types are created from component schemas in the [OpenAPI spec](https://github.com/openai/openai-openapi)
pub
mod
anthropic
;
mod
assistant
;
mod
assistant_impls
;
mod
assistant_stream
;
...
...
lib/llm/src/audit/stream.rs
View file @
2887cd1c
...
...
@@ -90,14 +90,16 @@ where
tracing
::
warn!
(
"audit: aggregation future canceled/failed"
);
// Return minimal response if aggregation failed
NvCreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
},
nvext
:
None
,
}
})
...
...
@@ -125,14 +127,16 @@ where
Err
(
e
)
=>
{
tracing
::
warn!
(
"fold aggregation failed: {e}"
);
let
fallback
=
NvCreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
},
nvext
:
None
,
};
let
_
=
tx
.send
(
fallback
.clone
());
...
...
@@ -145,14 +149,16 @@ where
rx
.await
.unwrap_or_else
(|
_
|
{
tracing
::
warn!
(
"fold aggregation future canceled"
);
NvCreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
},
nvext
:
None
,
}
})
...
...
@@ -171,8 +177,8 @@ pub fn final_response_to_one_chunk_stream(
)
->
std
::
pin
::
Pin
<
Box
<
dyn
futures
::
Stream
<
Item
=
Annotated
<
NvCreateChatCompletionStreamResponse
>>
+
Send
>
,
>
{
let
mut
choices
:
Vec
<
ChatChoiceStream
>
=
Vec
::
with_capacity
(
resp
.choices
.len
());
for
(
idx
,
ch
)
in
resp
.choices
.iter
()
.enumerate
()
{
let
mut
choices
:
Vec
<
ChatChoiceStream
>
=
Vec
::
with_capacity
(
resp
.
inner.
choices
.len
());
for
(
idx
,
ch
)
in
resp
.
inner.
choices
.iter
()
.enumerate
()
{
// Convert FunctionCall to FunctionCallStream if present
#[allow(deprecated)]
let
function_call
=
ch
.message.function_call
.as_ref
()
.map
(|
fc
|
{
...
...
@@ -222,14 +228,16 @@ pub fn final_response_to_one_chunk_stream(
}
let
chunk
=
NvCreateChatCompletionStreamResponse
{
id
:
resp
.id
.clone
(),
object
:
"chat.completion.chunk"
.to_string
(),
created
:
resp
.created
,
model
:
resp
.model
.clone
(),
system_fingerprint
:
resp
.system_fingerprint
.clone
(),
service_tier
:
resp
.service_tier
.clone
(),
choices
,
usage
:
resp
.usage
.clone
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
resp
.inner.id
.clone
(),
object
:
"chat.completion.chunk"
.to_string
(),
created
:
resp
.inner.created
,
model
:
resp
.inner.model
.clone
(),
system_fingerprint
:
resp
.inner.system_fingerprint
.clone
(),
service_tier
:
resp
.inner.service_tier
.clone
(),
choices
,
usage
:
resp
.inner.usage
.clone
(),
},
nvext
:
resp
.nvext
.clone
(),
};
...
...
@@ -275,14 +283,16 @@ mod tests {
};
let
response
=
NvCreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
};
...
...
@@ -314,14 +324,16 @@ mod tests {
};
let
response
=
NvCreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
};
...
...
@@ -339,7 +351,7 @@ mod tests {
chunk
.data
.as_ref
()
.and_then
(|
d
|
d
.choices
.first
())
.and_then
(|
d
|
d
.
inner.
choices
.first
())
.and_then
(|
c
|
c
.delta.content
.as_ref
())
.and_then
(|
content
|
match
content
{
ChatCompletionMessageContent
::
Text
(
text
)
=>
Some
(
text
.clone
()),
...
...
@@ -396,7 +408,7 @@ mod tests {
assert_eq!
(
results
.len
(),
0
,
"Empty stream should produce no chunks"
);
// Verify fallback response (aggregation will fail on empty stream)
assert_eq!
(
final_resp
.object
,
"chat.completion"
);
assert_eq!
(
final_resp
.
inner.
object
,
"chat.completion"
);
// Should get fallback response, not panic
}
...
...
@@ -415,7 +427,7 @@ mod tests {
assert_eq!
(
extract_content
(
&
results
[
0
]),
"Single chunk"
);
// Verify aggregation
assert_eq!
(
final_resp
.object
,
"chat.completion"
);
assert_eq!
(
final_resp
.
inner.
object
,
"chat.completion"
);
}
#[tokio::test]
...
...
@@ -423,32 +435,34 @@ mod tests {
// Test that metadata (id, event, comment) is preserved through passthrough
let
chunk_with_metadata
=
Annotated
{
data
:
Some
(
NvCreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[{
#[allow(deprecated)]
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
role
:
Some
(
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Content"
.to_string
(),
)),
tool_calls
:
None
,
function_call
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}
}],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[{
#[allow(deprecated)]
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
role
:
Some
(
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Content"
.to_string
(),
)),
tool_calls
:
None
,
function_call
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}
}],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
}),
id
:
Some
(
"correlation-123"
.to_string
()),
...
...
@@ -481,7 +495,7 @@ mod tests {
let
(
resp1
,
resp2
)
=
tokio
::
join!
(
future1
,
future2
);
// Both should complete successfully
assert_eq!
(
resp1
.object
,
"chat.completion"
);
assert_eq!
(
resp2
.object
,
"chat.completion"
);
assert_eq!
(
resp1
.
inner.
object
,
"chat.completion"
);
assert_eq!
(
resp2
.
inner.
object
,
"chat.completion"
);
}
}
lib/llm/src/entrypoint/input/batch.rs
View file @
2887cd1c
...
...
@@ -238,8 +238,9 @@ async fn evaluate(
match
(
item
.data
.as_ref
(),
item
.event
.as_deref
())
{
(
Some
(
data
),
_
)
=>
{
// Normal case
let
choice
=
data
.choices
.first
();
let
chat_comp
=
choice
.as_ref
()
.unwrap
();
let
Some
(
chat_comp
)
=
data
.inner.choices
.first
()
else
{
continue
;
};
if
let
Some
(
c
)
=
&
chat_comp
.delta.content
{
match
c
{
ChatCompletionMessageContent
::
Text
(
text
)
=>
{
...
...
lib/llm/src/entrypoint/input/text.rs
View file @
2887cd1c
...
...
@@ -138,8 +138,9 @@ async fn main_loop(
match
(
item
.data
.as_ref
(),
item
.event
.as_deref
())
{
(
Some
(
data
),
_
)
=>
{
// Normal case
let
entry
=
data
.choices
.first
();
let
chat_comp
=
entry
.as_ref
()
.unwrap
();
let
Some
(
chat_comp
)
=
data
.inner.choices
.first
()
else
{
continue
;
};
if
let
Some
(
c
)
=
&
chat_comp
.delta.content
{
match
c
{
ChatCompletionMessageContent
::
Text
(
text
)
=>
{
...
...
lib/llm/src/http/service/openai.rs
View file @
2887cd1c
...
...
@@ -991,7 +991,7 @@ fn streaming_tool_dispatch_events(
};
let
mut
events
=
vec!
[];
for
choice
in
&
data
.choices
{
for
choice
in
&
data
.
inner.
choices
{
let
Some
(
tool_calls
)
=
&
choice
.delta.tool_calls
else
{
continue
;
};
...
...
@@ -1034,7 +1034,7 @@ fn accumulate_reasoning_dispatch(
};
let
mut
events
=
vec!
[];
for
choice
in
&
data
.choices
{
for
choice
in
&
data
.
inner.
choices
{
let
buffer
=
buffers
.entry
(
choice
.index
)
.or_default
();
let
has_reasoning
=
choice
.delta
...
...
@@ -2892,15 +2892,17 @@ mod tests {
// Create a normal data event
let
normal_event
=
Annotated
::
<
NvCreateChatCompletionStreamResponse
>
{
data
:
Some
(
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[],
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
service_tier
:
None
,
usage
:
None
,
data
:
Some
(
NvCreateChatCompletionStreamResponse
{
inner
:
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[],
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
service_tier
:
None
,
usage
:
None
,
},
nvext
:
None
,
}),
id
:
Some
(
"msg-1"
.to_string
()),
...
...
@@ -3162,15 +3164,17 @@ mod tests {
fn
make_stream_response
(
choices
:
Vec
<
ChatChoiceStream
>
,
)
->
Annotated
<
NvCreateChatCompletionStreamResponse
>
{
let
response
=
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
,
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
let
response
=
NvCreateChatCompletionStreamResponse
{
inner
:
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
,
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
};
Annotated
{
...
...
lib/llm/src/perf/logprobs.rs
View file @
2887cd1c
...
...
@@ -128,7 +128,7 @@ impl LogprobExtractor for NvCreateChatCompletionStreamResponse {
fn
extract_logprobs_by_choice
(
&
self
)
->
HashMap
<
u32
,
Vec
<
TokenLogProbs
>>
{
let
mut
result
=
HashMap
::
new
();
for
choice
in
&
self
.choices
{
for
choice
in
&
self
.
inner.
choices
{
let
choice_index
=
choice
.index
;
let
choice_logprobs
=
choice
...
...
@@ -949,34 +949,36 @@ mod tests {
)
->
NvCreateChatCompletionStreamResponse
{
#[expect(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
),
),
)
,
function
_call
:
None
,
tool_calls
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish
_reason
:
Some
(
FinishReason
::
Stop
)
,
stop_reason
:
None
,
logprobs
:
Some
(
ChatChoiceL
ogprobs
{
content
:
Some
(
token_logprobs
)
,
refusal
:
None
,
}
)
,
}]
,
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
function_call
:
None
,
tool
_call
s
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish_reason
:
Some
(
FinishReason
::
Stop
)
,
stop
_reason
:
None
,
logprobs
:
Some
(
ChatChoiceLogprobs
{
content
:
Some
(
token_l
ogprobs
),
refusal
:
None
,
})
,
}
]
,
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
}
,
nvext
:
None
,
}
}
...
...
@@ -1012,14 +1014,16 @@ mod tests {
.collect
();
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
,
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
,
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
@@ -1341,31 +1345,33 @@ mod tests {
// Test with choice that has no logprobs
#[expect(deprecated)]
let
response
=
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
),
),
)
,
function
_call
:
None
,
tool_calls
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish
_reason
:
Some
(
FinishReason
::
Stop
)
,
stop_reason
:
None
,
logprobs
:
None
,
// No logprobs
}]
,
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
function_call
:
None
,
tool
_call
s
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish_reason
:
Some
(
FinishReason
::
Stop
)
,
stop
_reason
:
None
,
logprobs
:
None
,
// No logprobs
}],
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
}
,
nvext
:
None
,
};
...
...
@@ -1573,14 +1579,16 @@ mod tests {
// In practice, this would have real logprobs data
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
lib/llm/src/preprocessor.rs
View file @
2887cd1c
...
...
@@ -1217,7 +1217,7 @@ impl OpenAIPreprocessor {
let
processed_response
=
if
let
Some
(
ref
mut
parser
)
=
state
.reasoning_parser
{
response
.map_data
(|
mut
data
|
{
// Process all choices, not just the first one
for
choice
in
data
.choices
.iter_mut
()
{
for
choice
in
data
.
inner.
choices
.iter_mut
()
{
// Reasoning parsing only applies to text content
if
let
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
...
...
lib/llm/src/preprocessor/speculative_prefill.rs
View file @
2887cd1c
...
...
@@ -111,7 +111,7 @@ pub fn maybe_wrap_stream(
let
mut
prefill_tx
=
Some
(
tx
);
Box
::
pin
(
stream
.map
(
move
|
item
|
{
if
let
Some
(
ref
resp
)
=
item
.data
{
for
choice
in
&
resp
.choices
{
for
choice
in
&
resp
.
inner.
choices
{
if
let
Some
(
ChatCompletionMessageContent
::
Text
(
ref
text
))
=
choice
.delta.content
{
accumulated_text
.push_str
(
text
);
}
...
...
lib/llm/src/protocols/anthropic/stream_converter.rs
View file @
2887cd1c
...
...
@@ -106,7 +106,7 @@ impl AnthropicStreamConverter {
let
mut
events
=
Vec
::
new
();
// Capture real token usage from engine when available (typically on the final chunk).
if
let
Some
(
usage
)
=
&
chunk
.usage
{
if
let
Some
(
usage
)
=
&
chunk
.
inner.
usage
{
self
.input_token_count
=
usage
.prompt_tokens
;
self
.output_token_count
=
usage
.completion_tokens
;
self
.cached_token_count
=
usage
...
...
@@ -115,7 +115,7 @@ impl AnthropicStreamConverter {
.and_then
(|
d
|
d
.cached_tokens
);
}
for
choice
in
&
chunk
.choices
{
for
choice
in
&
chunk
.
inner.
choices
{
let
delta
=
&
choice
.delta
;
// Track finish reason
...
...
@@ -444,7 +444,7 @@ impl AnthropicStreamConverter {
)
->
Vec
<
TaggedEvent
>
{
let
mut
events
=
Vec
::
new
();
if
let
Some
(
usage
)
=
&
chunk
.usage
{
if
let
Some
(
usage
)
=
&
chunk
.
inner.
usage
{
self
.input_token_count
=
usage
.prompt_tokens
;
self
.output_token_count
=
usage
.completion_tokens
;
self
.cached_token_count
=
usage
...
...
@@ -453,7 +453,7 @@ impl AnthropicStreamConverter {
.and_then
(|
d
|
d
.cached_tokens
);
}
for
choice
in
&
chunk
.choices
{
for
choice
in
&
chunk
.
inner.
choices
{
let
delta
=
&
choice
.delta
;
if
let
Some
(
ref
fr
)
=
choice
.finish_reason
{
...
...
@@ -722,27 +722,29 @@ mod tests {
fn
text_chunk
(
text
:
&
str
)
->
NvCreateChatCompletionStreamResponse
{
#[allow(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
text
.into
())),
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
text
.into
())),
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
@@ -755,35 +757,37 @@ mod tests {
)
->
NvCreateChatCompletionStreamResponse
{
#[allow(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
Some
(
vec!
[
ChatCompletionMessageToolCallChunk
{
index
:
tc_index
,
id
:
id
.map
(
String
::
from
),
r
#
type
:
Some
(
ChatCompletionToolType
::
Function
),
function
:
Some
(
FunctionCallStream
{
name
:
name
.map
(
String
::
from
),
arguments
:
args
.map
(
String
::
from
),
}),
}]),
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
Some
(
vec!
[
ChatCompletionMessageToolCallChunk
{
index
:
tc_index
,
id
:
id
.map
(
String
::
from
),
r
#
type
:
Some
(
ChatCompletionToolType
::
Function
),
function
:
Some
(
FunctionCallStream
{
name
:
name
.map
(
String
::
from
),
arguments
:
args
.map
(
String
::
from
),
}),
}]),
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
@@ -908,27 +912,29 @@ mod tests {
fn
reasoning_chunk
(
text
:
&
str
)
->
NvCreateChatCompletionStreamResponse
{
#[allow(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
Some
(
text
.into
()),
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
Some
(
text
.into
()),
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
lib/llm/src/protocols/anthropic/types.rs
View file @
2887cd1c
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Anthropic Messages API
types and
conversion logic.
//! Anthropic Messages API conversion logic.
//!
//! All request/response types for the `/v1/messages` endpoint, plus
//! bidirectional conversion to/from the internal chat completions format.
//! Pure protocol types live in `dynamo_async_openai::types::anthropic`.
//! This module provides bidirectional conversion to/from the internal
//! chat completions format used by the Dynamo engine.
// Re-export all pure Anthropic protocol types so existing `use crate::protocols::anthropic::*`
// continues to work throughout dynamo-llm.
pub
use
dynamo_async_openai
::
types
::
anthropic
::
*
;
use
dynamo_async_openai
::
types
::{
ChatCompletionMessageToolCall
,
ChatCompletionNamedToolChoice
,
...
...
@@ -17,764 +22,13 @@ use dynamo_async_openai::types::{
ChatCompletionTool
,
ChatCompletionToolChoiceOption
,
ChatCompletionToolType
,
FunctionName
,
FunctionObject
,
ImageUrl
,
ReasoningContent
,
};
use
serde
::{
Deserialize
,
Serialize
};
use
uuid
::
Uuid
;
use
crate
::
protocols
::
openai
::
chat_completions
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionResponse
,
};
use
crate
::
protocols
::
openai
::
common_ext
::
CommonExt
;
use
crate
::
protocols
::
openai
::
nvext
::{
CacheControl
,
NvExt
};
// ---------------------------------------------------------------------------
// Custom deserializers
// ---------------------------------------------------------------------------
/// Normalized form of the request's `system` prompt.
///
/// The wire format allows either a plain string or an array of text blocks;
/// `deserialize_system_prompt` collapses both shapes into this struct so the
/// rest of the code only deals with one representation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemContent {
    /// The prompt text: the plain string as-is, or the block texts joined
    /// with newlines when the prompt arrived as an array of blocks.
    pub text: String,
    /// Cache control taken from the last system block that carried one;
    /// `None` for plain-string prompts. Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// Deserialize `system` from either a plain string or an array of text blocks.
/// The Anthropic API accepts both `"system": "text"` and
/// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
fn
deserialize_system_prompt
<
'de
,
D
>
(
deserializer
:
D
)
->
Result
<
Option
<
SystemContent
>
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
#[derive(Deserialize)]
#[serde(untagged)]
enum
SystemPrompt
{
Text
(
String
),
Blocks
(
Vec
<
SystemBlock
>
),
}
#[derive(Deserialize)]
struct
SystemBlock
{
text
:
String
,
#[serde(default)]
cache_control
:
Option
<
CacheControl
>
,
}
let
maybe
:
Option
<
SystemPrompt
>
=
Option
::
deserialize
(
deserializer
)
?
;
Ok
(
maybe
.map
(|
sp
|
match
sp
{
SystemPrompt
::
Text
(
s
)
=>
SystemContent
{
text
:
s
,
cache_control
:
None
,
},
SystemPrompt
::
Blocks
(
blocks
)
=>
{
let
cache_control
=
blocks
.iter
()
.rev
()
.find_map
(|
b
|
b
.cache_control
.clone
());
let
text
=
blocks
.into_iter
()
.map
(|
b
|
b
.text
)
.collect
::
<
Vec
<
_
>>
()
.join
(
"
\n
"
);
SystemContent
{
text
,
cache_control
,
}
}
}))
}
// ---------------------------------------------------------------------------
// Request types
// ---------------------------------------------------------------------------
/// Top-level request body for `POST /v1/messages`.
///
/// Every `Option` field is left out of the serialized JSON when it is `None`
/// (`skip_serializing_if = "Option::is_none"`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicCreateMessageRequest {
    /// The model to use (e.g. "claude-sonnet-4-20250514").
    pub model: String,

    /// The maximum number of tokens to generate.
    pub max_tokens: u32,

    /// The conversation messages.
    pub messages: Vec<AnthropicMessage>,

    /// Optional system prompt. On the wire this is either a string or an array
    /// of `{"type":"text","text":"..."}` blocks; both are normalized into a
    /// [`SystemContent`] by `deserialize_system_prompt`.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,

    /// Sampling temperature (0.0 - 1.0).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// Nucleus sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,

    /// Top-K sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_k: Option<u32>,

    /// Custom stop sequences.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequences: Option<Vec<String>>,

    /// Whether to stream the response. Defaults to `false` when the field is absent.
    #[serde(default)]
    pub stream: bool,

    /// Optional metadata (e.g. user_id), kept as untyped JSON.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,

    /// Tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<AnthropicTool>>,

    /// How the model should choose which tool to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<AnthropicToolChoice>,

    /// Top-level cache control for automatic prompt prefix caching.
    /// When present, the system caches all content up to the last cacheable block.
    /// Matches the Anthropic Messages API automatic caching mode.
    /// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,

    /// Extended thinking configuration. When enabled, the model produces
    /// `thinking` content blocks containing its internal reasoning before
    /// the final response. The `budget_tokens` field controls how many tokens
    /// the model may use for thinking (must be >= 1024 and < max_tokens).
    /// NOTE(review): no validation of that range is visible in this struct.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingConfig>,

    /// Service tier selection: `"auto"` or `"standard_only"`.
    /// Stored as a free-form string, so other values are not rejected here.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub service_tier: Option<String>,

    /// Container identifier for stateful sandbox sessions.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub container: Option<String>,

    /// Output configuration: effort level and optional JSON schema format.
    /// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
    /// `format` specifies structured JSON output constraints.
    /// Kept as untyped JSON, so the shape is not checked at deserialization time.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_config: Option<serde_json::Value>,
}
/// Extended thinking configuration for the request.
///
/// When `type` is `"enabled"`, the model will produce `thinking` content blocks
/// with its internal reasoning. `budget_tokens` controls the maximum tokens
/// available for thinking (minimum 1024, must be less than `max_tokens`).
/// When `type` is `"disabled"`, no thinking blocks are produced.
///
/// NOTE(review): both constraints are documented API expectations; this type
/// stores `type` as a plain string and does not enforce them itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingConfig {
    /// Either `"enabled"` or `"disabled"`. Serialized under the JSON key `type`
    /// (renamed because `type` is a Rust keyword).
    #[serde(rename = "type")]
    pub thinking_type: String,

    /// Maximum tokens for internal reasoning. Only relevant when type is "enabled".
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub budget_tokens: Option<u32>,
}
/// A single message in the conversation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessage {
    /// Who sent the message.
    pub role: AnthropicRole,

    /// The message body. Flattened so that the `content` key carried by each
    /// [`AnthropicMessageContent`] variant sits directly next to `role` in the
    /// JSON object rather than inside a nested object.
    #[serde(flatten)]
    pub content: AnthropicMessageContent,
}
/// The role of a message sender.
///
/// Serialized in lowercase (`"user"`, `"assistant"`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicRole {
    /// A message written by the end user.
    User,
    /// A message produced by the model.
    Assistant,
}
/// Message content — either a plain string or an array of content blocks.
///
/// The enum is untagged: both variants use the same `content` key and are
/// told apart by the JSON type of its value, with `Text` tried first.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicMessageContent {
    /// Plain text content (`"content": "..."`).
    Text { content: String },
    /// Array of structured content blocks (`"content": [...]`).
    Blocks { content: Vec<AnthropicContentBlock> },
}
/// A single content block within a message.
///
/// Serialization is derived and internally tagged on `type`. Deserialization
/// is hand-written (only `Serialize` is derived here) so that block types
/// without a dedicated variant are captured as `Other(Value)` instead of
/// causing a hard deserialization failure. This is important because Claude
/// Code may send block types that we don't yet handle.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type")]
pub enum AnthropicContentBlock {
    /// Text content block. May optionally include `citations` — references to
    /// source documents that support the text content. Citations are generated
    /// by the model when document/PDF content is provided and citation mode is enabled.
    #[serde(rename = "text")]
    Text {
        text: String,
        /// Citation entries, kept as untyped JSON; omitted when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
        /// Per-block cache control; omitted when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Image content block.
    #[serde(rename = "image")]
    Image { source: AnthropicImageSource },

    /// Tool use request from assistant.
    #[serde(rename = "tool_use")]
    ToolUse {
        /// Identifier of this tool call.
        id: String,
        /// Name of the tool being called.
        name: String,
        /// Tool arguments as untyped JSON.
        input: serde_json::Value,
        /// Per-block cache control; omitted when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Tool result from user.
    #[serde(rename = "tool_result")]
    ToolResult {
        /// The `id` of the `tool_use` block this result answers.
        tool_use_id: String,
        /// Result payload (string or block array); omitted when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        content: Option<ToolResultContent>,
        /// Optional error flag for the result; omitted when `None`.
        #[serde(skip_serializing_if = "Option::is_none")]
        is_error: Option<bool>,
        /// Per-block cache control; omitted when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Thinking content block from assistant (extended thinking / reasoning).
    #[serde(rename = "thinking")]
    Thinking {
        /// The reasoning text.
        thinking: String,
        /// Signature string that accompanies the thinking text.
        signature: String,
        /// Per-block cache control; omitted when `None`.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Redacted thinking block from assistant. Contains encrypted reasoning data
    /// that is opaque to the client but must be passed back verbatim in multi-turn
    /// conversations so the model can maintain its chain of thought.
    #[serde(rename = "redacted_thinking")]
    RedactedThinking { data: String },

    /// Server-initiated tool use block. Represents a tool call that the API
    /// executes server-side (e.g., web search). The client receives the result
    /// via a corresponding `web_search_tool_result` or similar block.
    #[serde(rename = "server_tool_use")]
    ServerToolUse {
        /// Identifier of this server-side tool call.
        id: String,
        /// Name of the server-side tool.
        name: String,
        /// Tool arguments as untyped JSON.
        #[serde(default)]
        input: serde_json::Value,
    },

    /// Result from a server-initiated tool (e.g., web search results).
    /// Contains structured content returned by the server-side tool execution.
    #[serde(rename = "web_search_tool_result")]
    WebSearchToolResult {
        /// The `id` of the `server_tool_use` block this result answers.
        tool_use_id: String,
        /// Result payload as untyped JSON.
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Catch-all for unrecognized block types. Preserves the full JSON value
    /// so that new Anthropic features don't break the endpoint and can be
    /// round-tripped or inspected. Marked untagged so the stored value is
    /// serialized as-is, without an added `type` tag.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Payload of a `tool_result` block.
///
/// The Anthropic API accepts either a bare string or an array of content
/// blocks here; the untagged representation lets serde accept both shapes.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContent {
    /// The content was given as a single string.
    Text(String),
    /// The content was given as an array of blocks.
    Blocks(Vec<ToolResultContentBlock>),
}
impl ToolResultContent {
    /// Consume the content and return its textual part.
    ///
    /// A plain string is handed back unchanged. For an array of blocks, the
    /// text of every `Text` block is appended in order with no separator;
    /// `Other` blocks contribute nothing.
    pub fn into_text(self) -> String {
        let blocks = match self {
            ToolResultContent::Text(s) => return s,
            ToolResultContent::Blocks(blocks) => blocks,
        };

        let mut joined = String::new();
        for block in blocks {
            if let ToolResultContentBlock::Text { text } = block {
                joined.push_str(&text);
            }
        }
        joined
    }
}
/// One element of a `tool_result.content` array.
///
/// Untagged: serde tries the variants in declaration order, so an object
/// with a string `text` field becomes `Text` and anything else is kept
/// verbatim as `Other`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContentBlock {
    /// A block carrying text.
    Text { text: String },
    /// Fallback for non-text blocks (images, etc.) inside tool results.
    Other(serde_json::Value),
}
/// Custom deserializer for `AnthropicContentBlock` that handles unknown types
/// gracefully. Since serde's `#[serde(other)]` is not supported on internally
/// tagged enums, we deserialize as `Value` first and dispatch manually.
impl
<
'de
>
Deserialize
<
'de
>
for
AnthropicContentBlock
{
fn
deserialize
<
D
>
(
deserializer
:
D
)
->
Result
<
Self
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
let
value
=
serde_json
::
Value
::
deserialize
(
deserializer
)
?
;
let
block_type
=
value
.get
(
"type"
)
.and_then
(|
t
|
t
.as_str
())
.unwrap_or
(
""
)
.to_string
();
match
block_type
.as_str
()
{
"text"
=>
{
let
text
=
value
.get
(
"text"
)
.and_then
(|
t
|
t
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"text"
))
?
.to_string
();
let
citations
:
Option
<
Vec
<
serde_json
::
Value
>>
=
value
.get
(
"citations"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Text
{
text
,
citations
,
cache_control
,
})
}
"image"
=>
{
let
source
:
AnthropicImageSource
=
serde_json
::
from_value
(
value
.get
(
"source"
)
.cloned
()
.unwrap_or_default
())
.map_err
(
serde
::
de
::
Error
::
custom
)
?
;
Ok
(
AnthropicContentBlock
::
Image
{
source
})
}
"tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolUse
{
id
,
name
,
input
,
cache_control
,
})
}
"tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
:
Option
<
ToolResultContent
>
=
value
.get
(
"content"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
is_error
=
value
.get
(
"is_error"
)
.and_then
(|
v
|
v
.as_bool
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolResult
{
tool_use_id
,
content
,
is_error
,
cache_control
,
})
}
"thinking"
=>
{
let
thinking
=
value
.get
(
"thinking"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"thinking"
))
?
.to_string
();
let
signature
=
value
.get
(
"signature"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"signature"
))
?
.to_string
();
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Thinking
{
thinking
,
signature
,
cache_control
,
})
}
"redacted_thinking"
=>
{
let
data
=
value
.get
(
"data"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"data"
))
?
.to_string
();
Ok
(
AnthropicContentBlock
::
RedactedThinking
{
data
})
}
"server_tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
Ok
(
AnthropicContentBlock
::
ServerToolUse
{
id
,
name
,
input
})
}
"web_search_tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
=
value
.get
(
"content"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
([]));
Ok
(
AnthropicContentBlock
::
WebSearchToolResult
{
tool_use_id
,
content
,
})
}
other
=>
{
tracing
::
debug!
(
"Unrecognized Anthropic content block type '{}', preserving as Other"
,
other
);
Ok
(
AnthropicContentBlock
::
Other
(
value
))
}
}
}
}
/// Where the bytes of an image content block come from.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicImageSource {
    /// Serialized under the JSON key `type`.
    #[serde(rename = "type")]
    pub source_type: String,
    pub media_type: String,
    pub data: String,
}
/// Definition of a tool offered to the model.
///
/// Two families share this shape. Client ("custom") tools need `name` and
/// `input_schema`. Server tools (web_search, bash, text_editor,
/// code_execution, ...) are identified by their `type` value, for example
/// `"web_search_20260209"`, and may have no `input_schema`. Everything except
/// `name` is therefore optional, so both families deserialize and can be
/// passed through to the backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicTool {
    /// Name of the tool; required for client tools and also present on
    /// server tools.
    pub name: String,

    /// Discriminator stored under the JSON key `type`. Client tools use
    /// `"custom"` or leave it out; server tools use versioned identifiers
    /// such as `"web_search_20260209"`.
    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
    pub tool_type: Option<String>,

    /// Optional description; omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// JSON Schema describing the tool input. Required for client tools;
    /// absent on server tools, whose input shape is defined server-side.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_schema: Option<serde_json::Value>,

    /// Cache-control breakpoint attached to this tool definition.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// How the model is allowed (or forced) to pick tools.
///
/// Untagged: serde tries the variants in declaration order.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicToolChoice {
    /// A specific tool: `{type: "tool", name: "..."}`.
    ///
    /// Declared ahead of `Simple` on purpose so that serde attempts the
    /// stricter shape (the one that requires `name`) first.
    Named(AnthropicToolChoiceNamed),
    /// A bare mode: "auto", "any", or "none".
    Simple(AnthropicToolChoiceSimple),
}
/// Tool choice expressed as a mode only, without naming a tool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceSimple {
    /// The selected mode; stored under the JSON key `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,

    /// If true, the model calls tools one at a time rather than possibly
    /// emitting several tool calls in one response.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
/// Value of the `type` field in a tool-choice object.
///
/// Variants are (de)serialized in lowercase: `auto`, `any`, `none`, `tool`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicToolChoiceMode {
    Auto,
    Any,
    None,
    Tool,
}
/// Tool choice that names one tool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceNamed {
    /// The selected mode; stored under the JSON key `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,

    /// Name of the tool being selected.
    pub name: String,

    /// If true, the model calls tools one at a time rather than possibly
    /// emitting several tool calls in one response.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
// ---------------------------------------------------------------------------
// Response types
// ---------------------------------------------------------------------------
/// Body returned by a non-streaming `POST /v1/messages` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageResponse {
    pub id: String,
    /// Stored under the JSON key `type`.
    #[serde(rename = "type")]
    pub object_type: String,
    pub role: String,
    /// Content blocks that make up the reply.
    pub content: Vec<AnthropicResponseContentBlock>,
    pub model: String,
    pub stop_reason: Option<AnthropicStopReason>,
    pub stop_sequence: Option<String>,
    pub usage: AnthropicUsage,
}
/// One content block of a response message.
///
/// The Anthropic API can return up to 12 block types. The common ones are
/// modelled as dedicated variants, tagged by the JSON `type` field; every
/// other type lands in `Other`, so the proxy can forward it without losing
/// data.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicResponseContentBlock {
    /// `"type": "thinking"`.
    #[serde(rename = "thinking")]
    Thinking { thinking: String, signature: String },

    /// `"type": "text"`; `citations` is left out of the output when `None`.
    #[serde(rename = "text")]
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
    },

    /// `"type": "tool_use"`.
    #[serde(rename = "tool_use")]
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
    },

    /// `"type": "redacted_thinking"`.
    #[serde(rename = "redacted_thinking")]
    RedactedThinking { data: String },

    /// `"type": "server_tool_use"`; `input` falls back to its default when
    /// missing.
    #[serde(rename = "server_tool_use")]
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },

    /// `"type": "web_search_tool_result"`; `content` falls back to its
    /// default when missing.
    #[serde(rename = "web_search_tool_result")]
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Fallback for new or uncommon block types (web_fetch_tool_result,
    /// code_execution_tool_result, container_upload, etc.), kept as raw JSON
    /// so the proxy can serialize them back without data loss.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Token accounting for a request/response pair.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AnthropicUsage {
    pub input_tokens: u32,
    pub output_tokens: u32,

    /// Input tokens spent creating a new cache entry.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_creation_input_tokens: Option<u32>,

    /// Input tokens served from the prompt cache (prefix cache hits).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_read_input_tokens: Option<u32>,
}
/// Why generation ended. Variants are (de)serialized in `snake_case`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum AnthropicStopReason {
    EndTurn,
    MaxTokens,
    StopSequence,
    ToolUse,
    /// The model paused to hand control back inside an agentic loop and
    /// intends to continue on a later turn. Used with extended thinking /
    /// tool use.
    PauseTurn,
    /// The model declined to produce content (safety refusal).
    Refusal,
}
// ---------------------------------------------------------------------------
// Streaming types
// ---------------------------------------------------------------------------
/// SSE event types for the Anthropic streaming API.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
#[serde(tag
=
"type"
)]
pub
enum
AnthropicStreamEvent
{
#[serde(rename
=
"message_start"
)]
MessageStart
{
message
:
AnthropicMessageResponse
},
#[serde(rename
=
"content_block_start"
)]
ContentBlockStart
{
index
:
u32
,
content_block
:
AnthropicResponseContentBlock
,
},
#[serde(rename
=
"content_block_delta"
)]
ContentBlockDelta
{
index
:
u32
,
delta
:
AnthropicDelta
},
#[serde(rename
=
"content_block_stop"
)]
ContentBlockStop
{
index
:
u32
},
#[serde(rename
=
"message_delta"
)]
MessageDelta
{
delta
:
AnthropicMessageDeltaBody
,
usage
:
AnthropicUsage
,
},
#[serde(rename
=
"message_stop"
)]
MessageStop
{},
#[serde(rename
=
"ping"
)]
Ping
{},
#[serde(rename
=
"error"
)]
Error
{
error
:
AnthropicErrorBody
},
}
/// Incremental payload carried by a streaming `content_block_delta` event,
/// tagged by the JSON `type` field.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum AnthropicDelta {
    /// `"type": "thinking_delta"`.
    #[serde(rename = "thinking_delta")]
    ThinkingDelta { thinking: String },

    /// `"type": "text_delta"`.
    #[serde(rename = "text_delta")]
    TextDelta { text: String },

    /// `"type": "input_json_delta"`.
    #[serde(rename = "input_json_delta")]
    InputJsonDelta { partial_json: String },

    /// Signature fragment for a thinking block (sent at the end).
    #[serde(rename = "signature_delta")]
    SignatureDelta { signature: String },

    /// A citation being attached to a text block.
    #[serde(rename = "citations_delta")]
    CitationsDelta { citation: serde_json::Value },
}
/// `delta` object of a `message_delta` event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageDeltaBody {
    pub stop_reason: Option<AnthropicStopReason>,
    /// Omitted from the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequence: Option<String>,
}
// ---------------------------------------------------------------------------
// Error types
// ---------------------------------------------------------------------------
/// Anthropic API error response wrapper.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
pub
struct
AnthropicErrorResponse
{
#[serde(rename
=
"type"
)]
pub
object_type
:
String
,
pub
error
:
AnthropicErrorBody
,
}
/// Details nested inside an error response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorBody {
    /// Error category; stored under the JSON key `type`.
    #[serde(rename = "type")]
    pub error_type: String,
    /// Error message text.
    pub message: String,
}
impl
AnthropicErrorResponse
{
/// Create an `invalid_request_error` response.
pub
fn
invalid_request
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"invalid_request_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create an `api_error` (internal server error) response.
pub
fn
api_error
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"api_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create a `not_found_error` response.
pub
fn
not_found
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"not_found_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
}
// ---------------------------------------------------------------------------
// Conversion: AnthropicCreateMessageRequest -> NvCreateChatCompletionRequest
// ---------------------------------------------------------------------------
use
crate
::
protocols
::
openai
::
nvext
::
NvExt
;
impl
TryFrom
<
AnthropicCreateMessageRequest
>
for
NvCreateChatCompletionRequest
{
type
Error
=
anyhow
::
Error
;
...
...
@@ -1199,11 +453,6 @@ fn convert_anthropic_tool_choice(tc: &AnthropicToolChoice) -> ChatCompletionTool
}
}
}
// ---------------------------------------------------------------------------
// Conversion: NvCreateChatCompletionResponse -> AnthropicMessageResponse
// ---------------------------------------------------------------------------
/// Convert a completed chat completion response into an Anthropic Messages response.
pub
fn
chat_completion_to_anthropic_response
(
chat_resp
:
NvCreateChatCompletionResponse
,
...
...
@@ -1211,7 +460,7 @@ pub fn chat_completion_to_anthropic_response(
)
->
AnthropicMessageResponse
{
let
msg_id
=
format!
(
"msg_{}"
,
Uuid
::
new_v4
()
.simple
());
let
choice
=
chat_resp
.choices
.into_iter
()
.next
();
let
choice
=
chat_resp
.
inner.
choices
.into_iter
()
.next
();
let
mut
content
=
Vec
::
new
();
let
mut
stop_reason
=
None
;
...
...
@@ -1282,6 +531,7 @@ pub fn chat_completion_to_anthropic_response(
// Map usage
let
usage
=
chat_resp
.inner
.usage
.map
(|
u
|
{
let
cache_read_input_tokens
=
u
...
...
@@ -1308,111 +558,6 @@ pub fn chat_completion_to_anthropic_response(
usage
,
}
}
// ---------------------------------------------------------------------------
// Count tokens
// ---------------------------------------------------------------------------
/// Body accepted by `POST /v1/messages/count_tokens`.
#[derive(Debug, Clone, Deserialize)]
pub struct AnthropicCountTokensRequest {
    pub model: String,
    pub messages: Vec<AnthropicMessage>,

    /// Optional system prompt, parsed by `deserialize_system_prompt`.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,

    /// Optional tool definitions; `None` when the field is absent.
    #[serde(default)]
    pub tools: Option<Vec<AnthropicTool>>,
}
/// Body returned by `POST /v1/messages/count_tokens`.
#[derive(Debug, Clone, Serialize)]
pub struct AnthropicCountTokensResponse {
    /// Number of input tokens reported for the request.
    pub input_tokens: u32,
}
impl AnthropicCountTokensRequest {
    /// Estimate input token count using a `len/3` heuristic.
    ///
    /// The byte lengths of the following parts are summed and divided by
    /// three (integer division):
    /// - the system prompt text, if any;
    /// - per message, a fixed allowance for the role (4 for user, 9 for
    ///   assistant) plus the content length (string length, or the sum of
    ///   `estimate_block_len` over its blocks);
    /// - per tool, its name, its description (if any) and the serialized
    ///   JSON of its input schema (if any).
    ///
    /// # Returns
    /// The estimate. A non-empty request never reports zero tokens, and a
    /// total that does not fit in `u32` saturates at `u32::MAX` rather than
    /// being silently truncated.
    pub fn estimate_tokens(&self) -> u32 {
        let mut total_len: usize = 0;

        if let Some(system) = &self.system {
            total_len += system.text.len();
        }

        for msg in &self.messages {
            // Fixed allowance for the role.
            total_len += match msg.role {
                AnthropicRole::User => 4,
                AnthropicRole::Assistant => 9,
            };

            // Message content.
            match &msg.content {
                AnthropicMessageContent::Text { content } => total_len += content.len(),
                AnthropicMessageContent::Blocks { content } => {
                    for block in content {
                        total_len += estimate_block_len(block);
                    }
                }
            }
        }

        if let Some(tools) = &self.tools {
            for tool in tools {
                total_len += tool.name.len();
                if let Some(desc) = &tool.description {
                    total_len += desc.len();
                }
                if let Some(schema) = &tool.input_schema {
                    total_len += schema.to_string().len();
                }
            }
        }

        let tokens = total_len / 3;
        if tokens == 0 && total_len > 0 {
            // One or two bytes of input still count as a token.
            1
        } else {
            // `usize` can exceed `u32`; clamp instead of wrapping with `as`.
            u32::try_from(tokens).unwrap_or(u32::MAX)
        }
    }
}
fn
estimate_block_len
(
block
:
&
AnthropicContentBlock
)
->
usize
{
match
block
{
AnthropicContentBlock
::
Text
{
text
,
..
}
=>
text
.len
(),
AnthropicContentBlock
::
ToolUse
{
name
,
input
,
..
}
=>
name
.len
()
+
input
.to_string
()
.len
(),
AnthropicContentBlock
::
ToolResult
{
content
,
..
}
=>
content
.as_ref
()
.map
(|
c
|
match
c
{
ToolResultContent
::
Text
(
s
)
=>
s
.len
(),
ToolResultContent
::
Blocks
(
blocks
)
=>
blocks
.iter
()
.map
(|
b
|
match
b
{
ToolResultContentBlock
::
Text
{
text
}
=>
text
.len
(),
ToolResultContentBlock
::
Other
(
v
)
=>
v
.to_string
()
.len
(),
})
.sum
(),
})
.unwrap_or
(
0
),
AnthropicContentBlock
::
Thinking
{
thinking
,
..
}
=>
thinking
.len
(),
AnthropicContentBlock
::
RedactedThinking
{
data
,
..
}
=>
data
.len
(),
AnthropicContentBlock
::
ServerToolUse
{
name
,
input
,
..
}
=>
{
name
.len
()
+
input
.to_string
()
.len
()
}
AnthropicContentBlock
::
WebSearchToolResult
{
content
,
..
}
=>
content
.to_string
()
.len
(),
AnthropicContentBlock
::
Image
{
..
}
=>
256
,
// rough estimate for image metadata
AnthropicContentBlock
::
Other
(
v
)
=>
v
.to_string
()
.len
(),
}
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod
tests
{
use
super
::
*
;
...
...
@@ -1656,38 +801,40 @@ mod tests {
#[test]
fn
test_chat_completion_to_anthropic_response
()
{
let
chat_resp
=
NvCreateChatCompletionResponse
{
id
:
"chatcmpl-xyz"
.into
(),
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoice
{
index
:
0
,
message
:
dynamo_async_openai
::
types
::
ChatCompletionResponseMessage
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"Hello!"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
"chatcmpl-xyz"
.into
(),
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoice
{
index
:
0
,
message
:
dynamo_async_openai
::
types
::
ChatCompletionResponseMessage
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"Hello!"
.to_string
(),
),
),
)
,
refusal
:
None
,
tool_calls
:
None
,
role
:
dynamo_async_openai
::
types
::
Role
::
Assistant
,
function_call
:
None
,
audio
:
None
,
reasoning_content
:
None
,
}
,
finish
_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
)
,
stop_reason
:
None
,
logprobs
:
None
,
}]
,
created
:
1726000000
,
model
:
"test-model"
.into
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion"
.to_string
(),
usage
:
Some
(
dynamo_async_openai
::
types
::
CompletionUsage
{
prompt
_tokens
:
10
,
completion
_tokens
:
5
,
total_tokens
:
15
,
prompt
_tokens_details
:
None
,
completion_tokens_details
:
None
,
}
)
,
refusal
:
None
,
tool_calls
:
None
,
role
:
dynamo_async_openai
::
types
::
Role
::
Assistant
,
function_call
:
None
,
audio
:
None
,
reasoning_content
:
None
,
}
,
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
)
,
stop
_reason
:
None
,
logprobs
:
None
,
}]
,
created
:
1726000000
,
model
:
"test-model"
.into
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion"
.to_string
()
,
usage
:
Some
(
dynamo_async_openai
::
types
::
CompletionUsage
{
prompt_tokens
:
10
,
completion
_tokens
:
5
,
total
_tokens
:
1
5
,
prompt_tokens_details
:
None
,
completion
_tokens_details
:
None
,
})
,
},
nvext
:
None
,
};
...
...
lib/llm/src/protocols/openai/chat_completions.rs
View file @
2887cd1c
...
...
@@ -64,21 +64,24 @@ pub struct NvCreateChatCompletionRequest {
}
/// A response structure for unary chat completion responses, embedding OpenAI's
/// `CreateChatCompletionResponse`.
///
/// # Fields
/// - `inner`: The base OpenAI unary chat completion response, embedded
/// using `serde(flatten)`.
pub
type
NvCreateChatCompletionResponse
=
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
;
/// Unary chat completion response: the OpenAI `CreateChatCompletionResponse`
/// plus optional NVIDIA extension metadata.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct NvCreateChatCompletionResponse {
    /// The base OpenAI response; its fields are flattened into this
    /// object's JSON.
    #[serde(flatten)]
    pub inner: dynamo_async_openai::types::CreateChatCompletionResponse,

    /// Extension payload; omitted from the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub nvext: Option<serde_json::Value>,
}
/// A response structure for streamed chat completions, embedding OpenAI's
/// `CreateChatCompletionStreamResponse`.
///
/// # Fields
/// - `inner`: The base OpenAI streaming chat completion response, embedded
/// using `serde(flatten)`.
pub
type
NvCreateChatCompletionStreamResponse
=
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
;
/// Streamed chat completion chunk: the OpenAI
/// `CreateChatCompletionStreamResponse` plus optional NVIDIA extension
/// metadata.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct NvCreateChatCompletionStreamResponse {
    /// The base OpenAI streaming response; its fields are flattened into
    /// this object's JSON.
    #[serde(flatten)]
    pub inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse,

    /// Extension payload; omitted from the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub nvext: Option<serde_json::Value>,
}
/// Implements `NvExtProvider` for `NvCreateChatCompletionRequest`,
/// providing access to NVIDIA-specific extensions.
...
...
lib/llm/src/protocols/openai/chat_completions/aggregator.rs
View file @
2887cd1c
...
...
@@ -136,16 +136,16 @@ impl DeltaAggregator {
if
aggregator
.error
.is_none
()
&&
let
Some
(
delta
)
=
delta
.data
{
aggregator
.id
=
delta
.id
;
aggregator
.model
=
delta
.model
;
aggregator
.created
=
delta
.created
;
aggregator
.service_tier
=
delta
.service_tier
;
aggregator
.id
=
delta
.
inner.
id
;
aggregator
.model
=
delta
.
inner.
model
;
aggregator
.created
=
delta
.
inner.
created
;
aggregator
.service_tier
=
delta
.
inner.
service_tier
;
// Aggregate usage statistics if available.
if
let
Some
(
usage
)
=
delta
.usage
{
if
let
Some
(
usage
)
=
delta
.
inner.
usage
{
aggregator
.usage
=
Some
(
usage
);
}
if
let
Some
(
system_fingerprint
)
=
delta
.system_fingerprint
{
if
let
Some
(
system_fingerprint
)
=
delta
.
inner.
system_fingerprint
{
aggregator
.system_fingerprint
=
Some
(
system_fingerprint
);
}
...
...
@@ -155,7 +155,7 @@ impl DeltaAggregator {
}
// Aggregate choices incrementally.
for
choice
in
delta
.choices
{
for
choice
in
delta
.
inner.
choices
{
let
state_choice
=
aggregator
.choices
...
...
@@ -267,14 +267,16 @@ impl DeltaAggregator {
// Construct the final response object.
let
response
=
NvCreateChatCompletionResponse
{
id
:
aggregator
.id
,
created
:
aggregator
.created
,
usage
:
aggregator
.usage
,
model
:
aggregator
.model
,
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
aggregator
.system_fingerprint
,
choices
,
service_tier
:
aggregator
.service_tier
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
aggregator
.id
,
created
:
aggregator
.created
,
usage
:
aggregator
.usage
,
model
:
aggregator
.model
,
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
aggregator
.system_fingerprint
,
choices
,
service_tier
:
aggregator
.service_tier
,
},
nvext
:
aggregator
.nvext
,
};
...
...
@@ -360,7 +362,7 @@ pub trait ChatCompletionAggregator {
)
->
Result
<
NvCreateChatCompletionResponse
,
String
>
;
}
impl
ChatCompletionAggregator
for
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
impl
ChatCompletionAggregator
for
Nv
CreateChatCompletionResponse
{
async
fn
from_annotated_stream
(
stream
:
impl
Stream
<
Item
=
Annotated
<
NvCreateChatCompletionStreamResponse
>>
,
parsing_options
:
ParsingOptions
,
...
...
@@ -445,14 +447,16 @@ mod tests {
};
let
data
=
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"meta/llama-3.1-8b-instruct"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
choice
],
object
:
"chat.completion"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"meta/llama-3.1-8b-instruct"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
choice
],
object
:
"chat.completion"
.to_string
(),
},
nvext
:
None
,
};
...
...
@@ -479,13 +483,13 @@ mod tests {
let
response
=
result
.unwrap
();
// Verify that the response is empty and has default values
assert_eq!
(
response
.id
,
""
);
assert_eq!
(
response
.model
,
""
);
assert_eq!
(
response
.created
,
0
);
assert
!
(
response
.usage
.is_none
());
assert
!
(
response
.system_fingerprint
.is_none
());
assert_eq!
(
response
.choices
.len
(),
0
);
assert
!
(
response
.service_tier
.is_none
());
assert_eq!
(
response
.
inner.
id
,
""
);
assert_eq!
(
response
.
inner.
model
,
""
);
assert_eq!
(
response
.
inner.
created
,
0
);
assert
!
(
response
.
inner.
usage
.is_none
());
assert
!
(
response
.
inner.
system_fingerprint
.is_none
());
assert_eq!
(
response
.
inner.
choices
.len
(),
0
);
assert
!
(
response
.
inner.
service_tier
.is_none
());
}
#[tokio::test]
...
...
@@ -511,13 +515,13 @@ mod tests {
let
response
=
result
.unwrap
();
// Verify the response fields
assert_eq!
(
response
.id
,
"test_id"
);
assert_eq!
(
response
.model
,
"meta/llama-3.1-8b-instruct"
);
assert_eq!
(
response
.created
,
1234567890
);
assert
!
(
response
.usage
.is_none
());
assert
!
(
response
.system_fingerprint
.is_none
());
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
id
,
"test_id"
);
assert_eq!
(
response
.
inner.
model
,
"meta/llama-3.1-8b-instruct"
);
assert_eq!
(
response
.
inner.
created
,
1234567890
);
assert
!
(
response
.
inner.
usage
.is_none
());
assert
!
(
response
.
inner.
system_fingerprint
.is_none
());
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice
.index
,
0
);
assert_eq!
(
choice
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -525,7 +529,7 @@ mod tests {
);
assert
!
(
choice
.finish_reason
.is_none
());
assert_eq!
(
choice
.message.role
,
dynamo_async_openai
::
types
::
Role
::
User
);
assert
!
(
response
.service_tier
.is_none
());
assert
!
(
response
.
inner.
service_tier
.is_none
());
}
#[tokio::test]
...
...
@@ -562,8 +566,8 @@ mod tests {
let
response
=
result
.unwrap
();
// Verify the response fields
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice
.index
,
0
);
assert_eq!
(
choice
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -630,8 +634,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice
.index
,
0
);
assert_eq!
(
...
...
@@ -653,43 +657,49 @@ mod tests {
// Create a delta with multiple choices
// ALLOW: function_call is deprecated
let
data
=
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"test_model"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
0
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 0"
.to_string
())),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"test_model"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
0
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 0"
.to_string
(),
)),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
1
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 1"
.to_string
())),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
1
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 1"
.to_string
(),
)),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
],
object
:
"chat.completion"
.to_string
(),
],
object
:
"chat.completion"
.to_string
(),
},
nvext
:
None
,
};
...
...
@@ -711,9 +721,9 @@ mod tests {
let
mut
response
=
result
.unwrap
();
// Verify the response fields
assert_eq!
(
response
.choices
.len
(),
2
);
response
.choices
.sort_by
(|
a
,
b
|
a
.index
.cmp
(
&
b
.index
));
// Ensure the choices are ordered
let
choice0
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
2
);
response
.
inner.
choices
.sort_by
(|
a
,
b
|
a
.index
.cmp
(
&
b
.index
));
// Ensure the choices are ordered
let
choice0
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice0
.index
,
0
);
assert_eq!
(
choice0
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -728,7 +738,7 @@ mod tests {
dynamo_async_openai
::
types
::
Role
::
Assistant
);
let
choice1
=
&
response
.choices
[
1
];
let
choice1
=
&
response
.
inner.
choices
[
1
];
assert_eq!
(
choice1
.index
,
1
);
assert_eq!
(
choice1
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -773,8 +783,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls are present
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -816,8 +826,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls are present
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -859,8 +869,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls are present
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -900,8 +910,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify no tool calls are present
assert
!
(
choice
.message.tool_calls
.is_none
());
...
...
@@ -928,7 +938,7 @@ mod tests {
// Manually set empty tool calls array
if
let
Some
(
ref
mut
data
)
=
annotated_delta
.data
{
data
.choices
[
0
]
.delta.tool_calls
=
Some
(
vec!
[]);
// Empty tool calls array
data
.
inner.
choices
[
0
]
.delta.tool_calls
=
Some
(
vec!
[]);
// Empty tool calls array
}
let
data
=
annotated_delta
.data
.unwrap
();
...
...
@@ -945,8 +955,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls array is empty
assert
!
(
choice
.message.tool_calls
.is_none
());
...
...
@@ -992,8 +1002,8 @@ mod tests {
let
response
=
result
.unwrap
();
// There should be one choice
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// The tool_calls field should be present and parsed
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -1050,8 +1060,8 @@ mod tests {
let
response
=
result
.unwrap
();
// There should be one choice
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// The finish_reason should be ToolCalls, not Stop, because tool calls are present
assert_eq!
(
...
...
lib/llm/src/protocols/openai/chat_completions/delta.rs
View file @
2887cd1c
...
...
@@ -278,19 +278,21 @@ impl DeltaGenerator {
// According to OpenAI spec: when stream_options.include_usage is true,
// all intermediate chunks should have usage: null
// The final usage chunk will be sent separately with empty choices
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
,
usage
:
if
self
.options.enable_usage
&&
self
.options.continuous_usage_stats
{
Some
(
self
.get_usage
())
}
else
{
None
NvCreateChatCompletionStreamResponse
{
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
,
usage
:
if
self
.options.enable_usage
&&
self
.options.continuous_usage_stats
{
Some
(
self
.get_usage
())
}
else
{
None
},
service_tier
:
self
.service_tier
.clone
(),
},
service_tier
:
self
.service_tier
.clone
(),
nvext
:
None
,
// Will be populated by router layer if needed
}
}
...
...
@@ -303,15 +305,17 @@ impl DeltaGenerator {
pub
fn
create_usage_chunk
(
&
self
)
->
NvCreateChatCompletionStreamResponse
{
let
usage
=
self
.get_usage
();
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
:
vec!
[],
// Empty choices for usage-only chunk
usage
:
Some
(
usage
),
service_tier
:
self
.service_tier
.clone
(),
NvCreateChatCompletionStreamResponse
{
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
:
vec!
[],
// Empty choices for usage-only chunk
usage
:
Some
(
usage
),
service_tier
:
self
.service_tier
.clone
(),
},
nvext
:
None
,
}
}
...
...
lib/llm/src/protocols/openai/chat_completions/jail.rs
View file @
2887cd1c
...
...
@@ -525,13 +525,13 @@ impl JailedStream {
// Process each item in the stream
while
let
Some
(
response
)
=
stream
.next
()
.await
{
if
let
Some
(
chat_response
)
=
response
.data
.as_ref
()
{
last_stream_id
.clone_from
(
&
chat_response
.id
);
last_stream_model
.clone_from
(
&
chat_response
.model
);
last_stream_created
=
chat_response
.created
;
last_stream_id
.clone_from
(
&
chat_response
.
inner.
id
);
last_stream_model
.clone_from
(
&
chat_response
.
inner.
model
);
last_stream_created
=
chat_response
.
inner.
created
;
let
mut
all_emissions
=
Vec
::
new
();
if
chat_response
.choices
.is_empty
()
{
if
chat_response
.
inner.
choices
.is_empty
()
{
// No choices processed (e.g., usage-only chunk)
// Pass through as-is to preserve usage and other metadata
yield
response
;
...
...
@@ -539,7 +539,7 @@ impl JailedStream {
}
// Process each choice independently using the new architecture
for
choice
in
&
chat_response
.choices
{
for
choice
in
&
chat_response
.
inner.
choices
{
if
let
Some
(
ref
content
)
=
choice
.delta.content
{
// Jailing only applies to text content
let
text_content
=
match
content
{
...
...
@@ -676,14 +676,16 @@ impl JailedStream {
tracing
::
debug!
(
"Stream ended while jailed, releasing accumulated content"
);
// Create a finalization response carrying forward real stream metadata
let
dummy_response
=
NvCreateChatCompletionStreamResponse
{
id
:
last_stream_id
,
object
:
"chat.completion.chunk"
.to_string
(),
created
:
last_stream_created
,
model
:
last_stream_model
,
choices
:
Vec
::
new
(),
usage
:
None
,
service_tier
:
None
,
system_fingerprint
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
last_stream_id
,
object
:
"chat.completion.chunk"
.to_string
(),
created
:
last_stream_created
,
model
:
last_stream_model
,
choices
:
Vec
::
new
(),
usage
:
None
,
service_tier
:
None
,
system_fingerprint
:
None
,
},
nvext
:
None
,
};
...
...
@@ -713,7 +715,7 @@ impl JailedStream {
EmissionMode
::
Packed
=>
{
// Pack all choices into a single response
let
mut
response
=
base_response
.clone
();
response
.choices
=
emissions
.into_iter
()
.map
(|
e
|
e
.into_choice
())
.collect
();
response
.
inner.
choices
=
emissions
.into_iter
()
.map
(|
e
|
e
.into_choice
())
.collect
();
vec!
[
Annotated
{
data
:
Some
(
response
),
...
...
@@ -729,7 +731,7 @@ impl JailedStream {
.into_iter
()
.map
(|
emission
|
{
let
mut
response
=
base_response
.clone
();
response
.choices
=
vec!
[
emission
.into_choice
()];
response
.
inner.
choices
=
vec!
[
emission
.into_choice
()];
Annotated
{
data
:
Some
(
response
),
...
...
@@ -1013,7 +1015,7 @@ impl JailedStream {
while
let
Some
(
mut
response
)
=
input_stream
.next
()
.await
{
// Track if any choice emitted tool calls
if
let
Some
(
ref
data
)
=
response
.data
{
for
choice
in
&
data
.choices
{
for
choice
in
&
data
.
inner.
choices
{
if
choice
.delta.tool_calls
.is_some
()
{
has_tool_calls_per_choice
.insert
(
choice
.index
,
true
);
}
...
...
@@ -1022,7 +1024,7 @@ impl JailedStream {
// Fix finish_reason based on jail mode and whether tool calls were emitted
if
let
Some
(
ref
mut
data
)
=
response
.data
{
for
choice
in
&
mut
data
.choices
{
for
choice
in
&
mut
data
.
inner.
choices
{
if
let
Some
(
finish
)
=
choice
.finish_reason
{
// Only modify Stop finish reason, preserve Length/ContentFilter
if
finish
==
FinishReason
::
Stop
{
...
...
lib/llm/src/protocols/openai/completions.rs
View file @
2887cd1c
...
...
@@ -48,6 +48,8 @@ pub struct NvCreateCompletionRequest {
pub
struct
NvCreateCompletionResponse
{
#[serde(flatten)]
pub
inner
:
dynamo_async_openai
::
types
::
CreateCompletionResponse
,
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
impl
ContentProvider
for
dynamo_async_openai
::
types
::
Choice
{
...
...
@@ -296,9 +298,8 @@ impl ResponseFactory {
choices
:
vec!
[
choice
],
system_fingerprint
:
self
.system_fingerprint
.clone
(),
usage
,
nvext
:
None
,
// Will be populated by router layer if needed
};
NvCreateCompletionResponse
{
inner
}
NvCreateCompletionResponse
{
inner
,
nvext
:
None
}
}
}
...
...
lib/llm/src/protocols/openai/completions/aggregator.rs
View file @
2887cd1c
...
...
@@ -86,8 +86,8 @@ impl DeltaAggregator {
aggregator
.system_fingerprint
=
Some
(
system_fingerprint
);
}
// Aggregate nvext field (take the last non-None value)
if
delta
.
inner.
nvext
.is_some
()
{
aggregator
.nvext
=
delta
.
inner.
nvext
;
if
delta
.nvext
.is_some
()
{
aggregator
.nvext
=
delta
.nvext
;
}
// handle the choices
...
...
@@ -168,10 +168,12 @@ impl DeltaAggregator {
object
:
"text_completion"
.to_string
(),
system_fingerprint
:
aggregator
.system_fingerprint
,
choices
,
nvext
:
aggregator
.nvext
,
};
let
response
=
NvCreateCompletionResponse
{
inner
};
let
response
=
NvCreateCompletionResponse
{
inner
,
nvext
:
aggregator
.nvext
,
};
Ok
(
response
)
}
...
...
@@ -256,10 +258,9 @@ mod tests {
logprobs
,
}],
object
:
"text_completion"
.to_string
(),
nvext
:
None
,
};
let
response
=
NvCreateCompletionResponse
{
inner
};
let
response
=
NvCreateCompletionResponse
{
inner
,
nvext
:
None
};
Annotated
{
data
:
Some
(
response
),
...
...
@@ -387,10 +388,9 @@ mod tests {
},
],
object
:
"text_completion"
.to_string
(),
nvext
:
None
,
};
let
response
=
NvCreateCompletionResponse
{
inner
};
let
response
=
NvCreateCompletionResponse
{
inner
,
nvext
:
None
};
let
annotated_delta
=
Annotated
{
data
:
Some
(
response
),
...
...
lib/llm/src/protocols/openai/completions/delta.rs
View file @
2887cd1c
...
...
@@ -218,10 +218,9 @@ impl DeltaGenerator {
}
else
{
None
},
nvext
:
None
,
// Will be populated by router layer if needed
};
NvCreateCompletionResponse
{
inner
}
NvCreateCompletionResponse
{
inner
,
nvext
:
None
}
}
/// Creates a final usage-only chunk for OpenAI compliance.
...
...
@@ -240,10 +239,9 @@ impl DeltaGenerator {
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
:
vec!
[],
// Empty choices for usage-only chunk
usage
:
Some
(
usage
),
nvext
:
None
,
// Will be populated by router layer if needed
};
NvCreateCompletionResponse
{
inner
}
NvCreateCompletionResponse
{
inner
,
nvext
:
None
}
}
/// Check if usage tracking is enabled
...
...
@@ -343,7 +341,7 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
};
if
let
Ok
(
nvext_json
)
=
serde_json
::
to_value
(
&
nvext_response
)
{
response
.
inner.
nvext
=
Some
(
nvext_json
);
response
.nvext
=
Some
(
nvext_json
);
if
let
Some
(
ref
info
)
=
worker_id_info
{
tracing
::
debug!
(
"Injected worker_id into completions nvext: prefill={:?}, decode={:?}"
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment