Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2887cd1c
"vscode:/vscode.git/clone" did not exist on "b889948c3869f0f4bdc8901dc0c5113380f9db2a"
Unverified
Commit
2887cd1c
authored
Mar 30, 2026
by
ishandhanani
Committed by
GitHub
Mar 30, 2026
Browse files
refactor(1/3): move `nvext` to `dynamo-llm` and move `anthropic` to `dynamo-async-openai` (#7564)
parent
d6136f4a
Changes
32
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1379 additions
and
1322 deletions
+1379
-1322
lib/async-openai/src/types/anthropic.rs
lib/async-openai/src/types/anthropic.rs
+869
-0
lib/async-openai/src/types/chat.rs
lib/async-openai/src/types/chat.rs
+0
-8
lib/async-openai/src/types/completion.rs
lib/async-openai/src/types/completion.rs
+0
-4
lib/async-openai/src/types/mod.rs
lib/async-openai/src/types/mod.rs
+1
-0
lib/llm/src/audit/stream.rs
lib/llm/src/audit/stream.rs
+95
-81
lib/llm/src/entrypoint/input/batch.rs
lib/llm/src/entrypoint/input/batch.rs
+3
-2
lib/llm/src/entrypoint/input/text.rs
lib/llm/src/entrypoint/input/text.rs
+3
-2
lib/llm/src/http/service/openai.rs
lib/llm/src/http/service/openai.rs
+24
-20
lib/llm/src/perf/logprobs.rs
lib/llm/src/perf/logprobs.rs
+76
-68
lib/llm/src/preprocessor.rs
lib/llm/src/preprocessor.rs
+1
-1
lib/llm/src/preprocessor/speculative_prefill.rs
lib/llm/src/preprocessor/speculative_prefill.rs
+1
-1
lib/llm/src/protocols/anthropic/stream_converter.rs
lib/llm/src/protocols/anthropic/stream_converter.rs
+81
-75
lib/llm/src/protocols/anthropic/types.rs
lib/llm/src/protocols/anthropic/types.rs
+44
-897
lib/llm/src/protocols/openai/chat_completions.rs
lib/llm/src/protocols/openai/chat_completions.rs
+16
-13
lib/llm/src/protocols/openai/chat_completions/aggregator.rs
lib/llm/src/protocols/openai/chat_completions/aggregator.rs
+107
-97
lib/llm/src/protocols/openai/chat_completions/delta.rs
lib/llm/src/protocols/openai/chat_completions/delta.rs
+25
-21
lib/llm/src/protocols/openai/chat_completions/jail.rs
lib/llm/src/protocols/openai/chat_completions/jail.rs
+19
-17
lib/llm/src/protocols/openai/completions.rs
lib/llm/src/protocols/openai/completions.rs
+3
-2
lib/llm/src/protocols/openai/completions/aggregator.rs
lib/llm/src/protocols/openai/completions/aggregator.rs
+8
-8
lib/llm/src/protocols/openai/completions/delta.rs
lib/llm/src/protocols/openai/completions/delta.rs
+3
-5
No files found.
lib/async-openai/src/types/anthropic.rs
0 → 100644
View file @
2887cd1c
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Anthropic Messages API types.
//!
//! Pure protocol types for the `/v1/messages` endpoint -- request, response,
//! streaming events, error shapes, and count-tokens types.
use
serde
::{
Deserialize
,
Serialize
};
use
utoipa
::
ToSchema
;
/// Anthropic-style cache control hint for prefix pinning with TTL.
// This type derives `ToSchema`, so explanatory notes here use plain `//`
// comments and the `///` text above is left untouched.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
pub struct CacheControl {
    // Carried on the wire under the JSON key `type`.
    #[serde(rename = "type")]
    pub control_type: CacheControlType,
    /// TTL as seconds (integer) or shorthand ("5m" = 300s, "1h" = 3600s). Clamped to [300, 3600].
    // Stored as the raw string; `ttl_seconds()` parses and clamps it.
    // A missing key deserializes to `None`, and `None` is omitted on output.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ttl: Option<String>,
}
// Kind of cache entry named by `CacheControl::control_type`.
// Variant names are lowercased on the wire (`"ephemeral"`, `"unknown"`).
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum CacheControlType {
    // Value produced by `Default`.
    #[default]
    Ephemeral,
    // `#[serde(other)]`: any unrecognized tag string deserializes to this
    // variant instead of failing.
    #[serde(other)]
    Unknown,
}
/// Lower bound (five minutes) applied to every resolved cache TTL; also the
/// fallback when no usable TTL was supplied.
const MIN_TTL_SECONDS: u64 = 5 * 60;

/// Upper bound (one hour) applied to every resolved cache TTL.
const MAX_TTL_SECONDS: u64 = 60 * 60;
impl CacheControl {
    /// Resolve the `ttl` string into a number of seconds within
    /// `[MIN_TTL_SECONDS, MAX_TTL_SECONDS]`.
    ///
    /// * No TTL set: returns `MIN_TTL_SECONDS`.
    /// * `"5m"` / `"1h"`: 300 / 3600 seconds.
    /// * A decimal integer such as `"120"` or `"7200"`: that value, clamped
    ///   into range.
    /// * Anything else: logs a warning and returns `MIN_TTL_SECONDS`.
    pub fn ttl_seconds(&self) -> u64 {
        let ttl = match &self.ttl {
            Some(ttl) => ttl.as_str(),
            None => return MIN_TTL_SECONDS,
        };

        let requested: u64 = match ttl {
            "5m" => 300,
            "1h" => 3600,
            text => {
                if let Ok(secs) = text.parse::<u64>() {
                    secs
                } else {
                    tracing::warn!("Unrecognized TTL '{}', defaulting to 300s", text);
                    return MIN_TTL_SECONDS;
                }
            }
        };

        requested.clamp(MIN_TTL_SECONDS, MAX_TTL_SECONDS)
    }
}
/// System prompt after normalization: one text string plus whatever cache
/// hint the original block array carried.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemContent {
    /// Full prompt text. For block-array input this is every block's text
    /// joined together; for string input it is the string itself.
    pub text: String,

    /// Cache control taken from the last system block that specified one.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// Deserialize `system` from either a plain string or an array of text blocks.
/// The Anthropic API accepts both `"system": "text"` and
/// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
fn
deserialize_system_prompt
<
'de
,
D
>
(
deserializer
:
D
)
->
Result
<
Option
<
SystemContent
>
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
#[derive(Deserialize)]
#[serde(untagged)]
enum
SystemPrompt
{
Text
(
String
),
Blocks
(
Vec
<
SystemBlock
>
),
}
#[derive(Deserialize)]
struct
SystemBlock
{
text
:
String
,
#[serde(default)]
cache_control
:
Option
<
CacheControl
>
,
}
let
maybe
:
Option
<
SystemPrompt
>
=
Option
::
deserialize
(
deserializer
)
?
;
Ok
(
maybe
.map
(|
sp
|
match
sp
{
SystemPrompt
::
Text
(
s
)
=>
SystemContent
{
text
:
s
,
cache_control
:
None
,
},
SystemPrompt
::
Blocks
(
blocks
)
=>
{
let
cache_control
=
blocks
.iter
()
.rev
()
.find_map
(|
b
|
b
.cache_control
.clone
());
let
text
=
blocks
.into_iter
()
.map
(|
b
|
b
.text
)
.collect
::
<
Vec
<
_
>>
()
.join
(
"
\n
"
);
SystemContent
{
text
,
cache_control
,
}
}
}))
}
/// Body of a `POST /v1/messages` request.
///
/// Optional fields are omitted from serialized output when unset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicCreateMessageRequest {
    /// Model identifier, e.g. "claude-sonnet-4-20250514".
    pub model: String,

    /// Upper limit on the number of tokens to generate.
    pub max_tokens: u32,

    /// Conversation turns, in order.
    pub messages: Vec<AnthropicMessage>,

    /// System prompt. On the wire this is either a string or an array of
    /// `{"type":"text","text":"..."}` blocks; both forms are normalized by
    /// `deserialize_system_prompt`.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,

    /// Sampling temperature (0.0 - 1.0).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// Nucleus (top-p) sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,

    /// Top-K sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_k: Option<u32>,

    /// Caller-supplied stop sequences.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequences: Option<Vec<String>>,

    /// Whether the response should be streamed. Defaults to `false` when the
    /// key is absent.
    #[serde(default)]
    pub stream: bool,

    /// Free-form request metadata (e.g. user_id), kept as raw JSON.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,

    /// Tools the model is allowed to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<AnthropicTool>>,

    /// Policy for how the model picks a tool.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<AnthropicToolChoice>,

    /// Request-level cache control enabling automatic prompt prefix caching:
    /// when present, all content up to the last cacheable block is cached.
    /// Mirrors the automatic caching mode of the Anthropic Messages API.
    /// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,

    /// Extended thinking settings. When enabled, the model emits `thinking`
    /// content blocks with its internal reasoning ahead of the final answer;
    /// `budget_tokens` bounds that reasoning (must be >= 1024 and
    /// < max_tokens).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingConfig>,

    /// Service tier: `"auto"` or `"standard_only"`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub service_tier: Option<String>,

    /// Identifier of a container for stateful sandbox sessions.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub container: Option<String>,

    /// Output configuration, kept as raw JSON.
    /// `effort` may be `"low"`, `"medium"`, `"high"`, or `"max"`;
    /// `format` describes structured JSON output constraints.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_config: Option<serde_json::Value>,
}
/// Request-side switch for extended thinking.
///
/// With `type` set to `"enabled"` the model produces `thinking` content
/// blocks holding its internal reasoning, limited by `budget_tokens`
/// (minimum 1024, and less than `max_tokens`). With `"disabled"` no thinking
/// blocks are produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingConfig {
    /// `"enabled"` or `"disabled"`; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub thinking_type: String,

    /// Token ceiling for internal reasoning. Only meaningful when the type
    /// is "enabled"; omitted from output when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub budget_tokens: Option<u32>,
}
/// One conversation turn: who spoke and what they said.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessage {
    /// Sender of the message.
    pub role: AnthropicRole,

    /// Message payload. Flattened, so the `content` key of the chosen
    /// [`AnthropicMessageContent`] variant sits next to `role` in the JSON.
    #[serde(flatten)]
    pub content: AnthropicMessageContent,
}
/// Who sent a message. Serialized in lowercase (`"user"`, `"assistant"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicRole {
    User,
    Assistant,
}
/// Payload of a message: a bare string or a list of structured blocks.
///
/// Untagged: the variant is chosen by the JSON shape of the `content` key,
/// trying `Text` first and `Blocks` second.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicMessageContent {
    /// `content` is a plain string.
    Text { content: String },

    /// `content` is an array of content blocks.
    Blocks { content: Vec<AnthropicContentBlock> },
}
/// One content block inside a message.
///
/// Serialization is derived (internally tagged by `type`, snake_case tags).
/// Deserialization is hand-written so that block types this crate does not
/// model (for example `citations`, `server_tool_use`, `redacted_thinking`
/// variants added later) land in `Other(Value)` rather than failing the
/// whole request. This matters because Claude Code may send block types that
/// are not handled yet.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicContentBlock {
    /// `"text"` block. May carry `citations`: references to source documents
    /// backing the text, produced by the model when document/PDF content is
    /// supplied and citation mode is on.
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// `"image"` block.
    Image { source: AnthropicImageSource },

    /// `"tool_use"` block: a tool call requested by the assistant.
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// `"tool_result"` block: the user's answer to a tool call.
    ToolResult {
        tool_use_id: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        content: Option<ToolResultContent>,
        #[serde(skip_serializing_if = "Option::is_none")]
        is_error: Option<bool>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// `"thinking"` block from the assistant (extended thinking / reasoning).
    Thinking {
        thinking: String,
        signature: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// `"redacted_thinking"` block from the assistant. Holds encrypted
    /// reasoning data that is opaque to the client but has to be sent back
    /// verbatim in multi-turn conversations so the model keeps its chain of
    /// thought.
    RedactedThinking { data: String },

    /// `"server_tool_use"` block: a tool call the API runs server-side
    /// (e.g., web search). The outcome arrives in a matching
    /// `web_search_tool_result` or similar block.
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },

    /// `"web_search_tool_result"` block: structured content returned by a
    /// server-side tool execution (e.g., web search results).
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Fallback for block types not listed above. Keeps the complete JSON
    /// value so new Anthropic features neither break the endpoint nor lose
    /// data, and can be round-tripped or inspected.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Payload of a `tool_result` block. The Anthropic API allows either a plain
/// string or an array of content blocks; the variant is picked by JSON shape.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContent {
    /// The result given as a single string.
    Text(String),
    /// The result given as an array of blocks.
    Blocks(Vec<ToolResultContentBlock>),
}
impl ToolResultContent {
    /// Consume the content and return its text.
    ///
    /// A string payload is returned unchanged. For a block array, the text
    /// of every text block is concatenated with no separator; non-text
    /// blocks are dropped.
    pub fn into_text(self) -> String {
        let blocks = match self {
            ToolResultContent::Text(text) => return text,
            ToolResultContent::Blocks(blocks) => blocks,
        };

        let mut combined = String::new();
        for block in blocks {
            if let ToolResultContentBlock::Text { text } = block {
                combined.push_str(&text);
            }
        }
        combined
    }
}
/// One element of a `tool_result.content` array.
///
/// Untagged: `Text` is tried first and matches any object with a string
/// `text` field; everything else is kept as raw JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContentBlock {
    /// A block carrying text.
    Text { text: String },

    /// Any non-text block (images, etc.) in a tool result.
    Other(serde_json::Value),
}
/// Custom deserializer for `AnthropicContentBlock` that handles unknown types
/// gracefully. Since serde's `#[serde(other)]` is not supported on internally
/// tagged enums, we deserialize as `Value` first and dispatch manually.
impl
<
'de
>
Deserialize
<
'de
>
for
AnthropicContentBlock
{
fn
deserialize
<
D
>
(
deserializer
:
D
)
->
Result
<
Self
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
let
value
=
serde_json
::
Value
::
deserialize
(
deserializer
)
?
;
let
block_type
=
value
.get
(
"type"
)
.and_then
(|
t
|
t
.as_str
())
.unwrap_or
(
""
)
.to_string
();
match
block_type
.as_str
()
{
"text"
=>
{
let
text
=
value
.get
(
"text"
)
.and_then
(|
t
|
t
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"text"
))
?
.to_string
();
let
citations
:
Option
<
Vec
<
serde_json
::
Value
>>
=
value
.get
(
"citations"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Text
{
text
,
citations
,
cache_control
,
})
}
"image"
=>
{
let
source
:
AnthropicImageSource
=
serde_json
::
from_value
(
value
.get
(
"source"
)
.cloned
()
.unwrap_or_default
())
.map_err
(
serde
::
de
::
Error
::
custom
)
?
;
Ok
(
AnthropicContentBlock
::
Image
{
source
})
}
"tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolUse
{
id
,
name
,
input
,
cache_control
,
})
}
"tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
:
Option
<
ToolResultContent
>
=
value
.get
(
"content"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
is_error
=
value
.get
(
"is_error"
)
.and_then
(|
v
|
v
.as_bool
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolResult
{
tool_use_id
,
content
,
is_error
,
cache_control
,
})
}
"thinking"
=>
{
let
thinking
=
value
.get
(
"thinking"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"thinking"
))
?
.to_string
();
let
signature
=
value
.get
(
"signature"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"signature"
))
?
.to_string
();
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Thinking
{
thinking
,
signature
,
cache_control
,
})
}
"redacted_thinking"
=>
{
let
data
=
value
.get
(
"data"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"data"
))
?
.to_string
();
Ok
(
AnthropicContentBlock
::
RedactedThinking
{
data
})
}
"server_tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
Ok
(
AnthropicContentBlock
::
ServerToolUse
{
id
,
name
,
input
})
}
"web_search_tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
=
value
.get
(
"content"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
([]));
Ok
(
AnthropicContentBlock
::
WebSearchToolResult
{
tool_use_id
,
content
,
})
}
other
=>
{
tracing
::
debug!
(
"Unrecognized Anthropic content block type '{}', preserving as Other"
,
other
);
Ok
(
AnthropicContentBlock
::
Other
(
value
))
}
}
}
}
/// Where the bytes of an image content block come from.
///
/// All three fields are required when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicImageSource {
    /// Source kind; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub source_type: String,

    /// Media type of the image, as sent by the client.
    pub media_type: String,

    /// Image payload, as sent by the client.
    pub data: String,
}
/// Definition of a tool offered to the model.
///
/// Two families share this shape. Client (custom) tools need `name` and
/// `input_schema`. Server tools (web_search, bash, text_editor,
/// code_execution, etc.) are identified by their `type` value
/// (e.g. `"web_search_20260209"`) and may lack `input_schema`. Everything
/// except `name` is optional so both families deserialize and can be passed
/// through to the backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicTool {
    /// Name of the tool (required for client tools, also present on server
    /// tools).
    pub name: String,

    /// Discriminator carried under the JSON key `type`. Client tools use
    /// `"custom"` or leave it out; server tools use versioned identifiers
    /// such as `"web_search_20260209"`.
    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
    pub tool_type: Option<String>,

    /// Human-readable description of the tool.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,

    /// JSON Schema of the tool's input. Required for client tools; absent on
    /// server tools, which define their input shape server-side.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_schema: Option<serde_json::Value>,

    /// Cache control breakpoint attached to this tool definition.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// How the model should pick a tool.
///
/// Untagged, so variant order is significant: serde tries `Named` before
/// `Simple`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicToolChoice {
    /// A specific tool: `{type: "tool", name: "..."}`.
    /// Listed first because it is the stricter shape (it requires `name`).
    Named(AnthropicToolChoiceNamed),

    /// A mode without a tool name: "auto", "any", or "none".
    Simple(AnthropicToolChoiceSimple),
}
/// Tool choice expressed as a mode only, without naming a tool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceSimple {
    /// Selected mode; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,

    /// If true, the model calls tools one at a time rather than possibly
    /// issuing several tool calls in one response.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
/// Value of the `type` key in a tool choice. Serialized in lowercase
/// (`"auto"`, `"any"`, `"none"`, `"tool"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicToolChoiceMode {
    Auto,
    Any,
    None,
    Tool,
}
/// Tool choice that names the tool to use.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceNamed {
    /// Selected mode; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,

    /// Name of the tool.
    pub name: String,

    /// If true, the model calls tools one at a time rather than possibly
    /// issuing several tool calls in one response.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
/// Body of a non-streaming `POST /v1/messages` response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageResponse {
    /// Identifier of the message.
    pub id: String,

    /// Object type; carried under the JSON key `type`.
    #[serde(rename = "type")]
    pub object_type: String,

    /// Role of the message author, as a string.
    pub role: String,

    /// Generated content blocks.
    pub content: Vec<AnthropicResponseContentBlock>,

    /// Model that produced the response.
    pub model: String,

    /// Why generation stopped, if known.
    pub stop_reason: Option<AnthropicStopReason>,

    /// Stop sequence value reported with the response, if any.
    pub stop_sequence: Option<String>,

    /// Token accounting for the request.
    pub usage: AnthropicUsage,
}
/// One content block of a response.
///
/// The Anthropic API can return up to 12 block types. The common ones are
/// modeled explicitly (internally tagged by `type`, snake_case tags); the
/// rest fall into `Other` so the proxy forwards them without losing data.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicResponseContentBlock {
    /// `"thinking"` block.
    Thinking { thinking: String, signature: String },

    /// `"text"` block, optionally with citations.
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
    },

    /// `"tool_use"` block.
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
    },

    /// `"redacted_thinking"` block.
    RedactedThinking { data: String },

    /// `"server_tool_use"` block; `input` defaults when absent.
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },

    /// `"web_search_tool_result"` block; `content` defaults when absent.
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Fallback for new or uncommon block types (web_fetch_tool_result,
    /// code_execution_tool_result, container_upload, etc.), kept as raw JSON
    /// so the proxy can serialize them back without data loss.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Token accounting attached to responses and stream events.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AnthropicUsage {
    /// Input token count.
    pub input_tokens: u32,

    /// Output token count.
    pub output_tokens: u32,

    /// Input tokens spent creating a new cache entry. Omitted when unset.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_creation_input_tokens: Option<u32>,

    /// Input tokens served from the prompt cache (prefix cache hits).
    /// Omitted when unset.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_read_input_tokens: Option<u32>,
}
/// Why the model stopped generating. Serialized in snake_case
/// (e.g. `"end_turn"`, `"max_tokens"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AnthropicStopReason {
    EndTurn,
    MaxTokens,
    StopSequence,
    ToolUse,
    /// The model paused to hand back control in an agentic loop and intends
    /// to continue in a later turn. Used with extended thinking / tool use.
    PauseTurn,
    /// The model declined to generate content (safety refusal).
    Refusal,
}
/// SSE event types for the Anthropic streaming API.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
#[serde(tag
=
"type"
)]
pub
enum
AnthropicStreamEvent
{
#[serde(rename
=
"message_start"
)]
MessageStart
{
message
:
AnthropicMessageResponse
},
#[serde(rename
=
"content_block_start"
)]
ContentBlockStart
{
index
:
u32
,
content_block
:
AnthropicResponseContentBlock
,
},
#[serde(rename
=
"content_block_delta"
)]
ContentBlockDelta
{
index
:
u32
,
delta
:
AnthropicDelta
},
#[serde(rename
=
"content_block_stop"
)]
ContentBlockStop
{
index
:
u32
},
#[serde(rename
=
"message_delta"
)]
MessageDelta
{
delta
:
AnthropicMessageDeltaBody
,
usage
:
AnthropicUsage
,
},
#[serde(rename
=
"message_stop"
)]
MessageStop
{},
#[serde(rename
=
"ping"
)]
Ping
{},
#[serde(rename
=
"error"
)]
Error
{
error
:
AnthropicErrorBody
},
}
/// Payload of a streaming `content_block_delta` event.
///
/// Internally tagged by `type`; each tag is the snake_case form of the
/// variant name (`"text_delta"`, `"input_json_delta"`, ...).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicDelta {
    /// `"thinking_delta"`.
    ThinkingDelta { thinking: String },

    /// `"text_delta"`.
    TextDelta { text: String },

    /// `"input_json_delta"`.
    InputJsonDelta { partial_json: String },

    /// `"signature_delta"`: incremental signature for a thinking block
    /// (sent at the end).
    SignatureDelta { signature: String },

    /// `"citations_delta"`: incremental citation attached to a text block.
    CitationsDelta { citation: serde_json::Value },
}
/// `delta` object carried by a `message_delta` stream event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageDeltaBody {
    /// Why generation stopped, if known. Always serialized, as `null` when
    /// unset.
    pub stop_reason: Option<AnthropicStopReason>,

    /// Stop sequence value, if any. Omitted from output when unset.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequence: Option<String>,
}
/// Anthropic API error response wrapper.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
pub
struct
AnthropicErrorResponse
{
#[serde(rename
=
"type"
)]
pub
object_type
:
String
,
pub
error
:
AnthropicErrorBody
,
}
/// The `error` object inside an error response or an `error` stream event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorBody {
    /// Error category, e.g. `invalid_request_error`; carried under the JSON
    /// key `type`.
    #[serde(rename = "type")]
    pub error_type: String,

    /// Human-readable error message.
    pub message: String,
}
impl
AnthropicErrorResponse
{
/// Create an `invalid_request_error` response.
pub
fn
invalid_request
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"invalid_request_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create an `api_error` (internal server error) response.
pub
fn
api_error
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"api_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create a `not_found_error` response.
pub
fn
not_found
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"not_found_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
}
/// Body of a `POST /v1/messages/count_tokens` request.
#[derive(Debug, Clone, Deserialize)]
pub struct AnthropicCountTokensRequest {
    /// Model identifier.
    pub model: String,

    /// Conversation turns to count.
    pub messages: Vec<AnthropicMessage>,

    /// System prompt, accepted as a string or a block array and normalized
    /// by `deserialize_system_prompt`.
    // NOTE(review): this type derives only `Deserialize`, so
    // `skip_serializing_if` is never exercised here; it is kept as-is.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,

    /// Tool definitions to include in the count.
    #[serde(default)]
    pub tools: Option<Vec<AnthropicTool>>,
}
/// Body of a `POST /v1/messages/count_tokens` response.
#[derive(Debug, Clone, Serialize)]
pub struct AnthropicCountTokensResponse {
    /// Input token count reported to the caller.
    pub input_tokens: u32,
}
impl
AnthropicCountTokensRequest
{
/// Estimate input token count using a `len/3` heuristic.
pub
fn
estimate_tokens
(
&
self
)
->
u32
{
let
mut
total_len
:
usize
=
0
;
if
let
Some
(
system
)
=
&
self
.system
{
total_len
+=
system
.text
.len
();
}
for
msg
in
&
self
.messages
{
// Count role
total_len
+=
match
msg
.role
{
AnthropicRole
::
User
=>
4
,
AnthropicRole
::
Assistant
=>
9
,
};
// Count content
match
&
msg
.content
{
AnthropicMessageContent
::
Text
{
content
}
=>
total_len
+=
content
.len
(),
AnthropicMessageContent
::
Blocks
{
content
}
=>
{
for
block
in
content
{
total_len
+=
estimate_block_len
(
block
);
}
}
}
}
if
let
Some
(
tools
)
=
&
self
.tools
{
for
tool
in
tools
{
total_len
+=
tool
.name
.len
();
if
let
Some
(
desc
)
=
&
tool
.description
{
total_len
+=
desc
.len
();
}
if
let
Some
(
schema
)
=
&
tool
.input_schema
{
total_len
+=
schema
.to_string
()
.len
();
}
}
}
let
tokens
=
total_len
/
3
;
if
tokens
==
0
&&
total_len
>
0
{
1
}
else
{
tokens
as
u32
}
}
}
/// Approximate size, in bytes, that one content block contributes to the
/// token estimate.
///
/// Text-like blocks count their string length; blocks holding JSON count the
/// length of the serialized JSON; images count as a flat 256.
fn estimate_block_len(block: &AnthropicContentBlock) -> usize {
    match block {
        AnthropicContentBlock::Text { text, .. } => text.len(),
        AnthropicContentBlock::Thinking { thinking, .. } => thinking.len(),
        AnthropicContentBlock::RedactedThinking { data, .. } => data.len(),

        // Client and server tool calls are sized the same way.
        AnthropicContentBlock::ToolUse { name, input, .. }
        | AnthropicContentBlock::ServerToolUse { name, input, .. } => {
            name.len() + input.to_string().len()
        }

        AnthropicContentBlock::ToolResult { content: None, .. } => 0,
        AnthropicContentBlock::ToolResult {
            content: Some(ToolResultContent::Text(text)),
            ..
        } => text.len(),
        AnthropicContentBlock::ToolResult {
            content: Some(ToolResultContent::Blocks(parts)),
            ..
        } => parts
            .iter()
            .map(|part| match part {
                ToolResultContentBlock::Text { text } => text.len(),
                ToolResultContentBlock::Other(raw) => raw.to_string().len(),
            })
            .sum(),

        AnthropicContentBlock::WebSearchToolResult { content, .. } => content.to_string().len(),

        // rough estimate for image metadata
        AnthropicContentBlock::Image { .. } => 256,

        AnthropicContentBlock::Other(raw) => raw.to_string().len(),
    }
}
lib/async-openai/src/types/chat.rs
View file @
2887cd1c
...
...
@@ -1182,10 +1182,6 @@ pub struct CreateChatCompletionResponse {
/// The object type, which is always `chat.completion`.
pub
object
:
String
,
pub
usage
:
Option
<
CompletionUsage
>
,
/// NVIDIA extension field for response metadata (worker IDs, etc.)
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
/// Parsed server side events stream until an \[DONE\] is received from server.
...
...
@@ -1281,10 +1277,6 @@ pub struct CreateChatCompletionStreamResponse {
/// An optional field that will only be present when you set `stream_options: {"include_usage": true}` in your request.
/// When present, it contains a null value except for the last chunk which contains the token usage statistics for the entire request.
pub
usage
:
Option
<
CompletionUsage
>
,
/// NVIDIA extension field for response metadata
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
#[cfg(test)]
...
...
lib/async-openai/src/types/completion.rs
View file @
2887cd1c
...
...
@@ -224,10 +224,6 @@ pub struct CreateCompletionResponse {
/// The object type, which is always "text_completion"
pub
object
:
String
,
pub
usage
:
Option
<
CompletionUsage
>
,
/// NVIDIA extension field for response metadata (worker IDs, etc.)
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
/// Parsed server side events stream until an \[DONE\] is received from server.
...
...
lib/async-openai/src/types/mod.rs
View file @
2887cd1c
...
...
@@ -10,6 +10,7 @@
//! Types used in OpenAI API requests and responses.
//! These types are created from component schemas in the [OpenAPI spec](https://github.com/openai/openai-openapi)
pub
mod
anthropic
;
mod
assistant
;
mod
assistant_impls
;
mod
assistant_stream
;
...
...
lib/llm/src/audit/stream.rs
View file @
2887cd1c
...
...
@@ -90,14 +90,16 @@ where
tracing
::
warn!
(
"audit: aggregation future canceled/failed"
);
// Return minimal response if aggregation failed
NvCreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
},
nvext
:
None
,
}
})
...
...
@@ -125,14 +127,16 @@ where
Err
(
e
)
=>
{
tracing
::
warn!
(
"fold aggregation failed: {e}"
);
let
fallback
=
NvCreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
},
nvext
:
None
,
};
let
_
=
tx
.send
(
fallback
.clone
());
...
...
@@ -145,14 +149,16 @@ where
rx
.await
.unwrap_or_else
(|
_
|
{
tracing
::
warn!
(
"fold aggregation future canceled"
);
NvCreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
String
::
new
(),
created
:
0
,
usage
:
None
,
model
:
String
::
new
(),
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
None
,
choices
:
vec!
[],
service_tier
:
None
,
},
nvext
:
None
,
}
})
...
...
@@ -171,8 +177,8 @@ pub fn final_response_to_one_chunk_stream(
)
->
std
::
pin
::
Pin
<
Box
<
dyn
futures
::
Stream
<
Item
=
Annotated
<
NvCreateChatCompletionStreamResponse
>>
+
Send
>
,
>
{
let
mut
choices
:
Vec
<
ChatChoiceStream
>
=
Vec
::
with_capacity
(
resp
.choices
.len
());
for
(
idx
,
ch
)
in
resp
.choices
.iter
()
.enumerate
()
{
let
mut
choices
:
Vec
<
ChatChoiceStream
>
=
Vec
::
with_capacity
(
resp
.
inner.
choices
.len
());
for
(
idx
,
ch
)
in
resp
.
inner.
choices
.iter
()
.enumerate
()
{
// Convert FunctionCall to FunctionCallStream if present
#[allow(deprecated)]
let
function_call
=
ch
.message.function_call
.as_ref
()
.map
(|
fc
|
{
...
...
@@ -222,14 +228,16 @@ pub fn final_response_to_one_chunk_stream(
}
let
chunk
=
NvCreateChatCompletionStreamResponse
{
id
:
resp
.id
.clone
(),
object
:
"chat.completion.chunk"
.to_string
(),
created
:
resp
.created
,
model
:
resp
.model
.clone
(),
system_fingerprint
:
resp
.system_fingerprint
.clone
(),
service_tier
:
resp
.service_tier
.clone
(),
choices
,
usage
:
resp
.usage
.clone
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
resp
.inner.id
.clone
(),
object
:
"chat.completion.chunk"
.to_string
(),
created
:
resp
.inner.created
,
model
:
resp
.inner.model
.clone
(),
system_fingerprint
:
resp
.inner.system_fingerprint
.clone
(),
service_tier
:
resp
.inner.service_tier
.clone
(),
choices
,
usage
:
resp
.inner.usage
.clone
(),
},
nvext
:
resp
.nvext
.clone
(),
};
...
...
@@ -275,14 +283,16 @@ mod tests {
};
let
response
=
NvCreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
};
...
...
@@ -314,14 +324,16 @@ mod tests {
};
let
response
=
NvCreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[
choice
],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
Some
(
"test-fingerprint"
.to_string
()),
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
};
...
...
@@ -339,7 +351,7 @@ mod tests {
chunk
.data
.as_ref
()
.and_then
(|
d
|
d
.choices
.first
())
.and_then
(|
d
|
d
.
inner.
choices
.first
())
.and_then
(|
c
|
c
.delta.content
.as_ref
())
.and_then
(|
content
|
match
content
{
ChatCompletionMessageContent
::
Text
(
text
)
=>
Some
(
text
.clone
()),
...
...
@@ -396,7 +408,7 @@ mod tests {
assert_eq!
(
results
.len
(),
0
,
"Empty stream should produce no chunks"
);
// Verify fallback response (aggregation will fail on empty stream)
assert_eq!
(
final_resp
.object
,
"chat.completion"
);
assert_eq!
(
final_resp
.
inner.
object
,
"chat.completion"
);
// Should get fallback response, not panic
}
...
...
@@ -415,7 +427,7 @@ mod tests {
assert_eq!
(
extract_content
(
&
results
[
0
]),
"Single chunk"
);
// Verify aggregation
assert_eq!
(
final_resp
.object
,
"chat.completion"
);
assert_eq!
(
final_resp
.
inner.
object
,
"chat.completion"
);
}
#[tokio::test]
...
...
@@ -423,32 +435,34 @@ mod tests {
// Test that metadata (id, event, comment) is preserved through passthrough
let
chunk_with_metadata
=
Annotated
{
data
:
Some
(
NvCreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[{
#[allow(deprecated)]
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
role
:
Some
(
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Content"
.to_string
(),
)),
tool_calls
:
None
,
function_call
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}
}],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[{
#[allow(deprecated)]
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
role
:
Some
(
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Content"
.to_string
(),
)),
tool_calls
:
None
,
function_call
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}
}],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
}),
id
:
Some
(
"correlation-123"
.to_string
()),
...
...
@@ -481,7 +495,7 @@ mod tests {
let
(
resp1
,
resp2
)
=
tokio
::
join!
(
future1
,
future2
);
// Both should complete successfully
assert_eq!
(
resp1
.object
,
"chat.completion"
);
assert_eq!
(
resp2
.object
,
"chat.completion"
);
assert_eq!
(
resp1
.
inner.
object
,
"chat.completion"
);
assert_eq!
(
resp2
.
inner.
object
,
"chat.completion"
);
}
}
lib/llm/src/entrypoint/input/batch.rs
View file @
2887cd1c
...
...
@@ -238,8 +238,9 @@ async fn evaluate(
match
(
item
.data
.as_ref
(),
item
.event
.as_deref
())
{
(
Some
(
data
),
_
)
=>
{
// Normal case
let
choice
=
data
.choices
.first
();
let
chat_comp
=
choice
.as_ref
()
.unwrap
();
let
Some
(
chat_comp
)
=
data
.inner.choices
.first
()
else
{
continue
;
};
if
let
Some
(
c
)
=
&
chat_comp
.delta.content
{
match
c
{
ChatCompletionMessageContent
::
Text
(
text
)
=>
{
...
...
lib/llm/src/entrypoint/input/text.rs
View file @
2887cd1c
...
...
@@ -138,8 +138,9 @@ async fn main_loop(
match
(
item
.data
.as_ref
(),
item
.event
.as_deref
())
{
(
Some
(
data
),
_
)
=>
{
// Normal case
let
entry
=
data
.choices
.first
();
let
chat_comp
=
entry
.as_ref
()
.unwrap
();
let
Some
(
chat_comp
)
=
data
.inner.choices
.first
()
else
{
continue
;
};
if
let
Some
(
c
)
=
&
chat_comp
.delta.content
{
match
c
{
ChatCompletionMessageContent
::
Text
(
text
)
=>
{
...
...
lib/llm/src/http/service/openai.rs
View file @
2887cd1c
...
...
@@ -991,7 +991,7 @@ fn streaming_tool_dispatch_events(
};
let
mut
events
=
vec!
[];
for
choice
in
&
data
.choices
{
for
choice
in
&
data
.
inner.
choices
{
let
Some
(
tool_calls
)
=
&
choice
.delta.tool_calls
else
{
continue
;
};
...
...
@@ -1034,7 +1034,7 @@ fn accumulate_reasoning_dispatch(
};
let
mut
events
=
vec!
[];
for
choice
in
&
data
.choices
{
for
choice
in
&
data
.
inner.
choices
{
let
buffer
=
buffers
.entry
(
choice
.index
)
.or_default
();
let
has_reasoning
=
choice
.delta
...
...
@@ -2892,15 +2892,17 @@ mod tests {
// Create a normal data event
let
normal_event
=
Annotated
::
<
NvCreateChatCompletionStreamResponse
>
{
data
:
Some
(
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[],
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
service_tier
:
None
,
usage
:
None
,
data
:
Some
(
NvCreateChatCompletionStreamResponse
{
inner
:
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
:
vec!
[],
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
service_tier
:
None
,
usage
:
None
,
},
nvext
:
None
,
}),
id
:
Some
(
"msg-1"
.to_string
()),
...
...
@@ -3162,15 +3164,17 @@ mod tests {
fn
make_stream_response
(
choices
:
Vec
<
ChatChoiceStream
>
,
)
->
Annotated
<
NvCreateChatCompletionStreamResponse
>
{
let
response
=
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
,
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
let
response
=
NvCreateChatCompletionStreamResponse
{
inner
:
CreateChatCompletionStreamResponse
{
id
:
"test-id"
.to_string
(),
choices
,
created
:
0
,
model
:
"test-model"
.to_string
(),
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
service_tier
:
None
,
},
nvext
:
None
,
};
Annotated
{
...
...
lib/llm/src/perf/logprobs.rs
View file @
2887cd1c
...
...
@@ -128,7 +128,7 @@ impl LogprobExtractor for NvCreateChatCompletionStreamResponse {
fn
extract_logprobs_by_choice
(
&
self
)
->
HashMap
<
u32
,
Vec
<
TokenLogProbs
>>
{
let
mut
result
=
HashMap
::
new
();
for
choice
in
&
self
.choices
{
for
choice
in
&
self
.
inner.
choices
{
let
choice_index
=
choice
.index
;
let
choice_logprobs
=
choice
...
...
@@ -949,34 +949,36 @@ mod tests {
)
->
NvCreateChatCompletionStreamResponse
{
#[expect(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
),
),
)
,
function
_call
:
None
,
tool_calls
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish
_reason
:
Some
(
FinishReason
::
Stop
)
,
stop_reason
:
None
,
logprobs
:
Some
(
ChatChoiceL
ogprobs
{
content
:
Some
(
token_logprobs
)
,
refusal
:
None
,
}
)
,
}]
,
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
function_call
:
None
,
tool
_call
s
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish_reason
:
Some
(
FinishReason
::
Stop
)
,
stop
_reason
:
None
,
logprobs
:
Some
(
ChatChoiceLogprobs
{
content
:
Some
(
token_l
ogprobs
),
refusal
:
None
,
})
,
}
]
,
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
}
,
nvext
:
None
,
}
}
...
...
@@ -1012,14 +1014,16 @@ mod tests {
.collect
();
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
,
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
,
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
@@ -1341,31 +1345,33 @@ mod tests {
// Test with choice that has no logprobs
#[expect(deprecated)]
let
response
=
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"test"
.to_string
(),
),
),
)
,
function
_call
:
None
,
tool_calls
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish
_reason
:
Some
(
FinishReason
::
Stop
)
,
stop_reason
:
None
,
logprobs
:
None
,
// No logprobs
}]
,
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
function_call
:
None
,
tool
_call
s
:
None
,
role
:
Some
(
Role
::
Assistant
)
,
refusal
:
None
,
reasoning_content
:
None
,
}
,
finish_reason
:
Some
(
FinishReason
::
Stop
)
,
stop
_reason
:
None
,
logprobs
:
None
,
// No logprobs
}],
created
:
1234567890
,
model
:
"test-model"
.to_string
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
()
,
usage
:
None
,
}
,
nvext
:
None
,
};
...
...
@@ -1573,14 +1579,16 @@ mod tests {
// In practice, this would have real logprobs data
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
choices
:
vec!
[],
created
:
1234567890
,
model
:
"test-model"
.to_string
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.to_string
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
lib/llm/src/preprocessor.rs
View file @
2887cd1c
...
...
@@ -1217,7 +1217,7 @@ impl OpenAIPreprocessor {
let
processed_response
=
if
let
Some
(
ref
mut
parser
)
=
state
.reasoning_parser
{
response
.map_data
(|
mut
data
|
{
// Process all choices, not just the first one
for
choice
in
data
.choices
.iter_mut
()
{
for
choice
in
data
.
inner.
choices
.iter_mut
()
{
// Reasoning parsing only applies to text content
if
let
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
...
...
lib/llm/src/preprocessor/speculative_prefill.rs
View file @
2887cd1c
...
...
@@ -111,7 +111,7 @@ pub fn maybe_wrap_stream(
let
mut
prefill_tx
=
Some
(
tx
);
Box
::
pin
(
stream
.map
(
move
|
item
|
{
if
let
Some
(
ref
resp
)
=
item
.data
{
for
choice
in
&
resp
.choices
{
for
choice
in
&
resp
.
inner.
choices
{
if
let
Some
(
ChatCompletionMessageContent
::
Text
(
ref
text
))
=
choice
.delta.content
{
accumulated_text
.push_str
(
text
);
}
...
...
lib/llm/src/protocols/anthropic/stream_converter.rs
View file @
2887cd1c
...
...
@@ -106,7 +106,7 @@ impl AnthropicStreamConverter {
let
mut
events
=
Vec
::
new
();
// Capture real token usage from engine when available (typically on the final chunk).
if
let
Some
(
usage
)
=
&
chunk
.usage
{
if
let
Some
(
usage
)
=
&
chunk
.
inner.
usage
{
self
.input_token_count
=
usage
.prompt_tokens
;
self
.output_token_count
=
usage
.completion_tokens
;
self
.cached_token_count
=
usage
...
...
@@ -115,7 +115,7 @@ impl AnthropicStreamConverter {
.and_then
(|
d
|
d
.cached_tokens
);
}
for
choice
in
&
chunk
.choices
{
for
choice
in
&
chunk
.
inner.
choices
{
let
delta
=
&
choice
.delta
;
// Track finish reason
...
...
@@ -444,7 +444,7 @@ impl AnthropicStreamConverter {
)
->
Vec
<
TaggedEvent
>
{
let
mut
events
=
Vec
::
new
();
if
let
Some
(
usage
)
=
&
chunk
.usage
{
if
let
Some
(
usage
)
=
&
chunk
.
inner.
usage
{
self
.input_token_count
=
usage
.prompt_tokens
;
self
.output_token_count
=
usage
.completion_tokens
;
self
.cached_token_count
=
usage
...
...
@@ -453,7 +453,7 @@ impl AnthropicStreamConverter {
.and_then
(|
d
|
d
.cached_tokens
);
}
for
choice
in
&
chunk
.choices
{
for
choice
in
&
chunk
.
inner.
choices
{
let
delta
=
&
choice
.delta
;
if
let
Some
(
ref
fr
)
=
choice
.finish_reason
{
...
...
@@ -722,27 +722,29 @@ mod tests {
fn
text_chunk
(
text
:
&
str
)
->
NvCreateChatCompletionStreamResponse
{
#[allow(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
text
.into
())),
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
text
.into
())),
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
@@ -755,35 +757,37 @@ mod tests {
)
->
NvCreateChatCompletionStreamResponse
{
#[allow(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
Some
(
vec!
[
ChatCompletionMessageToolCallChunk
{
index
:
tc_index
,
id
:
id
.map
(
String
::
from
),
r
#
type
:
Some
(
ChatCompletionToolType
::
Function
),
function
:
Some
(
FunctionCallStream
{
name
:
name
.map
(
String
::
from
),
arguments
:
args
.map
(
String
::
from
),
}),
}]),
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
Some
(
vec!
[
ChatCompletionMessageToolCallChunk
{
index
:
tc_index
,
id
:
id
.map
(
String
::
from
),
r
#
type
:
Some
(
ChatCompletionToolType
::
Function
),
function
:
Some
(
FunctionCallStream
{
name
:
name
.map
(
String
::
from
),
arguments
:
args
.map
(
String
::
from
),
}),
}]),
role
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
@@ -908,27 +912,29 @@ mod tests {
fn
reasoning_chunk
(
text
:
&
str
)
->
NvCreateChatCompletionStreamResponse
{
#[allow(deprecated)]
NvCreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
Some
(
text
.into
()),
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"chat-1"
.into
(),
choices
:
vec!
[
ChatChoiceStream
{
index
:
0
,
delta
:
ChatCompletionStreamResponseDelta
{
content
:
None
,
function_call
:
None
,
tool_calls
:
None
,
role
:
None
,
refusal
:
None
,
reasoning_content
:
Some
(
text
.into
()),
},
finish_reason
:
None
,
stop_reason
:
None
,
logprobs
:
None
,
}],
created
:
0
,
model
:
"test"
.into
(),
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion.chunk"
.into
(),
usage
:
None
,
},
nvext
:
None
,
}
}
...
...
lib/llm/src/protocols/anthropic/types.rs
View file @
2887cd1c
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Anthropic Messages API
types and
conversion logic.
//! Anthropic Messages API conversion logic.
//!
//! All request/response types for the `/v1/messages` endpoint, plus
//! bidirectional conversion to/from the internal chat completions format.
//! Pure protocol types live in `dynamo_async_openai::types::anthropic`.
//! This module provides bidirectional conversion to/from the internal
//! chat completions format used by the Dynamo engine.
// Re-export all pure Anthropic protocol types so existing `use crate::protocols::anthropic::*`
// continues to work throughout dynamo-llm.
pub
use
dynamo_async_openai
::
types
::
anthropic
::
*
;
use
dynamo_async_openai
::
types
::{
ChatCompletionMessageToolCall
,
ChatCompletionNamedToolChoice
,
...
...
@@ -17,764 +22,13 @@ use dynamo_async_openai::types::{
ChatCompletionTool
,
ChatCompletionToolChoiceOption
,
ChatCompletionToolType
,
FunctionName
,
FunctionObject
,
ImageUrl
,
ReasoningContent
,
};
use
serde
::{
Deserialize
,
Serialize
};
use
uuid
::
Uuid
;
use
crate
::
protocols
::
openai
::
chat_completions
::{
NvCreateChatCompletionRequest
,
NvCreateChatCompletionResponse
,
};
use
crate
::
protocols
::
openai
::
common_ext
::
CommonExt
;
use
crate
::
protocols
::
openai
::
nvext
::{
CacheControl
,
NvExt
};
// ---------------------------------------------------------------------------
// Custom deserializers
// ---------------------------------------------------------------------------
/// System prompt after parsing, retaining any `cache_control` that was
/// attached to the block-array form of the prompt.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemContent {
    /// The plain string, or the text of all system blocks concatenated.
    pub text: String,

    /// Cache control taken from the last system block that carried one.
    /// Omitted from serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// Deserialize `system` from either a plain string or an array of text blocks.
/// The Anthropic API accepts both `"system": "text"` and
/// `"system": [{"type": "text", "text": "...", "cache_control": {...}}]`.
fn
deserialize_system_prompt
<
'de
,
D
>
(
deserializer
:
D
)
->
Result
<
Option
<
SystemContent
>
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
#[derive(Deserialize)]
#[serde(untagged)]
enum
SystemPrompt
{
Text
(
String
),
Blocks
(
Vec
<
SystemBlock
>
),
}
#[derive(Deserialize)]
struct
SystemBlock
{
text
:
String
,
#[serde(default)]
cache_control
:
Option
<
CacheControl
>
,
}
let
maybe
:
Option
<
SystemPrompt
>
=
Option
::
deserialize
(
deserializer
)
?
;
Ok
(
maybe
.map
(|
sp
|
match
sp
{
SystemPrompt
::
Text
(
s
)
=>
SystemContent
{
text
:
s
,
cache_control
:
None
,
},
SystemPrompt
::
Blocks
(
blocks
)
=>
{
let
cache_control
=
blocks
.iter
()
.rev
()
.find_map
(|
b
|
b
.cache_control
.clone
());
let
text
=
blocks
.into_iter
()
.map
(|
b
|
b
.text
)
.collect
::
<
Vec
<
_
>>
()
.join
(
"
\n
"
);
SystemContent
{
text
,
cache_control
,
}
}
}))
}
// ---------------------------------------------------------------------------
// Request types
// ---------------------------------------------------------------------------
/// Request body accepted by `POST /v1/messages`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicCreateMessageRequest {
    /// Name of the model to run (e.g. "claude-sonnet-4-20250514").
    pub model: String,

    /// Upper bound on the number of tokens to generate.
    pub max_tokens: u32,

    /// Messages making up the conversation.
    pub messages: Vec<AnthropicMessage>,

    /// Optional system prompt; accepted as a string or as an array of
    /// `{"type":"text","text":"..."}` blocks.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,

    /// Sampling temperature (0.0 - 1.0).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,

    /// Nucleus (top-p) sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f32>,

    /// Top-K sampling parameter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub top_k: Option<u32>,

    /// Caller-supplied stop sequences.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequences: Option<Vec<String>>,

    /// Whether the response should be streamed; defaults to `false` when omitted.
    #[serde(default)]
    pub stream: bool,

    /// Optional request metadata (e.g. user_id).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,

    /// Tools the model is allowed to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<AnthropicTool>>,

    /// Policy for how the model picks a tool to call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<AnthropicToolChoice>,

    /// Top-level cache control enabling automatic prompt prefix caching.
    /// When set, the system caches all content up to the last cacheable block,
    /// matching the automatic caching mode of the Anthropic Messages API.
    /// See: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#automatic-caching
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,

    /// Extended thinking configuration. When enabled, the model emits
    /// `thinking` content blocks with its internal reasoning ahead of the
    /// final response. `budget_tokens` limits how many tokens may be spent on
    /// thinking (must be ≥ 1024 and < max_tokens).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub thinking: Option<ThinkingConfig>,

    /// Service tier selection: `"auto"` or `"standard_only"`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub service_tier: Option<String>,

    /// Identifier of the container used for stateful sandbox sessions.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub container: Option<String>,

    /// Output configuration: an effort level and an optional JSON schema format.
    /// `effort` can be `"low"`, `"medium"`, `"high"`, or `"max"`.
    /// `format` specifies structured JSON output constraints.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_config: Option<serde_json::Value>,
}
/// Request-level configuration for extended thinking.
///
/// With `type` set to `"enabled"` the model produces `thinking` content
/// blocks containing its internal reasoning, and `budget_tokens` caps the
/// tokens available for that reasoning (minimum 1024, and less than
/// `max_tokens`). With `type` set to `"disabled"` no thinking blocks are
/// produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThinkingConfig {
    /// `"enabled"` or `"disabled"`; carried on the wire as `type`.
    #[serde(rename = "type")]
    pub thinking_type: String,

    /// Token budget for internal reasoning; only meaningful when the type is
    /// "enabled". Omitted from serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub budget_tokens: Option<u32>,
}
/// One message of the conversation: who sent it and what it contains.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessage {
    /// Sender of the message.
    pub role: AnthropicRole,

    /// Message payload. Flattened, so the variant's `content` key sits next
    /// to `role` in the serialized object.
    #[serde(flatten)]
    pub content: AnthropicMessageContent,
}
/// Who sent a message. Variant names are lowercased on the wire.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicRole {
    User,
    Assistant,
}
/// Payload of a message: a plain string or a list of content blocks.
///
/// The enum is untagged, so the shape of the `content` value selects the
/// variant (a string is tried before an array).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicMessageContent {
    /// `content` is a plain text string.
    Text { content: String },

    /// `content` is an array of structured content blocks.
    Blocks { content: Vec<AnthropicContentBlock> },
}
/// One content block inside a message.
///
/// Only `Serialize` is derived here; deserialization goes through a custom
/// implementation so that block types this enum does not model are captured
/// as [`AnthropicContentBlock::Other`] rather than failing outright. That
/// matters because Claude Code may send block types we do not handle yet.
///
/// Blocks are tagged by a `type` field whose value is the snake_case form of
/// the variant name (`text`, `tool_use`, `web_search_tool_result`, ...).
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicContentBlock {
    /// Text block. `citations`, when present, are references to source
    /// documents supporting the text; the model generates them when
    /// document/PDF content is provided and citation mode is enabled.
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Image block.
    Image { source: AnthropicImageSource },

    /// A tool call requested by the assistant.
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// The result of a tool call, sent by the user.
    ToolResult {
        tool_use_id: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        content: Option<ToolResultContent>,
        #[serde(skip_serializing_if = "Option::is_none")]
        is_error: Option<bool>,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Assistant thinking block (extended thinking / reasoning).
    Thinking {
        thinking: String,
        signature: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        cache_control: Option<CacheControl>,
    },

    /// Redacted assistant thinking. The encrypted reasoning data is opaque to
    /// the client but has to be sent back verbatim in multi-turn
    /// conversations so the model can keep its chain of thought.
    RedactedThinking { data: String },

    /// A tool call that the API runs server-side (e.g., web search). Its
    /// result reaches the client through a matching
    /// `web_search_tool_result` or similar block.
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },

    /// Structured result returned by a server-side tool (e.g., web search
    /// results).
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },

    /// Fallback for block types not modelled above. The complete JSON value
    /// is kept so new Anthropic features do not break the endpoint and can
    /// be round-tripped or inspected.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Payload of a `tool_result` block.
///
/// The Anthropic API accepts either a bare string or an array of content
/// blocks here, so the enum is untagged; the variants are tried in
/// declaration order (string first, then array).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContent {
    /// The result given as a single string.
    Text(String),
    /// The result given as a list of content blocks.
    Blocks(Vec<ToolResultContentBlock>),
}
impl ToolResultContent {
    /// Consume the content and return its text.
    ///
    /// * `Text` yields the string unchanged.
    /// * `Blocks` yields the `text` of every text block concatenated in order
    ///   with no separator; non-text (`Other`) blocks are ignored.
    pub fn into_text(self) -> String {
        match self {
            ToolResultContent::Text(text) => text,
            // Collect straight into a `String`; no intermediate `Vec` is needed
            // to concatenate without a separator.
            ToolResultContent::Blocks(blocks) => blocks
                .into_iter()
                .filter_map(|block| match block {
                    ToolResultContentBlock::Text { text } => Some(text),
                    ToolResultContentBlock::Other(_) => None,
                })
                .collect::<String>(),
        }
    }
}
/// One element of a `tool_result.content` array.
///
/// Untagged: an element that has a string `text` field is read as `Text`;
/// anything else is kept verbatim as `Other`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolResultContentBlock {
    /// A block that carries text.
    Text { text: String },
    /// Any non-text block (images, etc.), preserved as raw JSON.
    Other(serde_json::Value),
}
/// Custom deserializer for `AnthropicContentBlock` that handles unknown types
/// gracefully. Since serde's `#[serde(other)]` is not supported on internally
/// tagged enums, we deserialize as `Value` first and dispatch manually.
impl
<
'de
>
Deserialize
<
'de
>
for
AnthropicContentBlock
{
fn
deserialize
<
D
>
(
deserializer
:
D
)
->
Result
<
Self
,
D
::
Error
>
where
D
:
serde
::
Deserializer
<
'de
>
,
{
let
value
=
serde_json
::
Value
::
deserialize
(
deserializer
)
?
;
let
block_type
=
value
.get
(
"type"
)
.and_then
(|
t
|
t
.as_str
())
.unwrap_or
(
""
)
.to_string
();
match
block_type
.as_str
()
{
"text"
=>
{
let
text
=
value
.get
(
"text"
)
.and_then
(|
t
|
t
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"text"
))
?
.to_string
();
let
citations
:
Option
<
Vec
<
serde_json
::
Value
>>
=
value
.get
(
"citations"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Text
{
text
,
citations
,
cache_control
,
})
}
"image"
=>
{
let
source
:
AnthropicImageSource
=
serde_json
::
from_value
(
value
.get
(
"source"
)
.cloned
()
.unwrap_or_default
())
.map_err
(
serde
::
de
::
Error
::
custom
)
?
;
Ok
(
AnthropicContentBlock
::
Image
{
source
})
}
"tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolUse
{
id
,
name
,
input
,
cache_control
,
})
}
"tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
:
Option
<
ToolResultContent
>
=
value
.get
(
"content"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
let
is_error
=
value
.get
(
"is_error"
)
.and_then
(|
v
|
v
.as_bool
());
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
ToolResult
{
tool_use_id
,
content
,
is_error
,
cache_control
,
})
}
"thinking"
=>
{
let
thinking
=
value
.get
(
"thinking"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"thinking"
))
?
.to_string
();
let
signature
=
value
.get
(
"signature"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"signature"
))
?
.to_string
();
let
cache_control
:
Option
<
CacheControl
>
=
value
.get
(
"cache_control"
)
.cloned
()
.and_then
(|
v
|
serde_json
::
from_value
(
v
)
.ok
());
Ok
(
AnthropicContentBlock
::
Thinking
{
thinking
,
signature
,
cache_control
,
})
}
"redacted_thinking"
=>
{
let
data
=
value
.get
(
"data"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"data"
))
?
.to_string
();
Ok
(
AnthropicContentBlock
::
RedactedThinking
{
data
})
}
"server_tool_use"
=>
{
let
id
=
value
.get
(
"id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"id"
))
?
.to_string
();
let
name
=
value
.get
(
"name"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"name"
))
?
.to_string
();
let
input
=
value
.get
(
"input"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
({}));
Ok
(
AnthropicContentBlock
::
ServerToolUse
{
id
,
name
,
input
})
}
"web_search_tool_result"
=>
{
let
tool_use_id
=
value
.get
(
"tool_use_id"
)
.and_then
(|
v
|
v
.as_str
())
.ok_or_else
(||
serde
::
de
::
Error
::
missing_field
(
"tool_use_id"
))
?
.to_string
();
let
content
=
value
.get
(
"content"
)
.cloned
()
.unwrap_or
(
serde_json
::
json!
([]));
Ok
(
AnthropicContentBlock
::
WebSearchToolResult
{
tool_use_id
,
content
,
})
}
other
=>
{
tracing
::
debug!
(
"Unrecognized Anthropic content block type '{}', preserving as Other"
,
other
);
Ok
(
AnthropicContentBlock
::
Other
(
value
))
}
}
}
}
/// Describes where the bytes of an `image` content block come from.
///
/// All three fields are required strings; `source_type` is exchanged on the
/// wire under the key `type`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicImageSource {
    /// Source kind, serialized as `type`.
    #[serde(rename = "type")]
    pub source_type: String,
    /// Media type of the image.
    pub media_type: String,
    /// Image payload.
    pub data: String,
}
/// Definition of a tool offered to the model.
///
/// Two families share this shape:
///
/// * client ("custom") tools, which need `name` and `input_schema`;
/// * server tools (web_search, bash, text_editor, code_execution, ...), which
///   are identified by `type` (e.g. `"web_search_20260209"`) and may omit
///   `input_schema`.
///
/// Everything except `name` is optional so that both families deserialize and
/// can be passed through to the backend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicTool {
    /// Name of the tool; required for client tools and also present on server
    /// tools.
    pub name: String,
    /// Discriminator serialized as `type`. Client tools use `"custom"` or
    /// leave it out; server tools use versioned identifiers such as
    /// `"web_search_20260209"`.
    #[serde(default, rename = "type", skip_serializing_if = "Option::is_none")]
    pub tool_type: Option<String>,
    /// Optional human-readable description.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// JSON Schema of the tool input. Needed for client tools; server tools
    /// define their input shape server-side and omit it.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub input_schema: Option<serde_json::Value>,
    /// Optional cache-control breakpoint attached to this definition.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
}
/// How the model is allowed (or forced) to pick a tool.
///
/// Untagged, so serde tries the variants in declaration order. `Named` has
/// the stricter shape (it additionally requires `name`) and therefore has to
/// stay ahead of `Simple`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum AnthropicToolChoice {
    /// A specific tool: `{type: "tool", name: "..."}`.
    Named(AnthropicToolChoiceNamed),
    /// A mode without a tool name: "auto", "any", or "none".
    Simple(AnthropicToolChoiceSimple),
}
/// Tool choice expressed only as a mode, without naming a tool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceSimple {
    /// Selected mode, serialized as `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,
    /// If `true`, the model issues tool calls one at a time rather than
    /// possibly several in a single response. Omitted when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
/// The `type` value of a tool choice. Variants are written in lowercase on
/// the wire: `"auto"`, `"any"`, `"none"`, `"tool"`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum AnthropicToolChoiceMode {
    Auto,
    Any,
    None,
    Tool,
}
/// Tool choice that names one specific tool.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicToolChoiceNamed {
    /// Selected mode, serialized as `type`.
    #[serde(rename = "type")]
    pub choice_type: AnthropicToolChoiceMode,
    /// Name of the tool to use.
    pub name: String,
    /// If `true`, the model issues tool calls one at a time rather than
    /// possibly several in a single response. Omitted when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub disable_parallel_tool_use: Option<bool>,
}
// ---------------------------------------------------------------------------
// Response types
// ---------------------------------------------------------------------------
/// Body returned by a non-streaming `POST /v1/messages` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageResponse {
    /// Message identifier.
    pub id: String,
    /// Object kind, serialized as `type`.
    #[serde(rename = "type")]
    pub object_type: String,
    /// Role of the message author.
    pub role: String,
    /// Content blocks that make up the message.
    pub content: Vec<AnthropicResponseContentBlock>,
    /// Name of the model that produced the message.
    pub model: String,
    /// Why generation stopped, if known; serialized as `null` when `None`.
    pub stop_reason: Option<AnthropicStopReason>,
    /// Stop sequence value, if any; serialized as `null` when `None`.
    pub stop_sequence: Option<String>,
    /// Token accounting for the request.
    pub usage: AnthropicUsage,
}
/// One content block of a response message.
///
/// The Anthropic API can return up to 12 block types. The frequent ones are
/// modelled explicitly, tagged by `type` in snake_case; everything else falls
/// into `Other` so the proxy forwards it without losing data.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicResponseContentBlock {
    /// `"thinking"`: reasoning text and its signature.
    Thinking { thinking: String, signature: String },
    /// `"text"`: generated text, optionally with citations (omitted when
    /// `None`).
    Text {
        text: String,
        #[serde(default, skip_serializing_if = "Option::is_none")]
        citations: Option<Vec<serde_json::Value>>,
    },
    /// `"tool_use"`: a tool call with its JSON input.
    ToolUse {
        id: String,
        name: String,
        input: serde_json::Value,
    },
    /// `"redacted_thinking"`: opaque reasoning data.
    RedactedThinking { data: String },
    /// `"server_tool_use"`: a server-side tool call; `input` falls back to
    /// the default JSON value when missing.
    ServerToolUse {
        id: String,
        name: String,
        #[serde(default)]
        input: serde_json::Value,
    },
    /// `"web_search_tool_result"`: result of a server-side tool; `content`
    /// falls back to the default JSON value when missing.
    WebSearchToolResult {
        tool_use_id: String,
        #[serde(default)]
        content: serde_json::Value,
    },
    /// Any new or uncommon block type (web_fetch_tool_result,
    /// code_execution_tool_result, container_upload, ...), kept as raw JSON so
    /// it can be serialized back unchanged.
    #[serde(untagged)]
    Other(serde_json::Value),
}
/// Token accounting attached to responses and to streaming `message_delta`
/// events.
///
/// Every field is optional on input: a `message_delta` event from the
/// upstream API may carry only `output_tokens`, so the two counters default to
/// `0` when absent instead of failing deserialization. Serialization always
/// writes both counters; the cache counters are written only when set.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AnthropicUsage {
    /// Number of input (prompt) tokens; `0` when not reported.
    #[serde(default)]
    pub input_tokens: u32,
    /// Number of output (generated) tokens; `0` when not reported.
    #[serde(default)]
    pub output_tokens: u32,
    /// Number of input tokens used to create a new cache entry.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_creation_input_tokens: Option<u32>,
    /// Number of input tokens read from the prompt cache (prefix cache hits).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_read_input_tokens: Option<u32>,
}
/// Why the model stopped producing output. Serialized in snake_case
/// (`"end_turn"`, `"max_tokens"`, ...).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum AnthropicStopReason {
    EndTurn,
    MaxTokens,
    StopSequence,
    ToolUse,
    /// The model paused to hand control back inside an agentic loop and means
    /// to continue in a later turn (extended thinking / tool use).
    PauseTurn,
    /// The model declined to generate content (safety refusal).
    Refusal,
}
// ---------------------------------------------------------------------------
// Streaming types
// ---------------------------------------------------------------------------
/// SSE event types for the Anthropic streaming API.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
#[serde(tag
=
"type"
)]
pub
enum
AnthropicStreamEvent
{
#[serde(rename
=
"message_start"
)]
MessageStart
{
message
:
AnthropicMessageResponse
},
#[serde(rename
=
"content_block_start"
)]
ContentBlockStart
{
index
:
u32
,
content_block
:
AnthropicResponseContentBlock
,
},
#[serde(rename
=
"content_block_delta"
)]
ContentBlockDelta
{
index
:
u32
,
delta
:
AnthropicDelta
},
#[serde(rename
=
"content_block_stop"
)]
ContentBlockStop
{
index
:
u32
},
#[serde(rename
=
"message_delta"
)]
MessageDelta
{
delta
:
AnthropicMessageDeltaBody
,
usage
:
AnthropicUsage
,
},
#[serde(rename
=
"message_stop"
)]
MessageStop
{},
#[serde(rename
=
"ping"
)]
Ping
{},
#[serde(rename
=
"error"
)]
Error
{
error
:
AnthropicErrorBody
},
}
/// Payload of a streaming `content_block_delta` event.
///
/// Internally tagged by `type`; the tag is the variant name in snake_case.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AnthropicDelta {
    /// `"thinking_delta"`: a fragment of thinking text.
    ThinkingDelta { thinking: String },
    /// `"text_delta"`: a fragment of output text.
    TextDelta { text: String },
    /// `"input_json_delta"`: a fragment of partial JSON.
    InputJsonDelta { partial_json: String },
    /// `"signature_delta"`: incremental signature of a thinking block (sent at
    /// the end).
    SignatureDelta { signature: String },
    /// `"citations_delta"`: an incremental citation attached to a text block.
    CitationsDelta { citation: serde_json::Value },
}
/// `delta` object carried by a `message_delta` event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicMessageDeltaBody {
    /// Why generation stopped; serialized as `null` when `None`.
    pub stop_reason: Option<AnthropicStopReason>,
    /// Stop sequence value, if any; left out of the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub stop_sequence: Option<String>,
}
// ---------------------------------------------------------------------------
// Error types
// ---------------------------------------------------------------------------
/// Anthropic API error response wrapper.
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
pub
struct
AnthropicErrorResponse
{
#[serde(rename
=
"type"
)]
pub
object_type
:
String
,
pub
error
:
AnthropicErrorBody
,
}
/// Details of an error: its category and a message.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnthropicErrorBody {
    /// Error category, serialized as `type`.
    #[serde(rename = "type")]
    pub error_type: String,
    /// Error message text.
    pub message: String,
}
impl
AnthropicErrorResponse
{
/// Create an `invalid_request_error` response.
pub
fn
invalid_request
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"invalid_request_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create an `api_error` (internal server error) response.
pub
fn
api_error
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"api_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
/// Create a `not_found_error` response.
pub
fn
not_found
(
message
:
impl
Into
<
String
>
)
->
Self
{
Self
{
object_type
:
"error"
.to_string
(),
error
:
AnthropicErrorBody
{
error_type
:
"not_found_error"
.to_string
(),
message
:
message
.into
(),
},
}
}
}
// ---------------------------------------------------------------------------
// Conversion: AnthropicCreateMessageRequest -> NvCreateChatCompletionRequest
// ---------------------------------------------------------------------------
use
crate
::
protocols
::
openai
::
nvext
::
NvExt
;
impl
TryFrom
<
AnthropicCreateMessageRequest
>
for
NvCreateChatCompletionRequest
{
type
Error
=
anyhow
::
Error
;
...
...
@@ -1199,11 +453,6 @@ fn convert_anthropic_tool_choice(tc: &AnthropicToolChoice) -> ChatCompletionTool
}
}
}
// ---------------------------------------------------------------------------
// Conversion: NvCreateChatCompletionResponse -> AnthropicMessageResponse
// ---------------------------------------------------------------------------
/// Convert a completed chat completion response into an Anthropic Messages response.
pub
fn
chat_completion_to_anthropic_response
(
chat_resp
:
NvCreateChatCompletionResponse
,
...
...
@@ -1211,7 +460,7 @@ pub fn chat_completion_to_anthropic_response(
)
->
AnthropicMessageResponse
{
let
msg_id
=
format!
(
"msg_{}"
,
Uuid
::
new_v4
()
.simple
());
let
choice
=
chat_resp
.choices
.into_iter
()
.next
();
let
choice
=
chat_resp
.
inner.
choices
.into_iter
()
.next
();
let
mut
content
=
Vec
::
new
();
let
mut
stop_reason
=
None
;
...
...
@@ -1282,6 +531,7 @@ pub fn chat_completion_to_anthropic_response(
// Map usage
let
usage
=
chat_resp
.inner
.usage
.map
(|
u
|
{
let
cache_read_input_tokens
=
u
...
...
@@ -1308,111 +558,6 @@ pub fn chat_completion_to_anthropic_response(
usage
,
}
}
// ---------------------------------------------------------------------------
// Count tokens
// ---------------------------------------------------------------------------
/// Body accepted by `POST /v1/messages/count_tokens`.
#[derive(Debug, Clone, Deserialize)]
pub struct AnthropicCountTokensRequest {
    /// Model name.
    pub model: String,
    /// Conversation messages to be counted.
    pub messages: Vec<AnthropicMessage>,
    /// Optional system prompt, decoded by `deserialize_system_prompt`;
    /// `None` when the field is absent.
    #[serde(
        default,
        skip_serializing_if = "Option::is_none",
        deserialize_with = "deserialize_system_prompt"
    )]
    pub system: Option<SystemContent>,
    /// Optional tool definitions; `None` when the field is absent.
    #[serde(default)]
    pub tools: Option<Vec<AnthropicTool>>,
}
/// Body returned by `POST /v1/messages/count_tokens`.
#[derive(Debug, Clone, Serialize)]
pub struct AnthropicCountTokensResponse {
    /// Input token count reported to the caller.
    pub input_tokens: u32,
}
impl
AnthropicCountTokensRequest
{
/// Estimate input token count using a `len/3` heuristic.
pub
fn
estimate_tokens
(
&
self
)
->
u32
{
let
mut
total_len
:
usize
=
0
;
if
let
Some
(
system
)
=
&
self
.system
{
total_len
+=
system
.text
.len
();
}
for
msg
in
&
self
.messages
{
// Count role
total_len
+=
match
msg
.role
{
AnthropicRole
::
User
=>
4
,
AnthropicRole
::
Assistant
=>
9
,
};
// Count content
match
&
msg
.content
{
AnthropicMessageContent
::
Text
{
content
}
=>
total_len
+=
content
.len
(),
AnthropicMessageContent
::
Blocks
{
content
}
=>
{
for
block
in
content
{
total_len
+=
estimate_block_len
(
block
);
}
}
}
}
if
let
Some
(
tools
)
=
&
self
.tools
{
for
tool
in
tools
{
total_len
+=
tool
.name
.len
();
if
let
Some
(
desc
)
=
&
tool
.description
{
total_len
+=
desc
.len
();
}
if
let
Some
(
schema
)
=
&
tool
.input_schema
{
total_len
+=
schema
.to_string
()
.len
();
}
}
}
let
tokens
=
total_len
/
3
;
if
tokens
==
0
&&
total_len
>
0
{
1
}
else
{
tokens
as
u32
}
}
}
fn
estimate_block_len
(
block
:
&
AnthropicContentBlock
)
->
usize
{
match
block
{
AnthropicContentBlock
::
Text
{
text
,
..
}
=>
text
.len
(),
AnthropicContentBlock
::
ToolUse
{
name
,
input
,
..
}
=>
name
.len
()
+
input
.to_string
()
.len
(),
AnthropicContentBlock
::
ToolResult
{
content
,
..
}
=>
content
.as_ref
()
.map
(|
c
|
match
c
{
ToolResultContent
::
Text
(
s
)
=>
s
.len
(),
ToolResultContent
::
Blocks
(
blocks
)
=>
blocks
.iter
()
.map
(|
b
|
match
b
{
ToolResultContentBlock
::
Text
{
text
}
=>
text
.len
(),
ToolResultContentBlock
::
Other
(
v
)
=>
v
.to_string
()
.len
(),
})
.sum
(),
})
.unwrap_or
(
0
),
AnthropicContentBlock
::
Thinking
{
thinking
,
..
}
=>
thinking
.len
(),
AnthropicContentBlock
::
RedactedThinking
{
data
,
..
}
=>
data
.len
(),
AnthropicContentBlock
::
ServerToolUse
{
name
,
input
,
..
}
=>
{
name
.len
()
+
input
.to_string
()
.len
()
}
AnthropicContentBlock
::
WebSearchToolResult
{
content
,
..
}
=>
content
.to_string
()
.len
(),
AnthropicContentBlock
::
Image
{
..
}
=>
256
,
// rough estimate for image metadata
AnthropicContentBlock
::
Other
(
v
)
=>
v
.to_string
()
.len
(),
}
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod
tests
{
use
super
::
*
;
...
...
@@ -1656,38 +801,40 @@ mod tests {
#[test]
fn
test_chat_completion_to_anthropic_response
()
{
let
chat_resp
=
NvCreateChatCompletionResponse
{
id
:
"chatcmpl-xyz"
.into
(),
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoice
{
index
:
0
,
message
:
dynamo_async_openai
::
types
::
ChatCompletionResponseMessage
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"Hello!"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
"chatcmpl-xyz"
.into
(),
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoice
{
index
:
0
,
message
:
dynamo_async_openai
::
types
::
ChatCompletionResponseMessage
{
content
:
Some
(
dynamo_async_openai
::
types
::
ChatCompletionMessageContent
::
Text
(
"Hello!"
.to_string
(),
),
),
)
,
refusal
:
None
,
tool_calls
:
None
,
role
:
dynamo_async_openai
::
types
::
Role
::
Assistant
,
function_call
:
None
,
audio
:
None
,
reasoning_content
:
None
,
}
,
finish
_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
)
,
stop_reason
:
None
,
logprobs
:
None
,
}]
,
created
:
1726000000
,
model
:
"test-model"
.into
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion"
.to_string
(),
usage
:
Some
(
dynamo_async_openai
::
types
::
CompletionUsage
{
prompt
_tokens
:
10
,
completion
_tokens
:
5
,
total_tokens
:
15
,
prompt
_tokens_details
:
None
,
completion_tokens_details
:
None
,
}
)
,
refusal
:
None
,
tool_calls
:
None
,
role
:
dynamo_async_openai
::
types
::
Role
::
Assistant
,
function_call
:
None
,
audio
:
None
,
reasoning_content
:
None
,
}
,
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
)
,
stop
_reason
:
None
,
logprobs
:
None
,
}]
,
created
:
1726000000
,
model
:
"test-model"
.into
()
,
service_tier
:
None
,
system_fingerprint
:
None
,
object
:
"chat.completion"
.to_string
()
,
usage
:
Some
(
dynamo_async_openai
::
types
::
CompletionUsage
{
prompt_tokens
:
10
,
completion
_tokens
:
5
,
total
_tokens
:
1
5
,
prompt_tokens_details
:
None
,
completion
_tokens_details
:
None
,
})
,
},
nvext
:
None
,
};
...
...
lib/llm/src/protocols/openai/chat_completions.rs
View file @
2887cd1c
...
...
@@ -64,21 +64,24 @@ pub struct NvCreateChatCompletionRequest {
}
/// A response structure for unary chat completion responses, embedding OpenAI's
/// `CreateChatCompletionResponse`.
///
/// # Fields
/// - `inner`: The base OpenAI unary chat completion response, embedded
/// using `serde(flatten)`.
pub
type
NvCreateChatCompletionResponse
=
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
;
/// Unary chat completion response: OpenAI's `CreateChatCompletionResponse`
/// plus optional NVIDIA extension metadata.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct NvCreateChatCompletionResponse {
    /// The base OpenAI response; its fields are flattened into this object on
    /// the wire.
    #[serde(flatten)]
    pub inner: dynamo_async_openai::types::CreateChatCompletionResponse,
    /// NVIDIA extension payload; left out of the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub nvext: Option<serde_json::Value>,
}
/// A response structure for streamed chat completions, embedding OpenAI's
/// `CreateChatCompletionStreamResponse`.
///
/// # Fields
/// - `inner`: The base OpenAI streaming chat completion response, embedded
/// using `serde(flatten)`.
pub
type
NvCreateChatCompletionStreamResponse
=
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
;
/// Streamed chat completion chunk: OpenAI's
/// `CreateChatCompletionStreamResponse` plus optional NVIDIA extension
/// metadata.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct NvCreateChatCompletionStreamResponse {
    /// The base OpenAI stream chunk; its fields are flattened into this
    /// object on the wire.
    #[serde(flatten)]
    pub inner: dynamo_async_openai::types::CreateChatCompletionStreamResponse,
    /// NVIDIA extension payload; left out of the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub nvext: Option<serde_json::Value>,
}
/// Implements `NvExtProvider` for `NvCreateChatCompletionRequest`,
/// providing access to NVIDIA-specific extensions.
...
...
lib/llm/src/protocols/openai/chat_completions/aggregator.rs
View file @
2887cd1c
...
...
@@ -136,16 +136,16 @@ impl DeltaAggregator {
if
aggregator
.error
.is_none
()
&&
let
Some
(
delta
)
=
delta
.data
{
aggregator
.id
=
delta
.id
;
aggregator
.model
=
delta
.model
;
aggregator
.created
=
delta
.created
;
aggregator
.service_tier
=
delta
.service_tier
;
aggregator
.id
=
delta
.
inner.
id
;
aggregator
.model
=
delta
.
inner.
model
;
aggregator
.created
=
delta
.
inner.
created
;
aggregator
.service_tier
=
delta
.
inner.
service_tier
;
// Aggregate usage statistics if available.
if
let
Some
(
usage
)
=
delta
.usage
{
if
let
Some
(
usage
)
=
delta
.
inner.
usage
{
aggregator
.usage
=
Some
(
usage
);
}
if
let
Some
(
system_fingerprint
)
=
delta
.system_fingerprint
{
if
let
Some
(
system_fingerprint
)
=
delta
.
inner.
system_fingerprint
{
aggregator
.system_fingerprint
=
Some
(
system_fingerprint
);
}
...
...
@@ -155,7 +155,7 @@ impl DeltaAggregator {
}
// Aggregate choices incrementally.
for
choice
in
delta
.choices
{
for
choice
in
delta
.
inner.
choices
{
let
state_choice
=
aggregator
.choices
...
...
@@ -267,14 +267,16 @@ impl DeltaAggregator {
// Construct the final response object.
let
response
=
NvCreateChatCompletionResponse
{
id
:
aggregator
.id
,
created
:
aggregator
.created
,
usage
:
aggregator
.usage
,
model
:
aggregator
.model
,
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
aggregator
.system_fingerprint
,
choices
,
service_tier
:
aggregator
.service_tier
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
id
:
aggregator
.id
,
created
:
aggregator
.created
,
usage
:
aggregator
.usage
,
model
:
aggregator
.model
,
object
:
"chat.completion"
.to_string
(),
system_fingerprint
:
aggregator
.system_fingerprint
,
choices
,
service_tier
:
aggregator
.service_tier
,
},
nvext
:
aggregator
.nvext
,
};
...
...
@@ -360,7 +362,7 @@ pub trait ChatCompletionAggregator {
)
->
Result
<
NvCreateChatCompletionResponse
,
String
>
;
}
impl
ChatCompletionAggregator
for
dynamo_async_openai
::
types
::
CreateChatCompletionResponse
{
impl
ChatCompletionAggregator
for
Nv
CreateChatCompletionResponse
{
async
fn
from_annotated_stream
(
stream
:
impl
Stream
<
Item
=
Annotated
<
NvCreateChatCompletionStreamResponse
>>
,
parsing_options
:
ParsingOptions
,
...
...
@@ -445,14 +447,16 @@ mod tests {
};
let
data
=
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"meta/llama-3.1-8b-instruct"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
choice
],
object
:
"chat.completion"
.to_string
(),
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"meta/llama-3.1-8b-instruct"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
choice
],
object
:
"chat.completion"
.to_string
(),
},
nvext
:
None
,
};
...
...
@@ -479,13 +483,13 @@ mod tests {
let
response
=
result
.unwrap
();
// Verify that the response is empty and has default values
assert_eq!
(
response
.id
,
""
);
assert_eq!
(
response
.model
,
""
);
assert_eq!
(
response
.created
,
0
);
assert
!
(
response
.usage
.is_none
());
assert
!
(
response
.system_fingerprint
.is_none
());
assert_eq!
(
response
.choices
.len
(),
0
);
assert
!
(
response
.service_tier
.is_none
());
assert_eq!
(
response
.
inner.
id
,
""
);
assert_eq!
(
response
.
inner.
model
,
""
);
assert_eq!
(
response
.
inner.
created
,
0
);
assert
!
(
response
.
inner.
usage
.is_none
());
assert
!
(
response
.
inner.
system_fingerprint
.is_none
());
assert_eq!
(
response
.
inner.
choices
.len
(),
0
);
assert
!
(
response
.
inner.
service_tier
.is_none
());
}
#[tokio::test]
...
...
@@ -511,13 +515,13 @@ mod tests {
let
response
=
result
.unwrap
();
// Verify the response fields
assert_eq!
(
response
.id
,
"test_id"
);
assert_eq!
(
response
.model
,
"meta/llama-3.1-8b-instruct"
);
assert_eq!
(
response
.created
,
1234567890
);
assert
!
(
response
.usage
.is_none
());
assert
!
(
response
.system_fingerprint
.is_none
());
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
id
,
"test_id"
);
assert_eq!
(
response
.
inner.
model
,
"meta/llama-3.1-8b-instruct"
);
assert_eq!
(
response
.
inner.
created
,
1234567890
);
assert
!
(
response
.
inner.
usage
.is_none
());
assert
!
(
response
.
inner.
system_fingerprint
.is_none
());
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice
.index
,
0
);
assert_eq!
(
choice
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -525,7 +529,7 @@ mod tests {
);
assert
!
(
choice
.finish_reason
.is_none
());
assert_eq!
(
choice
.message.role
,
dynamo_async_openai
::
types
::
Role
::
User
);
assert
!
(
response
.service_tier
.is_none
());
assert
!
(
response
.
inner.
service_tier
.is_none
());
}
#[tokio::test]
...
...
@@ -562,8 +566,8 @@ mod tests {
let
response
=
result
.unwrap
();
// Verify the response fields
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice
.index
,
0
);
assert_eq!
(
choice
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -630,8 +634,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice
.index
,
0
);
assert_eq!
(
...
...
@@ -653,43 +657,49 @@ mod tests {
// Create a delta with multiple choices
// ALLOW: function_call is deprecated
let
data
=
NvCreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"test_model"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
0
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 0"
.to_string
())),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
"test_id"
.to_string
(),
model
:
"test_model"
.to_string
(),
created
:
1234567890
,
service_tier
:
None
,
usage
:
None
,
system_fingerprint
:
None
,
choices
:
vec!
[
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
0
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 0"
.to_string
(),
)),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
1
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 1"
.to_string
())),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
dynamo_async_openai
::
types
::
ChatChoiceStream
{
index
:
1
,
delta
:
dynamo_async_openai
::
types
::
ChatCompletionStreamResponseDelta
{
role
:
Some
(
dynamo_async_openai
::
types
::
Role
::
Assistant
),
content
:
Some
(
ChatCompletionMessageContent
::
Text
(
"Choice 1"
.to_string
(),
)),
function_call
:
None
,
tool_calls
:
None
,
refusal
:
None
,
reasoning_content
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
finish_reason
:
Some
(
dynamo_async_openai
::
types
::
FinishReason
::
Stop
),
stop_reason
:
None
,
logprobs
:
None
,
},
],
object
:
"chat.completion"
.to_string
(),
],
object
:
"chat.completion"
.to_string
(),
},
nvext
:
None
,
};
...
...
@@ -711,9 +721,9 @@ mod tests {
let
mut
response
=
result
.unwrap
();
// Verify the response fields
assert_eq!
(
response
.choices
.len
(),
2
);
response
.choices
.sort_by
(|
a
,
b
|
a
.index
.cmp
(
&
b
.index
));
// Ensure the choices are ordered
let
choice0
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
2
);
response
.
inner.
choices
.sort_by
(|
a
,
b
|
a
.index
.cmp
(
&
b
.index
));
// Ensure the choices are ordered
let
choice0
=
&
response
.
inner.
choices
[
0
];
assert_eq!
(
choice0
.index
,
0
);
assert_eq!
(
choice0
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -728,7 +738,7 @@ mod tests {
dynamo_async_openai
::
types
::
Role
::
Assistant
);
let
choice1
=
&
response
.choices
[
1
];
let
choice1
=
&
response
.
inner.
choices
[
1
];
assert_eq!
(
choice1
.index
,
1
);
assert_eq!
(
choice1
.message.content
.as_ref
()
.unwrap
(),
...
...
@@ -773,8 +783,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls are present
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -816,8 +826,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls are present
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -859,8 +869,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls are present
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -900,8 +910,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify no tool calls are present
assert
!
(
choice
.message.tool_calls
.is_none
());
...
...
@@ -928,7 +938,7 @@ mod tests {
// Manually set empty tool calls array
if
let
Some
(
ref
mut
data
)
=
annotated_delta
.data
{
data
.choices
[
0
]
.delta.tool_calls
=
Some
(
vec!
[]);
// Empty tool calls array
data
.
inner.
choices
[
0
]
.delta.tool_calls
=
Some
(
vec!
[]);
// Empty tool calls array
}
let
data
=
annotated_delta
.data
.unwrap
();
...
...
@@ -945,8 +955,8 @@ mod tests {
assert
!
(
result
.is_ok
());
let
response
=
result
.unwrap
();
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// Verify tool calls array is empty
assert
!
(
choice
.message.tool_calls
.is_none
());
...
...
@@ -992,8 +1002,8 @@ mod tests {
let
response
=
result
.unwrap
();
// There should be one choice
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// The tool_calls field should be present and parsed
assert
!
(
choice
.message.tool_calls
.is_some
());
...
...
@@ -1050,8 +1060,8 @@ mod tests {
let
response
=
result
.unwrap
();
// There should be one choice
assert_eq!
(
response
.choices
.len
(),
1
);
let
choice
=
&
response
.choices
[
0
];
assert_eq!
(
response
.
inner.
choices
.len
(),
1
);
let
choice
=
&
response
.
inner.
choices
[
0
];
// The finish_reason should be ToolCalls, not Stop, because tool calls are present
assert_eq!
(
...
...
lib/llm/src/protocols/openai/chat_completions/delta.rs
View file @
2887cd1c
...
...
@@ -278,19 +278,21 @@ impl DeltaGenerator {
// According to OpenAI spec: when stream_options.include_usage is true,
// all intermediate chunks should have usage: null
// The final usage chunk will be sent separately with empty choices
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
,
usage
:
if
self
.options.enable_usage
&&
self
.options.continuous_usage_stats
{
Some
(
self
.get_usage
())
}
else
{
None
NvCreateChatCompletionStreamResponse
{
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
,
usage
:
if
self
.options.enable_usage
&&
self
.options.continuous_usage_stats
{
Some
(
self
.get_usage
())
}
else
{
None
},
service_tier
:
self
.service_tier
.clone
(),
},
service_tier
:
self
.service_tier
.clone
(),
nvext
:
None
,
// Will be populated by router layer if needed
}
}
...
...
@@ -303,15 +305,17 @@ impl DeltaGenerator {
pub
fn
create_usage_chunk
(
&
self
)
->
NvCreateChatCompletionStreamResponse
{
let
usage
=
self
.get_usage
();
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
:
vec!
[],
// Empty choices for usage-only chunk
usage
:
Some
(
usage
),
service_tier
:
self
.service_tier
.clone
(),
NvCreateChatCompletionStreamResponse
{
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
self
.id
.clone
(),
object
:
self
.object
.clone
(),
created
:
self
.created
,
model
:
self
.model
.clone
(),
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
:
vec!
[],
// Empty choices for usage-only chunk
usage
:
Some
(
usage
),
service_tier
:
self
.service_tier
.clone
(),
},
nvext
:
None
,
}
}
...
...
lib/llm/src/protocols/openai/chat_completions/jail.rs
View file @
2887cd1c
...
...
@@ -525,13 +525,13 @@ impl JailedStream {
// Process each item in the stream
while
let
Some
(
response
)
=
stream
.next
()
.await
{
if
let
Some
(
chat_response
)
=
response
.data
.as_ref
()
{
last_stream_id
.clone_from
(
&
chat_response
.id
);
last_stream_model
.clone_from
(
&
chat_response
.model
);
last_stream_created
=
chat_response
.created
;
last_stream_id
.clone_from
(
&
chat_response
.
inner.
id
);
last_stream_model
.clone_from
(
&
chat_response
.
inner.
model
);
last_stream_created
=
chat_response
.
inner.
created
;
let
mut
all_emissions
=
Vec
::
new
();
if
chat_response
.choices
.is_empty
()
{
if
chat_response
.
inner.
choices
.is_empty
()
{
// No choices processed (e.g., usage-only chunk)
// Pass through as-is to preserve usage and other metadata
yield
response
;
...
...
@@ -539,7 +539,7 @@ impl JailedStream {
}
// Process each choice independently using the new architecture
for
choice
in
&
chat_response
.choices
{
for
choice
in
&
chat_response
.
inner.
choices
{
if
let
Some
(
ref
content
)
=
choice
.delta.content
{
// Jailing only applies to text content
let
text_content
=
match
content
{
...
...
@@ -676,14 +676,16 @@ impl JailedStream {
tracing
::
debug!
(
"Stream ended while jailed, releasing accumulated content"
);
// Create a finalization response carrying forward real stream metadata
let
dummy_response
=
NvCreateChatCompletionStreamResponse
{
id
:
last_stream_id
,
object
:
"chat.completion.chunk"
.to_string
(),
created
:
last_stream_created
,
model
:
last_stream_model
,
choices
:
Vec
::
new
(),
usage
:
None
,
service_tier
:
None
,
system_fingerprint
:
None
,
inner
:
dynamo_async_openai
::
types
::
CreateChatCompletionStreamResponse
{
id
:
last_stream_id
,
object
:
"chat.completion.chunk"
.to_string
(),
created
:
last_stream_created
,
model
:
last_stream_model
,
choices
:
Vec
::
new
(),
usage
:
None
,
service_tier
:
None
,
system_fingerprint
:
None
,
},
nvext
:
None
,
};
...
...
@@ -713,7 +715,7 @@ impl JailedStream {
EmissionMode
::
Packed
=>
{
// Pack all choices into a single response
let
mut
response
=
base_response
.clone
();
response
.choices
=
emissions
.into_iter
()
.map
(|
e
|
e
.into_choice
())
.collect
();
response
.
inner.
choices
=
emissions
.into_iter
()
.map
(|
e
|
e
.into_choice
())
.collect
();
vec!
[
Annotated
{
data
:
Some
(
response
),
...
...
@@ -729,7 +731,7 @@ impl JailedStream {
.into_iter
()
.map
(|
emission
|
{
let
mut
response
=
base_response
.clone
();
response
.choices
=
vec!
[
emission
.into_choice
()];
response
.
inner.
choices
=
vec!
[
emission
.into_choice
()];
Annotated
{
data
:
Some
(
response
),
...
...
@@ -1013,7 +1015,7 @@ impl JailedStream {
while
let
Some
(
mut
response
)
=
input_stream
.next
()
.await
{
// Track if any choice emitted tool calls
if
let
Some
(
ref
data
)
=
response
.data
{
for
choice
in
&
data
.choices
{
for
choice
in
&
data
.
inner.
choices
{
if
choice
.delta.tool_calls
.is_some
()
{
has_tool_calls_per_choice
.insert
(
choice
.index
,
true
);
}
...
...
@@ -1022,7 +1024,7 @@ impl JailedStream {
// Fix finish_reason based on jail mode and whether tool calls were emitted
if
let
Some
(
ref
mut
data
)
=
response
.data
{
for
choice
in
&
mut
data
.choices
{
for
choice
in
&
mut
data
.
inner.
choices
{
if
let
Some
(
finish
)
=
choice
.finish_reason
{
// Only modify Stop finish reason, preserve Length/ContentFilter
if
finish
==
FinishReason
::
Stop
{
...
...
lib/llm/src/protocols/openai/completions.rs
View file @
2887cd1c
...
...
@@ -48,6 +48,8 @@ pub struct NvCreateCompletionRequest {
pub
struct
NvCreateCompletionResponse
{
#[serde(flatten)]
pub
inner
:
dynamo_async_openai
::
types
::
CreateCompletionResponse
,
#[serde(skip_serializing_if
=
"Option::is_none"
)]
pub
nvext
:
Option
<
serde_json
::
Value
>
,
}
impl
ContentProvider
for
dynamo_async_openai
::
types
::
Choice
{
...
...
@@ -296,9 +298,8 @@ impl ResponseFactory {
choices
:
vec!
[
choice
],
system_fingerprint
:
self
.system_fingerprint
.clone
(),
usage
,
nvext
:
None
,
// Will be populated by router layer if needed
};
NvCreateCompletionResponse
{
inner
}
NvCreateCompletionResponse
{
inner
,
nvext
:
None
}
}
}
...
...
lib/llm/src/protocols/openai/completions/aggregator.rs
View file @
2887cd1c
...
...
@@ -86,8 +86,8 @@ impl DeltaAggregator {
aggregator
.system_fingerprint
=
Some
(
system_fingerprint
);
}
// Aggregate nvext field (take the last non-None value)
if
delta
.
inner.
nvext
.is_some
()
{
aggregator
.nvext
=
delta
.
inner.
nvext
;
if
delta
.nvext
.is_some
()
{
aggregator
.nvext
=
delta
.nvext
;
}
// handle the choices
...
...
@@ -168,10 +168,12 @@ impl DeltaAggregator {
object
:
"text_completion"
.to_string
(),
system_fingerprint
:
aggregator
.system_fingerprint
,
choices
,
nvext
:
aggregator
.nvext
,
};
let
response
=
NvCreateCompletionResponse
{
inner
};
let
response
=
NvCreateCompletionResponse
{
inner
,
nvext
:
aggregator
.nvext
,
};
Ok
(
response
)
}
...
...
@@ -256,10 +258,9 @@ mod tests {
logprobs
,
}],
object
:
"text_completion"
.to_string
(),
nvext
:
None
,
};
let
response
=
NvCreateCompletionResponse
{
inner
};
let
response
=
NvCreateCompletionResponse
{
inner
,
nvext
:
None
};
Annotated
{
data
:
Some
(
response
),
...
...
@@ -387,10 +388,9 @@ mod tests {
},
],
object
:
"text_completion"
.to_string
(),
nvext
:
None
,
};
let
response
=
NvCreateCompletionResponse
{
inner
};
let
response
=
NvCreateCompletionResponse
{
inner
,
nvext
:
None
};
let
annotated_delta
=
Annotated
{
data
:
Some
(
response
),
...
...
lib/llm/src/protocols/openai/completions/delta.rs
View file @
2887cd1c
...
...
@@ -218,10 +218,9 @@ impl DeltaGenerator {
}
else
{
None
},
nvext
:
None
,
// Will be populated by router layer if needed
};
NvCreateCompletionResponse
{
inner
}
NvCreateCompletionResponse
{
inner
,
nvext
:
None
}
}
/// Creates a final usage-only chunk for OpenAI compliance.
...
...
@@ -240,10 +239,9 @@ impl DeltaGenerator {
system_fingerprint
:
self
.system_fingerprint
.clone
(),
choices
:
vec!
[],
// Empty choices for usage-only chunk
usage
:
Some
(
usage
),
nvext
:
None
,
// Will be populated by router layer if needed
};
NvCreateCompletionResponse
{
inner
}
NvCreateCompletionResponse
{
inner
,
nvext
:
None
}
}
/// Check if usage tracking is enabled
...
...
@@ -343,7 +341,7 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
};
if
let
Ok
(
nvext_json
)
=
serde_json
::
to_value
(
&
nvext_response
)
{
response
.
inner.
nvext
=
Some
(
nvext_json
);
response
.nvext
=
Some
(
nvext_json
);
if
let
Some
(
ref
info
)
=
worker_id_info
{
tracing
::
debug!
(
"Injected worker_id into completions nvext: prefill={:?}, decode={:?}"
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment