Unverified Commit 3bfee568 authored by MatejKosec's avatar MatejKosec Committed by GitHub
Browse files

feat: unified internal request representation for lossless API conversion (#7202)


Signed-off-by: default avatarMatej Kosec <mkosec@nvidia.com>
Signed-off-by: default avatarMarko Kosec <mkosec@nvidia.com>
parent 8fe2082c
......@@ -40,9 +40,10 @@ use crate::protocols::anthropic::types::{
chat_completion_to_anthropic_response,
};
use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
NvCreateChatCompletionStreamResponse, aggregator::ChatCompletionAggregator,
NvCreateChatCompletionResponse, NvCreateChatCompletionStreamResponse,
aggregator::ChatCompletionAggregator,
};
use crate::protocols::unified::UnifiedRequest;
use crate::request_template::RequestTemplate;
use crate::types::Annotated;
......@@ -213,20 +214,25 @@ async fn anthropic_messages(
.as_ref()
.is_some_and(|t| t.thinking_type == "disabled");
// Convert Anthropic request -> Chat Completion request
let mut chat_request: NvCreateChatCompletionRequest =
orig_request.try_into().map_err(|e: anyhow::Error| {
tracing::error!(
request_id,
error = %e,
"Failed to convert AnthropicCreateMessageRequest to NvCreateChatCompletionRequest",
);
anthropic_error(
StatusCode::BAD_REQUEST,
"invalid_request_error",
&format!("Failed to convert request: {}", e),
)
})?;
// Convert Anthropic request -> UnifiedRequest -> Chat Completion request
let unified_request: UnifiedRequest = orig_request.try_into().map_err(|e: anyhow::Error| {
tracing::error!(
request_id,
error = %e,
"Failed to convert AnthropicCreateMessageRequest to UnifiedRequest",
);
anthropic_error(
StatusCode::BAD_REQUEST,
"invalid_request_error",
&format!("Failed to convert request: {}", e),
)
})?;
// Extract the API context before consuming the UnifiedRequest — this
// carries Anthropic-specific fields (thinking config, cache breakpoints,
// etc.) that the stream converter needs for faithful response reconstruction.
let anthropic_ctx = unified_request.anthropic_context().cloned();
let mut chat_request = unified_request.into_inner();
// When a reasoning parser is configured and the client hasn't explicitly
// disabled thinking, assume the model's chat template will inject `<think>`.
......@@ -309,7 +315,10 @@ async fn anthropic_messages(
use std::sync::atomic::{AtomicBool, Ordering};
let mut converter = AnthropicStreamConverter::new(model_for_resp);
let mut converter = match anthropic_ctx {
Some(ctx) => AnthropicStreamConverter::with_context(model_for_resp, ctx),
None => AnthropicStreamConverter::new(model_for_resp),
};
let start_events = converter.emit_start_events();
let converter = std::sync::Arc::new(std::sync::Mutex::new(converter));
......@@ -406,7 +415,11 @@ async fn anthropic_messages(
)
})?;
let response = chat_completion_to_anthropic_response(chat_response, &model_for_resp);
let response = chat_completion_to_anthropic_response(
chat_response,
&model_for_resp,
anthropic_ctx.as_ref(),
);
inflight_guard.mark_ok();
......
......@@ -57,6 +57,7 @@ use crate::protocols::openai::{
responses::{NvCreateResponse, NvResponse, ResponseParams, chat_completion_to_response},
videos::{NvCreateVideoRequest, NvVideosResponse},
};
use crate::protocols::unified::UnifiedRequest;
use crate::request_template::RequestTemplate;
use crate::types::Annotated;
use dynamo_runtime::logging::get_distributed_tracing_context;
......@@ -1513,21 +1514,25 @@ async fn responses(
let request_id = request.id().to_string();
let (orig_request, context) = request.into_parts();
let mut chat_request: NvCreateChatCompletionRequest =
orig_request.try_into().map_err(|e: anyhow::Error| {
tracing::error!(
request_id,
error = %e,
"Failed to convert NvCreateResponse to NvCreateChatCompletionRequest",
);
let err_response = ErrorMessage::not_implemented_error(
VALIDATION_PREFIX.to_string()
+ "Failed to convert responses request: "
+ &e.to_string(),
);
inflight_guard.mark_error(extract_error_type_from_response(&err_response));
err_response
})?;
let unified_request: UnifiedRequest = orig_request.try_into().map_err(|e: anyhow::Error| {
tracing::error!(
request_id,
error = %e,
"Failed to convert NvCreateResponse to UnifiedRequest",
);
let err_response = ErrorMessage::not_implemented_error(
VALIDATION_PREFIX.to_string()
+ "Failed to convert responses request: "
+ &e.to_string(),
);
inflight_guard.mark_error(extract_error_type_from_response(&err_response));
err_response
})?;
// Extract the API context before consuming the UnifiedRequest — this
// carries Responses-specific fields (previous_response_id, store, etc.)
// that the stream converter needs for faithful response reconstruction.
let responses_ctx = unified_request.responses_context().cloned();
let mut chat_request = unified_request.into_inner();
// Always use internal streaming for aggregation.
// Set stream_options.include_usage so the backend sends token counts in the final chunk.
......@@ -1577,7 +1582,10 @@ async fn responses(
use crate::protocols::openai::responses::stream_converter::ResponseStreamConverter;
use std::sync::atomic::{AtomicBool, Ordering};
let mut converter = ResponseStreamConverter::new(model.clone(), response_params);
let mut converter = match responses_ctx {
Some(ctx) => ResponseStreamConverter::with_context(model.clone(), response_params, ctx),
None => ResponseStreamConverter::new(model.clone(), response_params),
};
let start_events = converter.emit_start_events();
// Use std::sync::Mutex (not tokio) since process_chunk/emit_end_events are
......@@ -1685,18 +1693,19 @@ async fn responses(
})?;
// Convert NvCreateChatCompletionResponse --> NvResponse
let response: NvResponse = chat_completion_to_response(response, &response_params)
.map_err(|e| {
tracing::error!(
request_id,
"Failed to convert NvCreateChatCompletionResponse to NvResponse: {:?}",
e
);
let err_response =
ErrorMessage::internal_server_error("Failed to convert internal response");
inflight_guard.mark_error(extract_error_type_from_response(&err_response));
err_response
})?;
let response: NvResponse =
chat_completion_to_response(response, &response_params, responses_ctx.as_ref())
.map_err(|e| {
tracing::error!(
request_id,
"Failed to convert NvCreateChatCompletionResponse to NvResponse: {:?}",
e
);
let err_response =
ErrorMessage::internal_server_error("Failed to convert internal response");
inflight_guard.mark_error(extract_error_type_from_response(&err_response));
err_response
})?;
inflight_guard.mark_ok();
// If the engine context was killed (client disconnect), the response was
......
......@@ -15,6 +15,7 @@ pub mod codec;
pub mod common;
pub mod openai;
pub mod tensor;
pub(crate) mod unified;
/// The token ID type
pub type TokenIdType = u32;
......
......@@ -18,11 +18,14 @@ use super::types::{
AnthropicResponseContentBlock, AnthropicStopReason, AnthropicStreamEvent, AnthropicUsage,
};
use crate::protocols::openai::chat_completions::NvCreateChatCompletionStreamResponse;
use crate::protocols::unified::AnthropicContext;
/// State machine that converts a chat completion stream into Anthropic SSE events.
pub struct AnthropicStreamConverter {
model: String,
message_id: String,
/// Preserved Anthropic-specific request context for faithful response reconstruction.
api_context: Option<AnthropicContext>,
// Thinking/reasoning tracking
thinking_block_started: bool,
thinking_block_closed: bool,
......@@ -60,6 +63,7 @@ impl AnthropicStreamConverter {
Self {
model,
message_id: format!("msg_{}", Uuid::new_v4().simple()),
api_context: None,
thinking_block_started: false,
thinking_block_closed: false,
thinking_block_index: 0,
......@@ -76,8 +80,19 @@ impl AnthropicStreamConverter {
}
}
/// Create a converter seeded with the original Anthropic request context.
/// This allows the response stream to carry forward metadata that was lost
/// during the Anthropic-to-OpenAI request conversion.
pub fn with_context(model: String, context: AnthropicContext) -> Self {
let mut converter = Self::new(model);
converter.api_context = Some(context);
converter
}
/// Emit the initial `message_start` event.
pub fn emit_start_events(&mut self) -> Vec<Result<Event, anyhow::Error>> {
// TODO: When AnthropicMessageResponse gains a `service_tier` field,
// populate it from `self.api_context` (if the original request specified one).
let message = AnthropicMessageResponse {
id: self.message_id.clone(),
object_type: "message".to_string(),
......@@ -182,6 +197,11 @@ impl AnthropicStreamConverter {
// Emit signature delta to close the thinking block.
// The engine doesn't produce Anthropic-style cryptographic signatures,
// so we use "erased" (the standard placeholder per the Anthropic spec).
// When `api_context` is available and the original request had
// `thinking.thinking_type == "enabled"`, this is expected — the backend
// simply doesn't generate real signatures. If/when the backend starts
// returning real signatures, we can use the context to validate or
// pass them through instead of hardcoding "erased".
let sig_delta = AnthropicStreamEvent::ContentBlockDelta {
index: self.thinking_block_index,
delta: AnthropicDelta::SignatureDelta {
......@@ -1071,4 +1091,35 @@ mod tests {
"no block stops in end events"
);
}
/// A converter built through `with_context` must keep the supplied context
/// and still emit the same event sequence as one built through `new`.
#[test]
fn test_with_context_preserves_context() {
    use crate::protocols::unified::AnthropicContext;

    let mut converter = AnthropicStreamConverter::with_context(
        "test-model".into(),
        AnthropicContext {
            service_tier: Some("priority".to_string()),
            ..Default::default()
        },
    );

    // The context is stored verbatim.
    let stored = converter.api_context.as_ref();
    assert!(stored.is_some());
    assert_eq!(stored.unwrap().service_tier.as_deref(), Some("priority"));

    // Should produce the same events as a regular converter
    let chunk_events = converter.process_chunk_tagged(&text_chunk("Hello"));
    assert_eq!(
        event_types(&chunk_events),
        vec!["content_block_start", "content_block_delta"]
    );

    let closing_events = converter.emit_end_events_tagged();
    assert_eq!(
        event_types(&closing_events),
        vec!["content_block_stop", "message_delta", "message_stop"]
    );
}
}
......@@ -120,7 +120,10 @@ impl TryFrom<AnthropicCreateMessageRequest> for NvCreateChatCompletionRequest {
..Default::default()
},
nvext: {
// Collect per-block cache_control: use the last one found
// Lossy: collapse all per-block cache_control into a single
// last-one-wins value. Sufficient for backends with a single
// prefix cache boundary. Full per-block breakpoints are
// preserved in AnthropicContext::cache_breakpoints via UnifiedRequest.
let mut last_block_cc: Option<CacheControl> = None;
for msg in &req.messages {
if let AnthropicMessageContent::Blocks { content } = &msg.content {
......@@ -472,7 +475,9 @@ fn convert_anthropic_tool_choice(tc: &AnthropicToolChoice) -> ChatCompletionTool
pub fn chat_completion_to_anthropic_response(
chat_resp: NvCreateChatCompletionResponse,
model: &str,
api_context: Option<&crate::protocols::unified::AnthropicContext>,
) -> AnthropicMessageResponse {
let _ = api_context; // Available for future enrichment (service_tier, etc.)
let msg_id = format!("msg_{}", Uuid::new_v4().simple());
let choice = chat_resp.inner.choices.into_iter().next();
......@@ -853,7 +858,7 @@ mod tests {
nvext: None,
};
let response = chat_completion_to_anthropic_response(chat_resp, "test-model");
let response = chat_completion_to_anthropic_response(chat_resp, "test-model", None);
assert!(response.id.starts_with("msg_"));
assert_eq!(response.object_type, "message");
assert_eq!(response.role, "assistant");
......
......@@ -37,7 +37,7 @@ pub struct AnnotatedDelta<R> {
pub comment: Option<String>,
}
trait OpenAISamplingOptionsProvider {
pub(crate) trait OpenAISamplingOptionsProvider {
fn get_temperature(&self) -> Option<f32>;
fn get_top_p(&self) -> Option<f32>;
......@@ -55,7 +55,7 @@ trait OpenAISamplingOptionsProvider {
fn nvext(&self) -> Option<&nvext::NvExt>;
}
trait OpenAIStopConditionsProvider {
pub(crate) trait OpenAIStopConditionsProvider {
fn get_max_tokens(&self) -> Option<u32>;
fn get_min_tokens(&self) -> Option<u32>;
......@@ -82,7 +82,7 @@ trait OpenAIStopConditionsProvider {
}
}
trait OpenAIOutputOptionsProvider {
pub(crate) trait OpenAIOutputOptionsProvider {
fn get_logprobs(&self) -> Option<u32>;
fn get_prompt_logprobs(&self) -> Option<u32>;
......
......@@ -695,6 +695,7 @@ fn make_function_call(name: String, arguments: String) -> OutputItem {
pub fn chat_completion_to_response(
nv_resp: NvCreateChatCompletionResponse,
params: &ResponseParams,
api_context: Option<&crate::protocols::unified::ResponsesContext>,
) -> Result<NvResponse, anyhow::Error> {
let nvext = nv_resp.nvext.clone();
let chat_resp = nv_resp.inner;
......@@ -814,7 +815,10 @@ pub fn chat_completion_to_response(
presence_penalty: Some(0.0),
// Echo actual request values, falling back to spec defaults.
// store: false because this branch does not persist responses.
store: params.store.or(Some(false)),
store: api_context
.map(|ctx| ctx.store)
.or(params.store)
.or(Some(false)),
temperature: params.temperature.or(Some(1.0)),
text: Some(params.text.clone().unwrap_or(ResponseTextParam {
format: TextResponseFormatConfiguration::Text,
......@@ -841,7 +845,7 @@ pub fn chat_completion_to_response(
instructions: params.instructions.clone().map(Instructions::Text),
max_output_tokens: params.max_output_tokens,
max_tool_calls: None,
previous_response_id: None,
previous_response_id: api_context.and_then(|ctx| ctx.previous_response_id.clone()),
prompt: None,
prompt_cache_key: None,
prompt_cache_retention: None,
......@@ -1194,7 +1198,8 @@ mod tests {
nvext: None,
};
let wrapped = chat_completion_to_response(chat_resp, &ResponseParams::default()).unwrap();
let wrapped =
chat_completion_to_response(chat_resp, &ResponseParams::default(), None).unwrap();
assert_eq!(wrapped.inner.model, "llama-3.1-8b-instruct");
assert_eq!(wrapped.inner.status, Status::Completed);
......@@ -1254,7 +1259,8 @@ mod tests {
nvext: None,
};
let wrapped = chat_completion_to_response(chat_resp, &ResponseParams::default()).unwrap();
let wrapped =
chat_completion_to_response(chat_resp, &ResponseParams::default(), None).unwrap();
assert_eq!(wrapped.inner.output.len(), 1);
match &wrapped.inner.output[0] {
OutputItem::FunctionCall(fc) => {
......@@ -1449,7 +1455,7 @@ thinking
nvext: None,
};
let resp = chat_completion_to_response(chat_resp, &params).unwrap();
let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
let reasoning = resp.inner.reasoning.unwrap();
assert_eq!(reasoning.effort, Some(ReasoningEffort::High));
}
......@@ -1482,7 +1488,7 @@ thinking
nvext: None,
};
let resp = chat_completion_to_response(chat_resp, &params).unwrap();
let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
let text = resp.inner.text.unwrap();
assert_eq!(text.format, TextResponseFormatConfiguration::JsonObject);
}
......@@ -1510,7 +1516,7 @@ thinking
nvext: None,
};
let resp = chat_completion_to_response(chat_resp, &params).unwrap();
let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
assert_eq!(resp.inner.service_tier, Some(ServiceTier::Flex));
}
......@@ -1598,7 +1604,7 @@ thinking
fn test_include_logprobs_stripped_by_default() {
let chat_resp = make_chat_resp_with_text("hello");
let params = ResponseParams::default();
let resp = chat_completion_to_response(chat_resp, &params).unwrap();
let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
for item in &resp.inner.output {
if let OutputItem::Message(msg) = item {
......@@ -1623,7 +1629,7 @@ thinking
include: Some(vec![IncludeEnum::MessageOutputTextLogprobs]),
..Default::default()
};
let resp = chat_completion_to_response(chat_resp, &params).unwrap();
let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
let mut found_text = false;
for item in &resp.inner.output {
......@@ -1651,7 +1657,7 @@ thinking
truncation: Some(Truncation::Auto),
..Default::default()
};
let resp = chat_completion_to_response(chat_resp, &params).unwrap();
let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
assert_eq!(resp.inner.truncation, Some(Truncation::Auto));
}
......@@ -1659,7 +1665,7 @@ thinking
fn test_truncation_defaults_to_disabled() {
let chat_resp = make_chat_resp_with_text("hello");
let params = ResponseParams::default();
let resp = chat_completion_to_response(chat_resp, &params).unwrap();
let resp = chat_completion_to_response(chat_resp, &params, None).unwrap();
assert_eq!(resp.inner.truncation, Some(Truncation::Disabled));
}
}
......@@ -28,12 +28,15 @@ use dynamo_async_openai::types::ChatCompletionMessageContent;
use super::ResponseParams;
use crate::protocols::openai::chat_completions::NvCreateChatCompletionStreamResponse;
use crate::protocols::unified::ResponsesContext;
/// State machine that converts a chat completion stream into Responses API events.
pub struct ResponseStreamConverter {
response_id: String,
model: String,
params: ResponseParams,
/// Preserved Responses API-specific request context for faithful response reconstruction.
api_context: Option<ResponsesContext>,
created_at: u64,
sequence_number: u64,
// Text message tracking
......@@ -72,6 +75,7 @@ impl ResponseStreamConverter {
response_id: format!("resp_{}", Uuid::new_v4().simple()),
model,
params,
api_context: None,
created_at,
sequence_number: 0,
message_item_id: format!("msg_{}", Uuid::new_v4().simple()),
......@@ -84,6 +88,12 @@ impl ResponseStreamConverter {
}
}
pub fn with_context(model: String, params: ResponseParams, context: ResponsesContext) -> Self {
let mut converter = Self::new(model, params);
converter.api_context = Some(context);
converter
}
fn next_seq(&mut self) -> u64 {
let seq = self.sequence_number;
self.sequence_number += 1;
......@@ -116,7 +126,12 @@ impl ResponseStreamConverter {
parallel_tool_calls: Some(true),
presence_penalty: Some(0.0),
// store: false because this branch does not persist responses.
store: self.params.store.or(Some(false)),
store: self
.api_context
.as_ref()
.map(|ctx| ctx.store)
.or(self.params.store)
.or(Some(false)),
temperature: self.params.temperature.or(Some(1.0)),
text: Some(self.params.text.clone().unwrap_or(ResponseTextParam {
format: TextResponseFormatConfiguration::Text,
......@@ -144,7 +159,10 @@ impl ResponseStreamConverter {
instructions: self.params.instructions.clone().map(Instructions::Text),
max_output_tokens: self.params.max_output_tokens,
max_tool_calls: None,
previous_response_id: None,
previous_response_id: self
.api_context
.as_ref()
.and_then(|ctx| ctx.previous_response_id.clone()),
prompt: None,
prompt_cache_key: None,
prompt_cache_retention: None,
......@@ -654,6 +672,7 @@ fn get_event_type(event: &ResponseStreamEvent) -> &'static str {
#[cfg(test)]
mod tests {
use super::*;
use crate::protocols::unified::ResponsesContext;
use dynamo_async_openai::types::{
ChatChoiceStream, ChatCompletionMessageContent, ChatCompletionMessageToolCallChunk,
ChatCompletionStreamResponseDelta, ChatCompletionToolType, FunctionCallStream,
......@@ -912,4 +931,41 @@ mod tests {
"output_item.done inline after text: {tool_types:?}"
);
}
/// A converter built through `with_context` must echo the context's
/// `previous_response_id` and `store` in the Response objects it builds.
#[test]
fn test_with_context_enriches_response() {
    let mut converter = ResponseStreamConverter::with_context(
        "test-model".into(),
        ResponseParams::default(),
        ResponsesContext {
            previous_response_id: Some("resp_prev_123".to_string()),
            store: true,
            ..Default::default()
        },
    );

    // Drive a minimal stream (start, one text chunk, end) so there's output.
    let _ = converter.emit_start_events();
    let _ = converter.process_chunk(&text_chunk("Hello"));
    let _ = converter.emit_end_events();

    // Verify the Response object carries the context values through
    let built = converter.make_response(Status::Completed, vec![]);
    assert_eq!(built.previous_response_id.as_deref(), Some("resp_prev_123"));
    assert_eq!(built.store, Some(true));
}
/// A converter built through `new` has no context: `previous_response_id`
/// stays `None` and `store` falls back to `Some(false)`.
#[test]
fn test_without_context_defaults() {
    let converter = ResponseStreamConverter::new("test-model".into(), ResponseParams::default());

    let built = converter.make_response(Status::Completed, vec![]);

    assert_eq!(built.previous_response_id, None);
    assert_eq!(built.store, Some(false));
}
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Unified internal request representation.
//!
//! `UnifiedRequest` is an API-agnostic wrapper that carries a fully-converted
//! `NvCreateChatCompletionRequest` alongside the API-specific context that
//! would otherwise be lost during the fan-in conversion.
//!
//! # Motivation
//!
//! Dynamo's HTTP frontend uses an hourglass architecture: multiple API surfaces
//! (Chat Completions, Anthropic Messages, Responses) fan in through `TryFrom`
//! to `NvCreateChatCompletionRequest`. Non-OpenAI features are lossy-compressed
//! or silently dropped during this conversion. `UnifiedRequest` preserves that
//! context so it can flow through the preprocessor and be used on the response
//! path for faithful reconstruction.
//!
//! # Architecture
//!
//! ```text
//! Anthropic Messages ──┐
//! OpenAI Responses ────┼──→ UnifiedRequest { inner: NvCreateChatCompletionRequest, api_context, ... }
//! OpenAI Chat ─────────┘          │
//!                                 ↓
//!                        PreprocessedRequest ──→ Backend
//! ```
//!
//! The existing preprocessor pipeline is unchanged — `UnifiedRequest` implements
//! all the same traits (`OAIChatLikeRequest`, `SamplingOptionsProvider`, etc.)
//! by delegating to the inner `NvCreateChatCompletionRequest`. The additional
//! context fields are carried through for response-path use.
use std::collections::HashMap;
use dynamo_runtime::protocols::annotated::AnnotationsProvider;
use serde::{Deserialize, Serialize};
use crate::preprocessor::media::MediaDecoder;
use crate::preprocessor::prompt::{OAIChatLikeRequest, TextInput};
use crate::protocols::openai::chat_completions::NvCreateChatCompletionRequest;
use crate::protocols::openai::common_ext::{CommonExt, CommonExtProvider};
use crate::protocols::openai::nvext::{CacheControl, NvExt, NvExtProvider};
use crate::protocols::openai::{
OpenAIOutputOptionsProvider, OpenAISamplingOptionsProvider, OpenAIStopConditionsProvider,
};
use dynamo_async_openai::types::responses::{IncludeEnum, Reasoning, Truncation};
use super::anthropic::types::{AnthropicCreateMessageRequest, ThinkingConfig};
use super::openai::responses::NvCreateResponse;
/// Identifies which API surface originated the request and carries
/// fields specific to that API that cannot be represented in the
/// OpenAI Chat Completions format.
///
/// The variant is chosen by the `UnifiedRequest` conversions in this module:
/// `From<NvCreateChatCompletionRequest>` produces `ChatCompletions`, and the
/// two `TryFrom` impls produce `Anthropic` and `Responses` respectively.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ApiContext {
/// Request came from the OpenAI Chat Completions API.
/// All fields are natively represented in `NvCreateChatCompletionRequest`,
/// so this variant carries no payload.
ChatCompletions,
/// Request came from the Anthropic Messages API; the payload holds the
/// Anthropic-only fields captured before the lossy conversion.
Anthropic(AnthropicContext),
/// Request came from the OpenAI Responses API; the payload holds the
/// Responses-only fields captured before the lossy conversion.
Responses(ResponsesContext),
}
/// Anthropic-specific fields preserved from `AnthropicCreateMessageRequest`.
///
/// Populated by `TryFrom<AnthropicCreateMessageRequest> for UnifiedRequest`
/// before the request is converted to `NvCreateChatCompletionRequest`.
/// `Default` yields an empty context (no options set, no breakpoints, flag off).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct AnthropicContext {
/// Extended thinking configuration (`type` + `budget_tokens`).
/// Dropped during conversion because `NvCreateChatCompletionRequest` has
/// no equivalent — only `reasoning_effort` (a string) exists.
/// Copied verbatim from the request's `thinking` field.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub thinking: Option<ThinkingConfig>,
/// Per-block cache control breakpoints with their position in the
/// message array. The existing Anthropic→Chat Completions conversion
/// collapses all per-block `cache_control` annotations into a single
/// last-one-wins `nvext.cache_control` field. This preserves the full
/// per-block granularity for future use (e.g., multi-breakpoint prefix
/// caching, or faithfully reporting per-breakpoint `cache_creation_input_tokens`
/// / `cache_read_input_tokens` in the response).
///
/// Ordering: a system-level breakpoint (if any) comes first, followed by
/// message breakpoints in message order, then block order.
/// Omitted from serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub cache_breakpoints: Vec<CacheBreakpoint>,
/// When true, the model should not issue parallel tool calls.
/// The Anthropic API supports `disable_parallel_tool_use` on the tool_choice
/// object but there is no OpenAI equivalent field.
/// `false` when the request has no `tool_choice` or the flag is unset.
#[serde(default)]
pub disable_parallel_tool_use: bool,
/// Anthropic request metadata (e.g. `user_id`), kept as raw JSON.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub metadata: Option<serde_json::Value>,
/// Service tier selection from the Anthropic request.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub service_tier: Option<String>,
/// Container identifier for stateful sandbox sessions.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub container: Option<String>,
/// Output configuration (effort level, JSON schema format), kept as raw JSON.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub output_config: Option<serde_json::Value>,
}
/// Responses API-specific fields preserved from `NvCreateResponse`.
///
/// Populated by `TryFrom<NvCreateResponse> for UnifiedRequest` before the
/// request is converted to `NvCreateChatCompletionRequest`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct ResponsesContext {
/// Conversation continuation identifier.
/// Dropped during conversion — no OpenAI Chat equivalent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub previous_response_id: Option<String>,
/// Context truncation strategy.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub truncation: Option<Truncation>,
/// Reasoning configuration (effort + optional summary generation).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub reasoning: Option<Reasoning>,
/// Output items to include in the response.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub include: Option<Vec<IncludeEnum>>,
/// Whether responses should be stored server-side.
///
/// This is a plain `bool`: a request that omits `store` is captured as
/// `false`, so "unset" and "explicitly false" are indistinguishable here.
#[serde(default)]
pub store: bool,
}
/// A cache breakpoint records the position (message index, block index)
/// and the cache control directive from the original Anthropic request.
///
/// Produced by `extract_cache_breakpoints`, which only records the system
/// prompt and block-form message content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheBreakpoint {
/// Logical position of the owning message.
///
/// A system-level breakpoint is recorded at position 0. When the request
/// has a system prompt, entries from the messages array are shifted up by
/// one (array index + 1); otherwise this equals the array index.
pub message_index: usize,
/// Index of the content block within the message.
/// Always 0 for the system-level breakpoint.
pub block_index: usize,
/// The cache control directive.
pub cache_control: CacheControl,
}
/// API-agnostic request wrapper that preserves the full context from any
/// API surface while remaining compatible with the existing preprocessor.
///
/// Built via `From<NvCreateChatCompletionRequest>` (infallible) or via
/// `TryFrom` from an Anthropic Messages / Responses request, which first
/// captures the API-specific fields and then runs the existing conversion.
#[derive(Debug, Clone)]
pub struct UnifiedRequest {
/// The core request in OpenAI Chat Completions format.
/// This is what the preprocessor already knows how to handle.
/// The provider trait impls in this module read from this field.
pub inner: NvCreateChatCompletionRequest,
/// Which API surface originated this request, plus API-specific fields
/// that were dropped during conversion to `NvCreateChatCompletionRequest`.
pub api_context: ApiContext,
}
impl From<NvCreateChatCompletionRequest> for UnifiedRequest {
fn from(req: NvCreateChatCompletionRequest) -> Self {
Self {
inner: req,
api_context: ApiContext::ChatCompletions,
}
}
}
impl TryFrom<AnthropicCreateMessageRequest> for UnifiedRequest {
type Error = anyhow::Error;
fn try_from(req: AnthropicCreateMessageRequest) -> Result<Self, Self::Error> {
// Capture API-specific fields BEFORE the lossy conversion
let anthropic_ctx = AnthropicContext {
thinking: req.thinking.clone(),
cache_breakpoints: extract_cache_breakpoints(&req),
disable_parallel_tool_use: extract_disable_parallel_tool_use(&req),
metadata: req.metadata.clone(),
service_tier: req.service_tier.clone(),
container: req.container.clone(),
output_config: req.output_config.clone(),
};
// Perform the existing lossy conversion
let inner: NvCreateChatCompletionRequest = req.try_into()?;
Ok(Self {
inner,
api_context: ApiContext::Anthropic(anthropic_ctx),
})
}
}
impl TryFrom<NvCreateResponse> for UnifiedRequest {
type Error = anyhow::Error;
fn try_from(req: NvCreateResponse) -> Result<Self, Self::Error> {
// Capture API-specific fields BEFORE the lossy conversion
let responses_ctx = ResponsesContext {
previous_response_id: req.inner.previous_response_id.clone(),
truncation: req.inner.truncation,
reasoning: req.inner.reasoning.clone(),
include: req.inner.include.clone(),
store: req.inner.store.unwrap_or(false),
};
// Perform the existing lossy conversion
let inner: NvCreateChatCompletionRequest = req.try_into()?;
Ok(Self {
inner,
api_context: ApiContext::Responses(responses_ctx),
})
}
}
/// Collect every `cache_control` annotation in an Anthropic request together
/// with its logical (message_index, block_index) position.
///
/// The system prompt, when it carries a directive, is reported at position
/// (0, 0). Message positions are shifted by one whenever a system prompt is
/// present, so they never collide with it. Only block-form message content
/// is inspected, and only `Text`, `ToolUse`, `ToolResult` and `Thinking`
/// blocks can contribute a breakpoint.
fn extract_cache_breakpoints(req: &AnthropicCreateMessageRequest) -> Vec<CacheBreakpoint> {
    use super::anthropic::types::{AnthropicContentBlock, AnthropicMessageContent};

    let mut found = Vec::new();

    // System-level cache control; the system prompt is logically position 0.
    let system_directive = req
        .system
        .as_ref()
        .and_then(|system| system.cache_control.as_ref());
    if let Some(directive) = system_directive {
        found.push(CacheBreakpoint {
            message_index: 0,
            block_index: 0,
            cache_control: directive.clone(),
        });
    }

    // Messages start after the system slot when one exists.
    let shift = usize::from(req.system.is_some());

    for (position, message) in req.messages.iter().enumerate() {
        let AnthropicMessageContent::Blocks { content } = &message.content else {
            continue;
        };

        let tagged = content
            .iter()
            .enumerate()
            .filter_map(|(block_index, block)| {
                let directive = match block {
                    AnthropicContentBlock::Text { cache_control, .. }
                    | AnthropicContentBlock::ToolUse { cache_control, .. }
                    | AnthropicContentBlock::ToolResult { cache_control, .. }
                    | AnthropicContentBlock::Thinking { cache_control, .. } => {
                        cache_control.as_ref()?
                    }
                    _ => return None,
                };
                Some(CacheBreakpoint {
                    message_index: position + shift,
                    block_index,
                    cache_control: directive.clone(),
                })
            });
        found.extend(tagged);
    }

    found
}
/// Read the `disable_parallel_tool_use` flag from the request's `tool_choice`.
///
/// Anthropic accepts e.g. `{"type": "auto", "disable_parallel_tool_use": true}`,
/// which has no OpenAI Chat counterpart. Returns `false` when there is no
/// `tool_choice` or the flag is unset.
fn extract_disable_parallel_tool_use(req: &AnthropicCreateMessageRequest) -> bool {
    use super::anthropic::types::AnthropicToolChoice;

    let flag = match &req.tool_choice {
        Some(AnthropicToolChoice::Simple(choice)) => choice.disable_parallel_tool_use,
        Some(AnthropicToolChoice::Named(choice)) => choice.disable_parallel_tool_use,
        None => None,
    };
    flag.unwrap_or(false)
}
// Trait implementations — delegate to inner NvCreateChatCompletionRequest
impl NvExtProvider for UnifiedRequest {
fn nvext(&self) -> Option<&NvExt> {
self.inner.nvext.as_ref()
}
fn raw_prompt(&self) -> Option<String> {
None
}
/// Returns the single collapsed cache control from `nvext`. This is the
/// last-one-wins value produced by the Anthropic→Chat Completions conversion
/// and is sufficient for backends that support a single prefix cache boundary
/// (SGLang, vLLM). For per-block granularity, consult
/// `AnthropicContext::cache_breakpoints` via the `ApiContext` sidecar.
fn effective_cache_control(&self) -> Option<&CacheControl> {
NvExtProvider::nvext(self).and_then(|ext| ext.cache_control.as_ref())
}
}
impl AnnotationsProvider for UnifiedRequest {
    /// Returns an owned copy of the annotations requested via
    /// `nvext.annotations`, or `None` when the wrapped request has no `nvext`
    /// block or no annotation list.
    fn annotations(&self) -> Option<Vec<String>> {
        self.inner
            .nvext
            .as_ref()
            .and_then(|nvext| nvext.annotations.clone())
    }

    /// Reports whether `annotation` appears in `nvext.annotations`.
    ///
    /// Returns `false` when there is no `nvext` block or no annotation list.
    /// The comparison borrows both sides, so no temporary `String` is
    /// allocated per lookup.
    fn has_annotation(&self, annotation: &str) -> bool {
        self.inner
            .nvext
            .as_ref()
            .and_then(|nvext| nvext.annotations.as_ref())
            .is_some_and(|annotations| {
                annotations
                    .iter()
                    .any(|entry| entry.as_str() == annotation)
            })
    }
}
impl OpenAISamplingOptionsProvider for UnifiedRequest {
    /// Borrows the `nvext` extension block of the wrapped chat request.
    fn nvext(&self) -> Option<&NvExt> {
        self.inner.nvext.as_ref()
    }

    // --- Plain field reads from the wrapped OpenAI chat request ---

    /// `temperature` as set on the wrapped chat request.
    fn get_temperature(&self) -> Option<f32> {
        let chat = &self.inner.inner;
        chat.temperature
    }

    /// `top_p` as set on the wrapped chat request.
    fn get_top_p(&self) -> Option<f32> {
        let chat = &self.inner.inner;
        chat.top_p
    }

    /// `frequency_penalty` as set on the wrapped chat request.
    fn get_frequency_penalty(&self) -> Option<f32> {
        let chat = &self.inner.inner;
        chat.frequency_penalty
    }

    /// `presence_penalty` as set on the wrapped chat request.
    fn get_presence_penalty(&self) -> Option<f32> {
        let chat = &self.inner.inner;
        chat.presence_penalty
    }

    /// `seed` as set on the wrapped chat request.
    fn get_seed(&self) -> Option<i64> {
        let chat = &self.inner.inner;
        chat.seed
    }

    /// `n` as set on the wrapped chat request.
    fn get_n(&self) -> Option<u8> {
        let chat = &self.inner.inner;
        chat.n
    }

    /// Forwards to the wrapped request's own `get_best_of` implementation.
    fn get_best_of(&self) -> Option<u8> {
        let wrapped = &self.inner;
        OpenAISamplingOptionsProvider::get_best_of(wrapped)
    }
}
impl CommonExtProvider for UnifiedRequest {
    /// Borrows the `common` extension block of the wrapped chat request.
    /// Always `Some`, because the block is stored by value.
    fn common_ext(&self) -> Option<&CommonExt> {
        let common = &self.inner.common;
        Some(common)
    }

    /// Forwards to the wrapped request's implementation, which handles the
    /// `tool_choice` → guided_json and `response_format` → guided_json
    /// derivation.
    fn get_guided_json(&self) -> Option<serde_json::Value> {
        let wrapped = &self.inner;
        CommonExtProvider::get_guided_json(wrapped)
    }

    // --- Guided-decoding fields: owned copies of the `common` values ---

    /// Owned copy of `common.guided_regex`.
    fn get_guided_regex(&self) -> Option<String> {
        let common = &self.inner.common;
        common.guided_regex.clone()
    }

    /// Owned copy of `common.guided_grammar`.
    fn get_guided_grammar(&self) -> Option<String> {
        let common = &self.inner.common;
        common.guided_grammar.clone()
    }

    /// Owned copy of `common.guided_choice`.
    fn get_guided_choice(&self) -> Option<Vec<String>> {
        let common = &self.inner.common;
        common.guided_choice.clone()
    }

    /// Owned copy of `common.guided_decoding_backend`.
    fn get_guided_decoding_backend(&self) -> Option<String> {
        let common = &self.inner.common;
        common.guided_decoding_backend.clone()
    }

    /// Owned copy of `common.guided_whitespace_pattern`.
    fn get_guided_whitespace_pattern(&self) -> Option<String> {
        let common = &self.inner.common;
        common.guided_whitespace_pattern.clone()
    }

    // --- Scalar fields read straight from `common` ---

    /// `common.top_k`.
    fn get_top_k(&self) -> Option<i32> {
        let common = &self.inner.common;
        common.top_k
    }

    /// `common.min_p`.
    fn get_min_p(&self) -> Option<f32> {
        let common = &self.inner.common;
        common.min_p
    }

    /// `common.repetition_penalty`.
    fn get_repetition_penalty(&self) -> Option<f32> {
        let common = &self.inner.common;
        common.repetition_penalty
    }

    /// `common.include_stop_str_in_output`.
    fn get_include_stop_str_in_output(&self) -> Option<bool> {
        let common = &self.inner.common;
        common.include_stop_str_in_output
    }

    /// `common.skip_special_tokens`.
    fn get_skip_special_tokens(&self) -> Option<bool> {
        let common = &self.inner.common;
        common.skip_special_tokens
    }
}
impl OpenAIStopConditionsProvider for UnifiedRequest {
    /// Borrows the `nvext` extension block of the wrapped chat request.
    fn nvext(&self) -> Option<&NvExt> {
        self.inner.nvext.as_ref()
    }

    /// Token limit for the completion: `max_completion_tokens` when set,
    /// otherwise the `max_tokens` field.
    // The allow is needed for the `max_tokens` read below; presumably that
    // field is marked deprecated upstream — confirm against the request type.
    #[allow(deprecated)]
    fn get_max_tokens(&self) -> Option<u32> {
        let chat = &self.inner.inner;
        chat.max_completion_tokens.or(chat.max_tokens)
    }

    /// `common.min_tokens` of the wrapped request.
    fn get_min_tokens(&self) -> Option<u32> {
        self.inner.common.min_tokens
    }

    /// Stop sequences as an owned list. A single-string stop value is
    /// returned as a one-element vector; `None` when no stop is configured.
    fn get_stop(&self) -> Option<Vec<String>> {
        use dynamo_async_openai::types::Stop;

        let configured = self.inner.inner.stop.as_ref()?;
        let sequences = match configured {
            Stop::String(single) => vec![single.clone()],
            Stop::StringArray(many) => many.clone(),
        };
        Some(sequences)
    }

    /// `common.ignore_eos` of the wrapped request.
    fn get_common_ignore_eos(&self) -> Option<bool> {
        self.inner.common.ignore_eos
    }
}
impl OpenAIOutputOptionsProvider for UnifiedRequest {
    /// Number of log-probabilities requested for output tokens.
    ///
    /// Returns `None` unless `logprobs` is explicitly `Some(true)`. When it
    /// is, the result is `top_logprobs` widened to `u32`, or `1` if
    /// `top_logprobs` is unset.
    fn get_logprobs(&self) -> Option<u32> {
        let chat = &self.inner.inner;
        // `Some(false)` and `None` both mean "no logprobs".
        if !matches!(chat.logprobs, Some(true)) {
            return None;
        }
        Some(chat.top_logprobs.map_or(1_u32, |top| top as u32))
    }

    /// Forwards to the wrapped request's own `get_prompt_logprobs`.
    fn get_prompt_logprobs(&self) -> Option<u32> {
        let wrapped = &self.inner;
        OpenAIOutputOptionsProvider::get_prompt_logprobs(wrapped)
    }

    /// Forwards to the wrapped request's own `get_skip_special_tokens`.
    fn get_skip_special_tokens(&self) -> Option<bool> {
        let wrapped = &self.inner;
        OpenAIOutputOptionsProvider::get_skip_special_tokens(wrapped)
    }

    /// Forwards to the wrapped request's own `get_formatted_prompt`.
    fn get_formatted_prompt(&self) -> Option<bool> {
        let wrapped = &self.inner;
        OpenAIOutputOptionsProvider::get_formatted_prompt(wrapped)
    }
}
impl OAIChatLikeRequest for UnifiedRequest {
    /// Owned copy of the model name from the wrapped chat request.
    fn model(&self) -> String {
        let chat = &self.inner.inner;
        chat.model.clone()
    }

    /// The chat messages as a minijinja value.
    ///
    /// The messages are first converted to a `serde_json::Value` and that
    /// JSON value is then handed to minijinja.
    ///
    /// # Panics
    ///
    /// Panics if the messages cannot be converted to JSON.
    fn messages(&self) -> minijinja::value::Value {
        use minijinja::value::Value;

        let as_json = serde_json::to_value(&self.inner.inner.messages).unwrap();
        Value::from_serialize(&as_json)
    }

    /// Borrowed, strongly-typed view of the chat messages. Always `Some`.
    fn typed_messages(
        &self,
    ) -> Option<&[dynamo_async_openai::types::ChatCompletionRequestMessage]> {
        let all = &self.inner.inner.messages[..];
        Some(all)
    }

    // --- The remaining template inputs forward to the wrapped request ---

    /// Forwards to the wrapped request's `tools`.
    fn tools(&self) -> Option<minijinja::value::Value> {
        let wrapped = &self.inner;
        OAIChatLikeRequest::tools(wrapped)
    }

    /// Forwards to the wrapped request's `tool_choice`.
    fn tool_choice(&self) -> Option<minijinja::value::Value> {
        let wrapped = &self.inner;
        OAIChatLikeRequest::tool_choice(wrapped)
    }

    /// Forwards to the wrapped request's `response_format`.
    fn response_format(&self) -> Option<minijinja::value::Value> {
        let wrapped = &self.inner;
        OAIChatLikeRequest::response_format(wrapped)
    }

    /// Forwards to the wrapped request's `should_add_generation_prompt`.
    fn should_add_generation_prompt(&self) -> bool {
        let wrapped = &self.inner;
        OAIChatLikeRequest::should_add_generation_prompt(wrapped)
    }

    /// Forwards to the wrapped request's `extract_text`.
    fn extract_text(&self) -> Option<TextInput> {
        let wrapped = &self.inner;
        OAIChatLikeRequest::extract_text(wrapped)
    }

    /// Borrows the per-request chat template arguments, if any were supplied.
    fn chat_template_args(&self) -> Option<&HashMap<String, serde_json::Value>> {
        let wrapped = &self.inner;
        wrapped.chat_template_args.as_ref()
    }

    /// Borrows the per-request media decoder settings, if any were supplied.
    fn media_io_kwargs(&self) -> Option<&MediaDecoder> {
        let wrapped = &self.inner;
        wrapped.media_io_kwargs.as_ref()
    }
}
impl UnifiedRequest {
    /// Borrows the Anthropic-specific context.
    ///
    /// Returns `Some` only when the request originated from the Anthropic
    /// Messages API, i.e. `api_context` is the `Anthropic` variant.
    pub fn anthropic_context(&self) -> Option<&AnthropicContext> {
        if let ApiContext::Anthropic(ctx) = &self.api_context {
            Some(ctx)
        } else {
            None
        }
    }

    /// Borrows the Responses-specific context.
    ///
    /// Returns `Some` only when the request originated from the OpenAI
    /// Responses API, i.e. `api_context` is the `Responses` variant.
    pub fn responses_context(&self) -> Option<&ResponsesContext> {
        if let ApiContext::Responses(ctx) = &self.api_context {
            Some(ctx)
        } else {
            None
        }
    }

    /// Consumes the wrapper and yields the inner
    /// `NvCreateChatCompletionRequest`, discarding the API context.
    ///
    /// Intended for gradual migration: callers that do not need the extra
    /// context can unwrap and keep using the existing code paths unchanged.
    pub fn into_inner(self) -> NvCreateChatCompletionRequest {
        let Self { inner, .. } = self;
        inner
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain chat completion request wraps into a `UnifiedRequest` tagged
    /// `ChatCompletions`, is readable through the trait accessors, and comes
    /// back out of `into_inner` with its model and messages intact.
    #[test]
    fn test_chat_completions_roundtrip() {
        let req = NvCreateChatCompletionRequest {
            inner: dynamo_async_openai::types::CreateChatCompletionRequest {
                model: "test-model".to_string(),
                messages: vec![],
                ..Default::default()
            },
            common: CommonExt::default(),
            nvext: None,
            chat_template_args: None,
            media_io_kwargs: None,
            unsupported_fields: Default::default(),
        };

        // Forward leg: wrap a copy so the original stays available for the
        // comparison below.
        let unified = UnifiedRequest::from(req.clone());
        assert!(matches!(unified.api_context, ApiContext::ChatCompletions));
        assert_eq!(unified.model(), "test-model");

        // Return leg: unwrapping must hand back the same request contents.
        let restored = unified.into_inner();
        assert_eq!(restored.inner.model, req.inner.model);
        assert_eq!(restored.inner.messages.len(), req.inner.messages.len());
    }

    /// Converting an Anthropic request keeps the Anthropic-only fields
    /// (thinking config, metadata) reachable through `anthropic_context`
    /// while the result still behaves as a chat-like request.
    #[test]
    fn test_anthropic_context_preserved() {
        use super::super::anthropic::types::*;

        let req = AnthropicCreateMessageRequest {
            model: "claude-sonnet-4-20250514".to_string(),
            max_tokens: 1024,
            messages: vec![AnthropicMessage {
                role: AnthropicRole::User,
                content: AnthropicMessageContent::Text {
                    content: "Hello".to_string(),
                },
            }],
            system: None,
            temperature: Some(0.7),
            top_p: None,
            top_k: None,
            stop_sequences: None,
            stream: true,
            metadata: Some(serde_json::json!({"user_id": "test"})),
            tools: None,
            tool_choice: None,
            cache_control: None,
            thinking: Some(ThinkingConfig {
                thinking_type: "enabled".to_string(),
                budget_tokens: Some(4096),
            }),
            service_tier: None,
            container: None,
            output_config: None,
        };

        let unified = UnifiedRequest::try_from(req).unwrap();

        // Verify the context was preserved.
        let ctx = unified.anthropic_context().unwrap();
        assert!(ctx.thinking.is_some());
        let thinking = ctx.thinking.as_ref().unwrap();
        assert_eq!(thinking.thinking_type, "enabled");
        assert_eq!(thinking.budget_tokens, Some(4096));
        assert!(ctx.metadata.is_some());

        // Verify it still works as a preprocessor input.
        assert_eq!(unified.model(), "claude-sonnet-4-20250514");
        assert!(unified.extract_text().is_some());
    }

    /// Converting a Responses API request keeps the Responses-only fields
    /// reachable through `responses_context`.
    #[test]
    fn test_responses_context_preserved() {
        // Construct an NvCreateResponse via JSON to satisfy all required fields.
        let json = serde_json::json!({
            "model": "gpt-4o",
            "input": "What is the capital of France?",
            "previous_response_id": "resp_abc123",
            "store": true,
            "truncation": "auto",
            "reasoning": {
                "effort": "medium"
            },
            "include": ["message.output_text.logprobs"]
        });
        let req: NvCreateResponse = serde_json::from_value(json).unwrap();

        let unified = UnifiedRequest::try_from(req).unwrap();

        let ctx = unified.responses_context().unwrap();
        assert_eq!(ctx.previous_response_id.as_deref(), Some("resp_abc123"));
        assert!(ctx.store);
        assert!(ctx.truncation.is_some());
        assert!(ctx.reasoning.is_some());
        assert!(ctx.include.is_some());
        assert_eq!(ctx.include.as_ref().unwrap().len(), 1);

        // Verify it still works as a preprocessor input.
        assert_eq!(unified.model(), "gpt-4o");
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment