"lib/llm/src/vscode:/vscode.git/clone" did not exist on "5d90e530bc4ff683a779b2bc0b9237cfcc2504fd"
Unverified Commit b6603d90 authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

feat: add Anthropic Messages API endpoint (/v1/messages) (#6231)


Signed-off-by: Matej Kosec <mkosec@nvidia.com>
Signed-off-by: Marko Kosec <mkosec@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Matej Kosec <mkosec@nvidia.com>
parent 858f33fc
......@@ -81,6 +81,7 @@ class FrontendConfig(ConfigBase):
request_plane: str
event_plane: str
chat_processor: str
enable_anthropic_api: bool
exp_python_factory: bool
def validate(self) -> None:
......@@ -492,6 +493,16 @@ class FrontendArgGroup(ArgGroup):
help="Determines how events are published [nats|zmq]",
choices=["nats", "zmq"],
)
add_negatable_bool_argument(
g,
flag_name="--enable-anthropic-api",
env_var="DYN_ENABLE_ANTHROPIC_API",
default=False,
help=(
"[EXPERIMENTAL] Enable Anthropic Messages API endpoint (/v1/messages). "
"This feature is experimental and may change."
),
)
add_argument(
g,
flag_name="--chat-processor",
......
......@@ -233,6 +233,9 @@ async def async_main():
if config.kserve_grpc_server and config.grpc_metrics_port:
kwargs["http_metrics_port"] = config.grpc_metrics_port
if config.enable_anthropic_api:
os.environ["DYN_ENABLE_ANTHROPIC_API"] = "1"
if config.chat_processor == "vllm":
assert (
vllm_flags is not None
......
......@@ -51,6 +51,7 @@ fn generate_openapi() -> anyhow::Result<()> {
.enable_cmpl_endpoints(true)
.enable_embeddings_endpoints(true)
.enable_responses_endpoints(true)
.enable_anthropic_endpoints(true)
.build()
.context("failed to build HttpService for OpenAPI generation")?;
......
......@@ -18,6 +18,8 @@ pub enum EndpointType {
Videos,
/// Responses API
Responses,
/// Anthropic Messages API
AnthropicMessages,
}
impl EndpointType {
......@@ -29,6 +31,7 @@ impl EndpointType {
Self::Images => "images",
Self::Videos => "videos",
Self::Responses => "responses",
Self::AnthropicMessages => "anthropic_messages",
}
}
......@@ -40,6 +43,7 @@ impl EndpointType {
Self::Images,
Self::Videos,
Self::Responses,
Self::AnthropicMessages,
]
}
}
......@@ -262,6 +262,7 @@ impl KserveServiceConfigBuilder {
.enable_cmpl_endpoints(false)
.enable_embeddings_endpoints(false)
.enable_responses_endpoints(false)
.enable_anthropic_endpoints(false)
.build()?;
// Share the HTTP service's model manager and metrics object with gRPC state
......
......@@ -18,6 +18,7 @@
//!
//! The [`service_v2::HttpService`] can be further extended to host any [`axum::Router`] using the [`service_v2::HttpServiceConfigBuilder`].
mod anthropic;
mod openai;
pub mod busy_threshold;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! HTTP handler for the Anthropic Messages API (`/v1/messages`).
//!
//! This is a translation layer: incoming Anthropic requests are converted to
//! chat completions, processed by the existing engine, and responses/streams
//! are converted back to Anthropic format.
use std::sync::Arc;
use axum::{
Json, Router,
body::Body,
extract::State,
http::{HeaderMap, Request, StatusCode},
middleware::{self, Next},
response::{
IntoResponse, Response,
sse::{KeepAlive, Sse},
},
routing::post,
};
use dynamo_runtime::pipeline::{AsyncEngineContextProvider, Context};
use futures::{StreamExt, stream};
use tracing::Instrument;
use super::{
RouteDoc,
disconnect::{ConnectionHandle, create_connection_monitor, monitor_for_disconnects},
metrics::{Endpoint, process_response_and_observe_metrics},
service_v2,
};
use crate::protocols::anthropic::stream_converter::AnthropicStreamConverter;
use crate::protocols::anthropic::types::{
AnthropicCountTokensRequest, AnthropicCountTokensResponse, AnthropicCreateMessageRequest,
AnthropicErrorBody, AnthropicErrorResponse, chat_completion_to_anthropic_response,
};
use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
aggregator::ChatCompletionAggregator,
};
use crate::request_template::RequestTemplate;
// Re-use helpers from the openai module (sibling under service/)
use super::openai::{get_body_limit, get_or_create_request_id};
// ---------------------------------------------------------------------------
// Router
// ---------------------------------------------------------------------------
/// Creates the router for the `/v1/messages` and `/v1/messages/count_tokens` endpoints.
///
/// # Arguments
///
/// * `state` - shared HTTP service state handed to both handlers.
/// * `template` - optional request defaults handed to both handlers.
/// * `path` - override for the messages route; defaults to `/v1/messages`.
///   The count-tokens route is always `<path>/count_tokens`.
///
/// # Returns
///
/// The route documentation entries (one per `POST` route) and the router
/// itself, wrapped in the Anthropic error middleware and the body-size limit.
pub fn anthropic_messages_router(
    state: Arc<service_v2::State>,
    template: Option<RequestTemplate>,
    path: Option<String>,
) -> (Vec<RouteDoc>, Router) {
    // Build the default lazily so no allocation happens when an override is given.
    let path = path.unwrap_or_else(|| "/v1/messages".to_string());
    // Drop trailing slashes before appending so a configured path such as
    // `/v1/messages/` does not yield `/v1/messages//count_tokens`.
    let count_tokens_path = format!("{}/count_tokens", path.trim_end_matches('/'));

    let docs = vec![
        RouteDoc::new(axum::http::Method::POST, &path),
        RouteDoc::new(axum::http::Method::POST, &count_tokens_path),
    ];

    let router = Router::new()
        .route(&path, post(handler_anthropic_messages))
        .route(&count_tokens_path, post(handler_count_tokens))
        .layer(middleware::from_fn(anthropic_error_middleware))
        .layer(axum::extract::DefaultBodyLimit::max(get_body_limit()))
        .with_state((state, template));

    (docs, router)
}
// ---------------------------------------------------------------------------
// Error middleware
// ---------------------------------------------------------------------------
/// Rewrites `422 Unprocessable Entity` responses into Anthropic-formatted
/// `400 invalid_request_error` responses.
///
/// Any other response is passed through untouched. For a 422 the original
/// body (read up to the configured body limit, lossily decoded as UTF-8)
/// becomes the error message; if the body cannot be read the message is empty.
async fn anthropic_error_middleware(request: Request<Body>, next: Next) -> Response {
    let response = next.run(request).await;
    if response.status() != StatusCode::UNPROCESSABLE_ENTITY {
        return response;
    }

    // Only the body is reused; the original status and headers are discarded.
    let raw_body = axum::body::to_bytes(response.into_body(), get_body_limit())
        .await
        .unwrap_or_default();
    let detail = String::from_utf8_lossy(&raw_body);

    anthropic_error(StatusCode::BAD_REQUEST, "invalid_request_error", &detail)
}
// ---------------------------------------------------------------------------
// Handlers
// ---------------------------------------------------------------------------
/// Top-level HTTP handler for POST /v1/messages.
///
/// Validates the request, wraps it in a [`Context`] keyed by the request id,
/// and runs [`anthropic_messages`] on a spawned task while a connection
/// monitor watches the client connection.
///
/// # Errors
///
/// Returns an Anthropic-formatted error response when:
/// * `messages` is empty (`400 invalid_request_error`);
/// * `max_tokens` is 0 and no request template supplies a non-zero default
///   (`400 invalid_request_error`);
/// * the spawned task cannot be awaited (`500 api_error`);
/// * [`anthropic_messages`] itself returns an error.
async fn handler_anthropic_messages(
    State((state, template)): State<(Arc<service_v2::State>, Option<RequestTemplate>)>,
    headers: HeaderMap,
    Json(request): Json<AnthropicCreateMessageRequest>,
) -> Result<Response, Response> {
    // Validate required fields
    if request.messages.is_empty() {
        return Err(anthropic_error(
            StatusCode::BAD_REQUEST,
            "invalid_request_error",
            "messages: field required",
        ));
    }

    // `anthropic_messages` replaces a zero `max_tokens` with the template's
    // `max_completion_tokens`. Validate the value that will actually be used,
    // otherwise that template default could never take effect.
    let effective_max_tokens = if request.max_tokens == 0 {
        template.as_ref().map_or(0, |t| t.max_completion_tokens)
    } else {
        request.max_tokens
    };
    if effective_max_tokens == 0 {
        return Err(anthropic_error(
            StatusCode::BAD_REQUEST,
            "invalid_request_error",
            "max_tokens: must be greater than 0",
        ));
    }

    // Create request context
    let request_id = get_or_create_request_id(None, &headers);
    let request = Context::with_id(request, request_id);
    let context = request.context();

    // Create connection handles
    let (mut connection_handle, stream_handle) =
        create_connection_monitor(context.clone(), Some(state.metrics_clone())).await;

    let response =
        tokio::spawn(anthropic_messages(state, template, request, stream_handle).in_current_span())
            .await
            .map_err(|e| {
                anthropic_error(
                    StatusCode::INTERNAL_SERVER_ERROR,
                    "api_error",
                    &format!("Failed to await messages task: {:?}", e),
                )
            })?;

    // The task finished, so the connection handle is disarmed before returning.
    connection_handle.disarm();
    response
}
/// Core logic for the Anthropic Messages endpoint.
///
/// Applies any configured [`RequestTemplate`] defaults, converts the Anthropic
/// request into an [`NvCreateChatCompletionRequest`], runs it on the chat
/// completions engine looked up for the model, and converts the result back:
/// an SSE stream when `request.stream` is set, otherwise a single JSON body.
///
/// # Errors
///
/// Every failure is returned as an Anthropic-formatted error response:
/// `400` when the request cannot be converted, `404` when no chat completions
/// engine is found for the model, and `500` when generation, the backend, or
/// stream aggregation fails.
#[tracing::instrument(level = "debug", skip_all, fields(request_id = %request.id()))]
async fn anthropic_messages(
state: Arc<service_v2::State>,
template: Option<RequestTemplate>,
mut request: Context<AnthropicCreateMessageRequest>,
mut stream_handle: ConnectionHandle,
) -> Result<Response, Response> {
let streaming = request.stream;
let request_id = request.id().to_string();
// Apply template defaults before capturing model (must happen first so
// engine lookup and metrics use the resolved model name).
if let Some(template) = template {
// An empty model name, an unset temperature and a zero `max_tokens` are
// each treated as "not provided" and filled from the template.
if request.model.is_empty() {
request.model = template.model.clone();
}
if request.temperature.is_none() {
request.temperature = Some(template.temperature);
}
if request.max_tokens == 0 {
request.max_tokens = template.max_completion_tokens;
}
}
let model = request.model.clone();
// Created before conversion and engine lookup; it is later handed to
// `process_response_and_observe_metrics` together with each response item.
let http_queue_guard = state.metrics_clone().create_http_queue_guard(&model);
tracing::trace!("Received Anthropic messages request: {:?}", &*request);
let (orig_request, context) = request.into_parts();
// Kept separately because `orig_request` is consumed by the conversion below.
let model_for_resp = orig_request.model.clone();
// Convert Anthropic request -> Chat Completion request
let chat_request: NvCreateChatCompletionRequest =
orig_request.try_into().map_err(|e: anyhow::Error| {
tracing::error!(
request_id,
error = %e,
"Failed to convert AnthropicCreateMessageRequest to NvCreateChatCompletionRequest",
);
anthropic_error(
StatusCode::BAD_REQUEST,
"invalid_request_error",
&format!("Failed to convert request: {}", e),
)
})?;
// Re-wrap the converted request using the context split off above.
let request = context.map(|_req| chat_request);
tracing::trace!("Getting chat completions engine for model: {}", model);
// Any lookup failure is reported to the client as "model not found";
// the underlying error is discarded.
let engine = state
.manager()
.get_chat_completions_engine(&model)
.map_err(|_| {
anthropic_error(
StatusCode::NOT_FOUND,
"not_found_error",
&format!("Model '{}' not found", model),
)
})?;
let parsing_options = state.manager().get_parsing_options(&model);
let mut response_collector = state.metrics_clone().create_response_collector(&model);
tracing::trace!("Issuing generate call for Anthropic messages");
let engine_stream = engine.generate(request).await.map_err(|e| {
anthropic_error(
StatusCode::INTERNAL_SERVER_ERROR,
"api_error",
&format!("Failed to generate completions: {}", e),
)
})?;
let ctx = engine_stream.context();
// In the streaming branch this guard is handed to `monitor_for_disconnects`;
// in the non-streaming branch it is marked ok only after a full response is built.
let mut inflight_guard =
state
.metrics_clone()
.create_inflight_guard(&model, Endpoint::AnthropicMessages, streaming);
if streaming {
// NOTE(review): presumably arming enables disconnect handling for the
// response stream — confirm against the `disconnect` module.
stream_handle.arm();
use std::sync::atomic::{AtomicBool, Ordering};
let mut converter = AnthropicStreamConverter::new(model_for_resp);
// Start events are produced up front and emitted before any engine output.
let start_events = converter.emit_start_events();
// The converter is shared between the per-chunk closure and the trailing
// `done_stream`, hence the `Arc<Mutex<_>>`.
let converter = std::sync::Arc::new(std::sync::Mutex::new(converter));
let converter_end = converter.clone();
// Set when a data-less chunk carries the "error" event; read once the
// engine stream is exhausted to pick error events over normal end events.
let saw_error = std::sync::Arc::new(AtomicBool::new(false));
let saw_error_end = saw_error.clone();
let mut http_queue_guard = Some(http_queue_guard);
let event_stream = engine_stream
.inspect(move |response| {
process_response_and_observe_metrics(
response,
&mut response_collector,
&mut http_queue_guard,
);
})
.filter_map(move |annotated_chunk| {
let converter = converter.clone();
let saw_error = saw_error.clone();
async move {
// Chunks without data produce no Anthropic events; they are only
// inspected for the "error" event marker.
if annotated_chunk.data.is_none() {
if annotated_chunk.event.as_deref() == Some("error") {
saw_error.store(true, Ordering::Release);
}
return None;
}
let stream_resp = annotated_chunk.data?;
// The lock is taken and released without any `.await` in between.
let mut conv = converter.lock().expect("converter lock poisoned");
let events = conv.process_chunk(&stream_resp);
Some(stream::iter(events))
}
})
.flatten();
let start_stream = stream::iter(start_events);
// Evaluated lazily, after `event_stream` has finished, because it is
// chained last.
let done_stream = stream::once(async move {
let mut conv = converter_end.lock().expect("converter lock poisoned");
let end_events = if saw_error_end.load(Ordering::Acquire) {
conv.emit_error_events()
} else {
conv.emit_end_events()
};
stream::iter(end_events)
})
.flatten();
// Final order on the wire: start events, converted engine chunks, end/error events.
let full_stream = start_stream.chain(event_stream).chain(done_stream);
let full_stream = full_stream.map(|result| result.map_err(axum::Error::new));
let stream = monitor_for_disconnects(full_stream, ctx, inflight_guard, stream_handle);
let mut sse_stream = Sse::new(stream);
// Keep-alive is only enabled when the service state configures an interval.
if let Some(keep_alive) = state.sse_keep_alive() {
sse_stream = sse_stream.keep_alive(KeepAlive::default().interval(keep_alive));
}
Ok(sse_stream.into_response())
} else {
// Non-streaming path: aggregate stream into single response
// Check first event for backend errors using the openai helper
// NOTE(review): the backend-reported `status` is only logged; the client
// always receives a 500 `api_error` here — confirm this is intended.
let stream_with_check = super::openai::check_for_backend_error(engine_stream)
.await
.map_err(|(status, json_err)| {
tracing::error!(request_id, %status, ?json_err, "Backend error detected");
anthropic_error(
StatusCode::INTERNAL_SERVER_ERROR,
"api_error",
"Backend error during generation",
)
})?;
let mut http_queue_guard = Some(http_queue_guard);
let stream = stream_with_check.inspect(move |response| {
process_response_and_observe_metrics(
response,
&mut response_collector,
&mut http_queue_guard,
);
});
// Fold the whole stream into one chat completion response.
let chat_response =
NvCreateChatCompletionResponse::from_annotated_stream(stream, parsing_options.clone())
.await
.map_err(|e| {
tracing::error!(request_id, "Failed to fold messages stream: {:?}", e);
anthropic_error(
StatusCode::INTERNAL_SERVER_ERROR,
"api_error",
&format!("Failed to fold messages stream: {}", e),
)
})?;
let response = chat_completion_to_anthropic_response(chat_response, &model_for_resp);
// Reached only on success; every error path above returns before this call.
inflight_guard.mark_ok();
Ok(Json(response).into_response())
}
}
// ---------------------------------------------------------------------------
// Count tokens
// ---------------------------------------------------------------------------
/// Handler for POST /v1/messages/count_tokens.
///
/// Responds with the input token estimate computed by the request's
/// `estimate_tokens` method (described upstream as a len/3 heuristic; the
/// implementation lives with the protocol types). The shared state and
/// request template are accepted but unused. This handler never returns `Err`.
async fn handler_count_tokens(
    State((_state, _template)): State<(Arc<service_v2::State>, Option<RequestTemplate>)>,
    Json(request): Json<AnthropicCountTokensRequest>,
) -> Result<Response, Response> {
    let body = AnthropicCountTokensResponse {
        input_tokens: request.estimate_tokens(),
    };
    Ok(Json(body).into_response())
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/// Build an Anthropic-formatted error response.
///
/// The error `type` in the body is derived from `status` for the codes with a
/// fixed Anthropic error type (400, 401, 403, 404, 429, 503, 529). For every
/// other status the caller-provided `error_type` is used as-is
/// (e.g. 500 with `"api_error"`).
///
/// # Arguments
///
/// * `status` - HTTP status of the response.
/// * `error_type` - fallback error type for statuses without a fixed mapping.
/// * `message` - human-readable error message placed in the body.
fn anthropic_error(status: StatusCode, error_type: &str, message: &str) -> Response {
    // `Some` when the status code dictates the error type, `None` otherwise.
    let fixed_type = match status.as_u16() {
        400 => Some("invalid_request_error"),
        401 => Some("authentication_error"),
        403 => Some("permission_error"),
        404 => Some("not_found_error"),
        429 => Some("rate_limit_error"),
        503 | 529 => Some("overloaded_error"),
        _ => None,
    };

    let body = AnthropicErrorResponse {
        object_type: "error".to_string(),
        error: AnthropicErrorBody {
            error_type: fixed_type.unwrap_or(error_type).to_string(),
            message: message.to_string(),
        },
    };

    (status, Json(body)).into_response()
}
......@@ -297,6 +297,9 @@ pub enum Endpoint {
/// OAI Responses
Responses,
/// Anthropic Messages
AnthropicMessages,
/// Tensor
Tensor,
}
......@@ -948,6 +951,7 @@ impl std::fmt::Display for Endpoint {
Endpoint::Images => write!(f, "images"),
Endpoint::Videos => write!(f, "videos"),
Endpoint::Responses => write!(f, "responses"),
Endpoint::AnthropicMessages => write!(f, "anthropic_messages"),
Endpoint::Tensor => write!(f, "tensor"),
}
}
......@@ -962,6 +966,7 @@ impl Endpoint {
Endpoint::Images => "images",
Endpoint::Videos => "videos",
Endpoint::Responses => "responses",
Endpoint::AnthropicMessages => "anthropic_messages",
Endpoint::Tensor => "tensor",
}
}
......
......@@ -68,7 +68,7 @@ const VALIDATION_PREFIX: &str = "Validation: ";
// Default axum max body limit without configuring is 2MB: https://docs.rs/axum/latest/axum/extract/struct.DefaultBodyLimit.html
/// Default body limit in bytes (45MB) to support 500k+ token payloads.
/// Can be configured at runtime using the environment variable named by `env_llm::DYN_HTTP_BODY_LIMIT_MB`.
fn get_body_limit() -> usize {
pub(super) fn get_body_limit() -> usize {
std::env::var(env_llm::DYN_HTTP_BODY_LIMIT_MB)
.ok()
.and_then(|s| s.parse::<usize>().ok())
......@@ -248,7 +248,7 @@ pub async fn smart_json_error_middleware(request: Request<Body>, next: Next) ->
/// Get the request ID from a primary source, or next from the headers, or lastly create a new one if not present
// TODO: Similar function exists in lib/llm/src/grpc/service/openai.rs but with different signature and simpler logic
fn get_or_create_request_id(primary: Option<&str>, headers: &HeaderMap) -> String {
pub(super) fn get_or_create_request_id(primary: Option<&str>, headers: &HeaderMap) -> String {
// Try to get request id from trace context
if let Some(trace_context) = get_distributed_tracing_context()
&& let Some(x_dynamo_request_id) = trace_context.x_dynamo_request_id
......@@ -821,7 +821,7 @@ fn extract_backend_error_if_present<T: serde::Serialize>(
/// Checks if the first event in the stream is a backend error.
/// Returns Err(ErrorResponse) if error detected, Ok(stream) otherwise.
async fn check_for_backend_error(
pub(super) async fn check_for_backend_error(
mut stream: impl futures::Stream<Item = Annotated<NvCreateChatCompletionStreamResponse>>
+ Send
+ Unpin
......
......@@ -23,6 +23,7 @@ use crate::request_template::RequestTemplate;
use anyhow::Result;
use axum_server::tls_rustls::RustlsConfig;
use derive_builder::Builder;
use dynamo_runtime::config::env_is_truthy;
use dynamo_runtime::config::environment_names::llm as env_llm;
use dynamo_runtime::discovery::Discovery;
use dynamo_runtime::logging::make_request_span;
......@@ -48,6 +49,7 @@ struct StateFlags {
images_endpoints_enabled: AtomicBool,
videos_endpoints_enabled: AtomicBool,
responses_endpoints_enabled: AtomicBool,
anthropic_endpoints_enabled: AtomicBool,
}
impl StateFlags {
......@@ -59,6 +61,9 @@ impl StateFlags {
EndpointType::Images => self.images_endpoints_enabled.load(Ordering::Relaxed),
EndpointType::Videos => self.videos_endpoints_enabled.load(Ordering::Relaxed),
EndpointType::Responses => self.responses_endpoints_enabled.load(Ordering::Relaxed),
EndpointType::AnthropicMessages => {
self.anthropic_endpoints_enabled.load(Ordering::Relaxed)
}
}
}
......@@ -82,6 +87,9 @@ impl StateFlags {
EndpointType::Responses => self
.responses_endpoints_enabled
.store(enabled, Ordering::Relaxed),
EndpointType::AnthropicMessages => self
.anthropic_endpoints_enabled
.store(enabled, Ordering::Relaxed),
}
}
}
......@@ -103,6 +111,7 @@ impl State {
images_endpoints_enabled: AtomicBool::new(false),
videos_endpoints_enabled: AtomicBool::new(false),
responses_endpoints_enabled: AtomicBool::new(false),
anthropic_endpoints_enabled: AtomicBool::new(false),
},
cancel_token,
}
......@@ -187,6 +196,9 @@ pub struct HttpServiceConfig {
#[builder(default = "true")]
enable_responses_endpoints: bool,
#[builder(default = "false")]
enable_anthropic_endpoints: bool,
#[builder(default = "None")]
request_template: Option<RequestTemplate>,
......@@ -345,6 +357,8 @@ static HTTP_SVC_CMP_PATH_ENV: &str = "DYN_HTTP_SVC_CMP_PATH";
static HTTP_SVC_EMB_PATH_ENV: &str = "DYN_HTTP_SVC_EMB_PATH";
/// Environment variable to set the responses endpoint path (default: `/v1/responses`)
static HTTP_SVC_RESPONSES_PATH_ENV: &str = "DYN_HTTP_SVC_RESPONSES_PATH";
/// Environment variable to set the anthropic messages endpoint path (default: `/v1/messages`)
static HTTP_SVC_ANTHROPIC_PATH_ENV: &str = "DYN_HTTP_SVC_ANTHROPIC_PATH";
impl HttpServiceConfigBuilder {
pub fn build(self) -> Result<HttpService, anyhow::Error> {
......@@ -379,6 +393,10 @@ impl HttpServiceConfigBuilder {
state
.flags
.set(&EndpointType::Responses, config.enable_responses_endpoints);
state.flags.set(
&EndpointType::AnthropicMessages,
config.enable_anthropic_endpoints,
);
// enable prometheus metrics
let registry = metrics::Registry::new();
......@@ -501,7 +519,6 @@ impl HttpServiceConfigBuilder {
request_template.clone(),
var(HTTP_SVC_RESPONSES_PATH_ENV).ok(),
);
let mut endpoint_routes = HashMap::new();
endpoint_routes.insert(EndpointType::Chat, (chat_docs, chat_route));
endpoint_routes.insert(EndpointType::Completion, (cmpl_docs, cmpl_route));
......@@ -510,6 +527,19 @@ impl HttpServiceConfigBuilder {
endpoint_routes.insert(EndpointType::Videos, (videos_docs, videos_route));
endpoint_routes.insert(EndpointType::Responses, (responses_docs, responses_route));
if env_is_truthy(env_llm::DYN_ENABLE_ANTHROPIC_API) {
tracing::warn!("Anthropic Messages API (/v1/messages) is experimental.");
let (anthropic_docs, anthropic_route) = super::anthropic::anthropic_messages_router(
state.clone(),
request_template.clone(),
var(HTTP_SVC_ANTHROPIC_PATH_ENV).ok(),
);
endpoint_routes.insert(
EndpointType::AnthropicMessages,
(anthropic_docs, anthropic_route),
);
}
for endpoint_type in EndpointType::all() {
let state_route = state.clone();
if !endpoint_routes.contains_key(&endpoint_type) {
......
......@@ -129,6 +129,14 @@ impl ModelType {
let mut endpoint_types = Vec::new();
if self.contains(Self::Chat) {
endpoint_types.push(crate::endpoint_type::EndpointType::Chat);
// Translation layers over chat completions
endpoint_types.push(crate::endpoint_type::EndpointType::Responses);
// AnthropicMessages is gated by DYN_ENABLE_ANTHROPIC_API env var
if dynamo_runtime::config::env_is_truthy(
dynamo_runtime::config::environment_names::llm::DYN_ENABLE_ANTHROPIC_API,
) {
endpoint_types.push(crate::endpoint_type::EndpointType::AnthropicMessages);
}
}
if self.contains(Self::Completions) {
endpoint_types.push(crate::endpoint_type::EndpointType::Completion);
......
......@@ -10,6 +10,7 @@
use futures::{Stream, StreamExt};
use serde::{Deserialize, Serialize};
pub mod anthropic;
pub mod codec;
pub mod common;
pub mod openai;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Anthropic Messages API protocol types and conversion logic.
//!
//! This module provides types for the Anthropic Messages API (`/v1/messages`)
//! and conversion logic to/from the internal chat completions representation.
pub mod stream_converter;
pub mod types;
pub use types::*;
This diff is collapsed.
This diff is collapsed.
......@@ -216,6 +216,7 @@ fn compute_index(endpoint: &Endpoint, request_type: &RequestType, status: &Statu
Endpoint::ChatCompletions => 1,
Endpoint::Embeddings => todo!(),
Endpoint::Responses => todo!(),
Endpoint::AnthropicMessages => todo!(),
Endpoint::Tensor => todo!(),
Endpoint::Images => todo!(),
Endpoint::Videos => todo!(),
......
......@@ -270,6 +270,9 @@ pub mod llm {
/// LoRA cache directory path
pub const DYN_LORA_PATH: &str = "DYN_LORA_PATH";
/// Enable the experimental Anthropic Messages API endpoint (/v1/messages)
pub const DYN_ENABLE_ANTHROPIC_API: &str = "DYN_ENABLE_ANTHROPIC_API";
/// Metrics configuration
pub mod metrics {
/// Custom metrics prefix (overrides default "dynamo_frontend")
......@@ -446,6 +449,7 @@ mod tests {
llm::DYN_HTTP_BODY_LIMIT_MB,
llm::DYN_LORA_ENABLED,
llm::DYN_LORA_PATH,
llm::DYN_ENABLE_ANTHROPIC_API,
llm::metrics::DYN_METRICS_PREFIX,
// Model
model::model_express::MODEL_EXPRESS_URL,
......
......@@ -17,6 +17,8 @@ from tests.serve.common import (
from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
anthropic_messages_payload_default,
anthropic_messages_stream_payload_default,
chat_payload,
chat_payload_default,
completion_payload_default,
......@@ -288,6 +290,23 @@ sglang_configs = {
completion_payload_default(),
],
),
"anthropic_messages": SGLangConfig(
name="anthropic_messages",
directory=sglang_dir,
script_name="agg.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.post_merge,
pytest.mark.timeout(240),
],
model="Qwen/Qwen3-0.6B",
env={"DYN_ENABLE_ANTHROPIC_API": "1"},
frontend_port=DefaultPort.FRONTEND.value,
request_payloads=[
anthropic_messages_payload_default(),
anthropic_messages_stream_payload_default(),
],
),
}
......
......@@ -6,6 +6,8 @@ from typing import Any, Dict, List, Optional, Union
from tests.utils.client import send_request
from tests.utils.constants import DefaultPort
from tests.utils.payloads import (
AnthropicMessagesPayload,
AnthropicMessagesStreamPayload,
CachedTokensChatPayload,
ChatPayload,
ChatPayloadWithLogprobs,
......@@ -531,3 +533,56 @@ def responses_stream_payload_default(
expected_response=expected_response
or ["AI", "knock", "joke", "think", "artificial", "intelligence"],
)
def anthropic_messages_payload_default(
    repeat_count: int = 1,
    expected_response: Optional[List[str]] = None,
    expected_log: Optional[List[str]] = None,
    max_tokens: int = 200,
    temperature: float = 0.0,
) -> AnthropicMessagesPayload:
    """Build the default non-streaming Anthropic Messages API payload.

    Args:
        repeat_count: Passed through to the payload unchanged.
        expected_response: Expected response keywords; a falsy value selects
            the built-in keyword list.
        expected_log: Expected log entries; a falsy value selects an empty list.
        max_tokens: Value of the ``max_tokens`` request field.
        temperature: Value of the ``temperature`` request field.

    Returns:
        An ``AnthropicMessagesPayload`` with a single user message containing
        ``TEXT_PROMPT``.
    """
    user_message = {
        "role": "user",
        "content": TEXT_PROMPT,
    }
    request_body = {
        "max_tokens": max_tokens,
        "messages": [user_message],
        "temperature": temperature,
    }

    if expected_response:
        keywords = expected_response
    else:
        keywords = ["AI", "knock", "joke", "think", "artificial", "intelligence"]
    log_lines = expected_log if expected_log else []

    return AnthropicMessagesPayload(
        body=request_body,
        repeat_count=repeat_count,
        expected_log=log_lines,
        expected_response=keywords,
    )
def anthropic_messages_stream_payload_default(
    repeat_count: int = 1,
    expected_response: Optional[List[str]] = None,
    expected_log: Optional[List[str]] = None,
    max_tokens: int = 200,
    temperature: float = 0.0,
) -> AnthropicMessagesStreamPayload:
    """Build the default streaming Anthropic Messages API payload.

    Same request shape as the non-streaming default, with ``"stream": True``
    added to the body.

    Args:
        repeat_count: Passed through to the payload unchanged.
        expected_response: Expected response keywords; a falsy value selects
            the built-in keyword list.
        expected_log: Expected log entries; a falsy value selects an empty list.
        max_tokens: Value of the ``max_tokens`` request field.
        temperature: Value of the ``temperature`` request field.

    Returns:
        An ``AnthropicMessagesStreamPayload`` with a single user message
        containing ``TEXT_PROMPT``.
    """
    user_message = {
        "role": "user",
        "content": TEXT_PROMPT,
    }
    request_body = {
        "max_tokens": max_tokens,
        "messages": [user_message],
        "stream": True,
        "temperature": temperature,
    }

    if expected_response:
        keywords = expected_response
    else:
        keywords = ["AI", "knock", "joke", "think", "artificial", "intelligence"]
    log_lines = expected_log if expected_log else []

    return AnthropicMessagesStreamPayload(
        body=request_body,
        repeat_count=repeat_count,
        expected_log=log_lines,
        expected_response=keywords,
    )
......@@ -611,6 +611,131 @@ class ResponsesStreamPayload(BasePayload):
return ResponsesStreamPayload.extract_content(response)
@dataclass
class AnthropicMessagesPayload(BasePayload):
    """Payload for the Anthropic Messages API endpoint (/v1/messages)."""

    endpoint: str = "/v1/messages"

    @staticmethod
    def extract_content(response):
        """Validate a non-streaming Messages response and return its text.

        Raises for a non-success HTTP status, then asserts the envelope
        (``type``, ``id`` prefix, ``role``, ``stop_reason``), that the first
        content block is a text block, and that usage carries both token
        counts. Returns the text of the first content block.
        """
        response.raise_for_status()
        message = response.json()

        # Envelope checks.
        message_type = message.get("type")
        assert message_type == "message", f"Expected type='message', got {message_type}"

        has_msg_prefix = message.get("id", "").startswith("msg_")
        assert (
            has_msg_prefix
        ), f"Expected id to start with 'msg_', got {message.get('id')}"

        role = message.get("role")
        assert role == "assistant", f"Expected role='assistant', got {role}"

        allowed_stop_reasons = ("end_turn", "max_tokens", "stop_sequence", "tool_use")
        stop_reason = message.get("stop_reason")
        assert (
            stop_reason in allowed_stop_reasons
        ), f"Unexpected stop_reason: {stop_reason}"

        # Content checks: at least one block, and the first one is text.
        blocks = message.get("content", [])
        assert len(blocks) > 0, "Response content is empty"
        first_block = blocks[0]
        first_type = first_block.get("type")
        assert (
            first_type == "text"
        ), f"Expected content[0].type='text', got {first_type}"

        # Usage must report both token counts.
        usage = message.get("usage", {})
        for counter in ("input_tokens", "output_tokens"):
            assert counter in usage, f"Missing {counter} in usage"

        return first_block.get("text", "")

    def response_handler(self, response: Any) -> str:
        """Delegate to ``extract_content``."""
        return AnthropicMessagesPayload.extract_content(response)
@dataclass
class AnthropicMessagesStreamPayload(BasePayload):
    """Streaming payload for the Anthropic Messages API endpoint (/v1/messages).

    Validates SSE event structure and lifecycle ordering per the Anthropic streaming spec.
    """

    endpoint: str = "/v1/messages"
    http_stream: bool = True

    @staticmethod
    def extract_content(response):
        """Parse SSE stream and validate Anthropic event structure.

        Raises for a non-success HTTP status, collects ``(event, data)`` pairs
        from the ``event: `` / ``data: `` lines, asserts the lifecycle
        (``message_start`` first, ``message_stop`` last, the content block
        events and exactly one ``message_delta`` with a known ``stop_reason``),
        and returns the concatenated ``text_delta`` text.
        """
        import json

        response.raise_for_status()

        events = []
        event_type = ""
        # Read raw lines and decode them as UTF-8 here. Letting the HTTP
        # client decode would use the encoding it guessed from the headers,
        # which for a `text/*` body without a charset is not UTF-8 and would
        # corrupt non-ASCII text in the JSON payloads.
        for raw_line in response.iter_lines():
            if isinstance(raw_line, bytes):
                line = raw_line.decode("utf-8")
            else:
                line = raw_line
            if not line:
                continue
            if line.startswith("event: "):
                event_type = line[len("event: ") :]
            elif line.startswith("data: "):
                data_str = line[len("data: ") :]
                events.append((event_type, json.loads(data_str)))

        event_types = [e[0] for e in events]

        # Validate lifecycle event ordering
        assert len(event_types) >= 3, f"Too few events: {event_types}"
        assert (
            event_types[0] == "message_start"
        ), f"First event should be message_start, got {event_types[0]}"
        assert (
            event_types[-1] == "message_stop"
        ), f"Last event should be message_stop, got {event_types[-1]}"

        # Validate message_start structure
        msg_start = events[0][1]
        assert msg_start.get("type") == "message_start", "message_start missing type"
        message = msg_start.get("message", {})
        assert message.get("id", "").startswith(
            "msg_"
        ), "message id should start with msg_"
        assert message.get("role") == "assistant", "message role should be assistant"

        # Validate required event types
        assert "content_block_start" in event_types, "Missing content_block_start"
        assert "content_block_delta" in event_types, "Missing content_block_delta"
        assert "content_block_stop" in event_types, "Missing content_block_stop"
        assert "message_delta" in event_types, "Missing message_delta"

        # Validate message_delta has stop_reason
        delta_events = [e for e in events if e[0] == "message_delta"]
        assert (
            len(delta_events) == 1
        ), f"Expected 1 message_delta, got {len(delta_events)}"
        delta_body = delta_events[0][1].get("delta", {})
        assert delta_body.get("stop_reason") in (
            "end_turn",
            "max_tokens",
            "stop_sequence",
            "tool_use",
        ), f"Unexpected stop_reason in message_delta: {delta_body.get('stop_reason')}"

        # Collect text deltas
        deltas = []
        for e_type, e_data in events:
            if e_type == "content_block_delta":
                delta = e_data.get("delta", {})
                if delta.get("type") == "text_delta":
                    deltas.append(delta.get("text", ""))

        return "".join(deltas)

    def response_handler(self, response: Any) -> str:
        """Delegate to ``extract_content``."""
        return AnthropicMessagesStreamPayload.extract_content(response)
@dataclass
class EmbeddingPayload(BasePayload):
"""Payload for embeddings endpoint."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment