use serde::{Deserialize, Serialize}; use serde_json::Value; use std::collections::HashMap; // # Protocol Specifications // // This module contains all protocol definitions for OpenAI and SGLang APIs. // // ## Table of Contents // // 1. **OPENAI SPEC - Chat Completions API** // - Message Types // - Response Format Types // - Tool/Function Types // - Streaming Delta Types // - Request/Response structures // // 2. **OPENAI SPEC - Completions API** // - Request/Response structures // - Streaming support // // 3. **OPENAI SPEC - Responses API** // - Tool Definitions // - Reasoning Configuration // - Input/Output Items // - Service Tier & Tool Choice // - Request/Response structures // // 4. **OPENAI SPEC - Common** // - Shared Request Components // - Tool Choice Types // - Usage Tracking // - Logprobs Types // - Error Response Types // // 5. **SGLANG SPEC - GENERATE API** // - Generate Parameters // - Sampling Parameters // - Request/Response structures // // 6. **COMMON** // - GenerationRequest trait // - StringOrArray & LoRAPath types // - Helper functions // ================================================================== // = OPENAI SPEC - Chat Completions API = // ================================================================== // ============= Message Types ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum ChatMessage { System { role: String, content: String, #[serde(skip_serializing_if = "Option::is_none")] name: Option, }, User { role: String, // "user" content: UserMessageContent, #[serde(skip_serializing_if = "Option::is_none")] name: Option, }, Assistant { role: String, // "assistant" #[serde(skip_serializing_if = "Option::is_none")] content: Option, #[serde(skip_serializing_if = "Option::is_none")] name: Option, #[serde(skip_serializing_if = "Option::is_none")] tool_calls: Option>, #[serde(skip_serializing_if = "Option::is_none")] function_call: Option, /// Reasoning content for O1-style models (SGLang extension) #[serde(skip_serializing_if = "Option::is_none")] reasoning_content: Option, }, Tool { role: String, // "tool" content: String, tool_call_id: String, }, Function { role: String, // "function" content: String, name: String, }, } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum UserMessageContent { Text(String), Parts(Vec), } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(tag = "type")] pub enum ContentPart { #[serde(rename = "text")] Text { text: String }, #[serde(rename = "image_url")] ImageUrl { image_url: ImageUrl }, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ImageUrl { pub url: String, #[serde(skip_serializing_if = "Option::is_none")] pub detail: Option, // "auto", "low", or "high" } // ============= Response Format Types ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(tag = "type")] pub enum ResponseFormat { #[serde(rename = "text")] Text, #[serde(rename = "json_object")] JsonObject, #[serde(rename = "json_schema")] JsonSchema { json_schema: JsonSchemaFormat }, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct JsonSchemaFormat { pub name: String, pub schema: Value, #[serde(skip_serializing_if = "Option::is_none")] pub strict: Option, } // ============= Streaming Delta Types ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatMessageDelta { #[serde(skip_serializing_if = "Option::is_none")] pub role: Option, #[serde(skip_serializing_if = "Option::is_none")] pub content: Option, #[serde(skip_serializing_if = "Option::is_none")] pub tool_calls: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub function_call: Option, /// Reasoning content delta for O1-style models (SGLang extension) #[serde(skip_serializing_if = "Option::is_none")] pub reasoning_content: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ToolCallDelta { pub index: u32, #[serde(skip_serializing_if = "Option::is_none")] pub id: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(rename = "type")] pub tool_type: Option, #[serde(skip_serializing_if = "Option::is_none")] pub function: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct FunctionCallDelta { #[serde(skip_serializing_if = "Option::is_none")] pub name: Option, #[serde(skip_serializing_if = "Option::is_none")] pub arguments: Option, } // ============= Request ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatCompletionRequest { /// ID of the model to use pub model: String, /// A list of messages comprising the conversation so far pub messages: Vec, /// What sampling temperature to use, between 0 and 2 #[serde(skip_serializing_if = "Option::is_none")] pub temperature: Option, /// An alternative to sampling with temperature #[serde(skip_serializing_if = "Option::is_none")] pub top_p: Option, /// How many chat completion choices to generate for each input message #[serde(skip_serializing_if = "Option::is_none")] pub n: Option, /// If set, partial message deltas will be sent #[serde(default)] pub stream: bool, /// Options for streaming response #[serde(skip_serializing_if = "Option::is_none")] pub stream_options: Option, /// Up to 4 sequences where the API will stop generating further tokens #[serde(skip_serializing_if = "Option::is_none")] pub stop: Option, /// The maximum number of tokens to generate #[serde(skip_serializing_if = "Option::is_none")] pub max_tokens: Option, /// An upper bound for the number of tokens that can be generated for a completion #[serde(skip_serializing_if = "Option::is_none")] pub max_completion_tokens: Option, /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far #[serde(skip_serializing_if = "Option::is_none")] pub presence_penalty: Option, /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far #[serde(skip_serializing_if = "Option::is_none")] pub frequency_penalty: Option, /// Modify the likelihood of specified tokens appearing in the completion #[serde(skip_serializing_if = "Option::is_none")] pub logit_bias: Option>, /// A unique identifier representing your end-user #[serde(skip_serializing_if = "Option::is_none")] pub user: Option, /// If specified, our system will make a best effort to sample deterministically #[serde(skip_serializing_if = "Option::is_none")] pub seed: Option, /// Whether to return log probabilities of the output tokens #[serde(default)] pub logprobs: bool, /// An integer between 0 and 20 specifying the number of most likely tokens to return #[serde(skip_serializing_if = "Option::is_none")] pub top_logprobs: Option, /// An object specifying the format that the model must output #[serde(skip_serializing_if = "Option::is_none")] pub response_format: Option, /// A list of tools the model may call #[serde(skip_serializing_if = "Option::is_none")] pub tools: Option>, /// Controls which (if any) tool is called by the model #[serde(skip_serializing_if = "Option::is_none")] pub tool_choice: Option, /// Whether to enable parallel function calling during tool use #[serde(skip_serializing_if = "Option::is_none")] pub parallel_tool_calls: Option, /// Deprecated: use tools instead #[serde(skip_serializing_if = "Option::is_none")] pub functions: Option>, /// Deprecated: use tool_choice instead #[serde(skip_serializing_if = "Option::is_none")] pub function_call: Option, // ============= SGLang Extensions ============= /// Top-k sampling parameter (-1 to disable) #[serde(skip_serializing_if = "Option::is_none")] pub top_k: Option, /// Min-p nucleus sampling parameter #[serde(skip_serializing_if = "Option::is_none")] pub min_p: Option, /// Minimum number of tokens to generate #[serde(skip_serializing_if = "Option::is_none")] pub min_tokens: Option, /// Repetition penalty for reducing repetitive text #[serde(skip_serializing_if = "Option::is_none")] pub repetition_penalty: Option, /// Regex constraint for output generation #[serde(skip_serializing_if = "Option::is_none")] pub regex: Option, /// EBNF grammar constraint for structured output #[serde(skip_serializing_if = "Option::is_none")] pub ebnf: Option, /// Specific token IDs to use as stop conditions #[serde(skip_serializing_if = "Option::is_none")] pub stop_token_ids: Option>, /// Skip trimming stop tokens from output #[serde(default)] pub no_stop_trim: bool, /// Ignore end-of-sequence tokens during generation #[serde(default)] pub ignore_eos: bool, /// Continue generating from final assistant message #[serde(default)] pub continue_final_message: bool, /// Skip special tokens during detokenization #[serde(default = "default_true")] pub skip_special_tokens: bool, // ============= SGLang Extensions ============= /// Path to LoRA adapter(s) for model customization #[serde(skip_serializing_if = "Option::is_none")] pub lora_path: Option, /// Session parameters for continual prompting #[serde(skip_serializing_if = "Option::is_none")] pub session_params: Option>, /// Separate reasoning content from final answer (O1-style models) #[serde(default = "default_true")] pub separate_reasoning: bool, /// Stream reasoning tokens during generation #[serde(default = "default_true")] pub stream_reasoning: bool, /// Return model hidden states #[serde(default)] pub return_hidden_states: bool, } impl GenerationRequest for ChatCompletionRequest { fn is_stream(&self) -> bool { self.stream } fn get_model(&self) -> Option<&str> { Some(&self.model) } fn extract_text_for_routing(&self) -> String { // Extract text from messages for routing decisions self.messages .iter() .filter_map(|msg| match msg { ChatMessage::System { content, .. } => Some(content.clone()), ChatMessage::User { content, .. } => match content { UserMessageContent::Text(text) => Some(text.clone()), UserMessageContent::Parts(parts) => { let texts: Vec = parts .iter() .filter_map(|part| match part { ContentPart::Text { text } => Some(text.clone()), _ => None, }) .collect(); Some(texts.join(" ")) } }, ChatMessage::Assistant { content, reasoning_content, .. } => { // Combine content and reasoning content for routing decisions let main_content = content.clone().unwrap_or_default(); let reasoning = reasoning_content.clone().unwrap_or_default(); if main_content.is_empty() && reasoning.is_empty() { None } else { Some(format!("{} {}", main_content, reasoning).trim().to_string()) } } ChatMessage::Tool { content, .. } => Some(content.clone()), ChatMessage::Function { content, .. } => Some(content.clone()), }) .collect::>() .join(" ") } } // ============= Regular Response ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatCompletionResponse { pub id: String, pub object: String, // "chat.completion" pub created: u64, pub model: String, pub choices: Vec, #[serde(skip_serializing_if = "Option::is_none")] pub usage: Option, #[serde(skip_serializing_if = "Option::is_none")] pub system_fingerprint: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatChoice { pub index: u32, pub message: ChatMessage, #[serde(skip_serializing_if = "Option::is_none")] pub logprobs: Option, pub finish_reason: Option, // "stop", "length", "tool_calls", "content_filter", "function_call" /// Information about which stop condition was matched #[serde(skip_serializing_if = "Option::is_none")] pub matched_stop: Option, // Can be string or integer /// Hidden states from the model (SGLang extension) #[serde(skip_serializing_if = "Option::is_none")] pub hidden_states: Option>, } // ============= Streaming Response ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatCompletionStreamResponse { pub id: String, pub object: String, // "chat.completion.chunk" pub created: u64, pub model: String, #[serde(skip_serializing_if = "Option::is_none")] pub system_fingerprint: Option, pub choices: Vec, #[serde(skip_serializing_if = "Option::is_none")] pub usage: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatStreamChoice { pub index: u32, pub delta: ChatMessageDelta, #[serde(skip_serializing_if = "Option::is_none")] pub logprobs: Option, pub finish_reason: Option, } // ================================================================== // = OPENAI SPEC - Completions API = // ================================================================== // Completions API request types (v1/completions) - DEPRECATED but still supported #[derive(Debug, Clone, Deserialize, Serialize)] pub struct CompletionRequest { /// ID of the model to use (required for OpenAI, optional for some implementations, such as SGLang) pub model: String, /// The prompt(s) to generate completions for pub prompt: StringOrArray, /// The suffix that comes after a completion of inserted text #[serde(skip_serializing_if = "Option::is_none")] pub suffix: Option, /// The maximum number of tokens to generate #[serde(skip_serializing_if = "Option::is_none")] pub max_tokens: Option, /// What sampling temperature to use, between 0 and 2 #[serde(skip_serializing_if = "Option::is_none")] pub temperature: Option, /// An alternative to sampling with temperature (nucleus sampling) #[serde(skip_serializing_if = "Option::is_none")] pub top_p: Option, /// How many completions to generate for each prompt #[serde(skip_serializing_if = "Option::is_none")] pub n: Option, /// Whether to stream back partial progress #[serde(default)] pub stream: bool, /// Options for streaming response #[serde(skip_serializing_if = "Option::is_none")] pub stream_options: Option, /// Include the log probabilities on the logprobs most likely tokens #[serde(skip_serializing_if = "Option::is_none")] pub logprobs: Option, /// Echo back the prompt in addition to the completion #[serde(default)] pub echo: bool, /// Up to 4 sequences where the API will stop generating further tokens #[serde(skip_serializing_if = "Option::is_none")] pub stop: Option, /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far #[serde(skip_serializing_if = "Option::is_none")] pub presence_penalty: Option, /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far #[serde(skip_serializing_if = "Option::is_none")] pub frequency_penalty: Option, /// Generates best_of completions server-side and returns the "best" #[serde(skip_serializing_if = "Option::is_none")] pub best_of: Option, /// Modify the likelihood of specified tokens appearing in the completion #[serde(skip_serializing_if = "Option::is_none")] pub logit_bias: Option>, /// A unique identifier representing your end-user #[serde(skip_serializing_if = "Option::is_none")] pub user: Option, /// If specified, our system will make a best effort to sample deterministically #[serde(skip_serializing_if = "Option::is_none")] pub seed: Option, // ============= SGLang Extensions ============= /// Top-k sampling parameter (-1 to disable) #[serde(skip_serializing_if = "Option::is_none")] pub top_k: Option, /// Min-p nucleus sampling parameter #[serde(skip_serializing_if = "Option::is_none")] pub min_p: Option, /// Minimum number of tokens to generate #[serde(skip_serializing_if = "Option::is_none")] pub min_tokens: Option, /// Repetition penalty for reducing repetitive text #[serde(skip_serializing_if = "Option::is_none")] pub repetition_penalty: Option, /// Regex constraint for output generation #[serde(skip_serializing_if = "Option::is_none")] pub regex: Option, /// EBNF grammar constraint for structured output #[serde(skip_serializing_if = "Option::is_none")] pub ebnf: Option, /// JSON schema constraint for structured output #[serde(skip_serializing_if = "Option::is_none")] pub json_schema: Option, /// Specific token IDs to use as stop conditions #[serde(skip_serializing_if = "Option::is_none")] pub stop_token_ids: Option>, /// Skip trimming stop tokens from output #[serde(default)] pub no_stop_trim: bool, /// Ignore end-of-sequence tokens during generation #[serde(default)] pub ignore_eos: bool, /// Skip special tokens during detokenization #[serde(default = "default_true")] pub skip_special_tokens: bool, // ============= SGLang Extensions ============= /// Path to LoRA adapter(s) for model customization #[serde(skip_serializing_if = "Option::is_none")] pub lora_path: Option, /// Session parameters for continual prompting #[serde(skip_serializing_if = "Option::is_none")] pub session_params: Option>, /// Return model hidden states #[serde(default)] pub return_hidden_states: bool, /// Additional fields including bootstrap info for PD routing #[serde(flatten)] pub other: serde_json::Map, } impl GenerationRequest for CompletionRequest { fn is_stream(&self) -> bool { self.stream } fn get_model(&self) -> Option<&str> { Some(&self.model) } fn extract_text_for_routing(&self) -> String { match &self.prompt { StringOrArray::String(s) => s.clone(), StringOrArray::Array(v) => v.join(" "), } } } // ============= Regular Response ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct CompletionResponse { pub id: String, pub object: String, // "text_completion" pub created: u64, pub model: String, pub choices: Vec, #[serde(skip_serializing_if = "Option::is_none")] pub usage: Option, #[serde(skip_serializing_if = "Option::is_none")] pub system_fingerprint: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct CompletionChoice { pub text: String, pub index: u32, #[serde(skip_serializing_if = "Option::is_none")] pub logprobs: Option, pub finish_reason: Option, // "stop", "length", "content_filter", etc. /// Information about which stop condition was matched #[serde(skip_serializing_if = "Option::is_none")] pub matched_stop: Option, // Can be string or integer /// Hidden states from the model (SGLang extension) #[serde(skip_serializing_if = "Option::is_none")] pub hidden_states: Option>, } // ============= Streaming Response ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct CompletionStreamResponse { pub id: String, pub object: String, // "text_completion" pub created: u64, pub choices: Vec, pub model: String, #[serde(skip_serializing_if = "Option::is_none")] pub system_fingerprint: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct CompletionStreamChoice { pub text: String, pub index: u32, #[serde(skip_serializing_if = "Option::is_none")] pub logprobs: Option, pub finish_reason: Option, } // ================================================================== // = OPENAI SPEC - Responses API = // ================================================================== // ============= Tool Definitions ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ResponseTool { #[serde(rename = "type")] pub r#type: ResponseToolType, } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub enum ResponseToolType { WebSearchPreview, CodeInterpreter, } // ============= Reasoning Configuration ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ResponseReasoningParam { #[serde(default = "default_reasoning_effort")] pub effort: Option, } fn default_reasoning_effort() -> Option { Some(ReasoningEffort::Medium) } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub enum ReasoningEffort { Low, Medium, High, } // ============= Input/Output Items ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(tag = "type")] #[serde(rename_all = "snake_case")] pub enum ResponseInputOutputItem { #[serde(rename = "message")] Message { id: String, role: String, content: Vec, #[serde(skip_serializing_if = "Option::is_none")] status: Option, }, #[serde(rename = "reasoning")] Reasoning { id: String, #[serde(skip_serializing_if = "Vec::is_empty")] summary: Vec, content: Vec, #[serde(skip_serializing_if = "Option::is_none")] status: Option, }, #[serde(rename = "function_tool_call")] FunctionToolCall { id: String, name: String, arguments: String, #[serde(skip_serializing_if = "Option::is_none")] output: Option, #[serde(skip_serializing_if = "Option::is_none")] status: Option, }, } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(tag = "type")] #[serde(rename_all = "snake_case")] pub enum ResponseContentPart { #[serde(rename = "output_text")] OutputText { text: String, #[serde(skip_serializing_if = "Vec::is_empty")] annotations: Vec, #[serde(skip_serializing_if = "Option::is_none")] logprobs: Option, }, } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(tag = "type")] #[serde(rename_all = "snake_case")] pub enum ResponseReasoningContent { #[serde(rename = "reasoning_text")] ReasoningText { text: String }, } // ============= Output Items for Response ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(tag = "type")] #[serde(rename_all = "snake_case")] pub enum ResponseOutputItem { #[serde(rename = "message")] Message { id: String, role: String, content: Vec, status: String, }, #[serde(rename = "reasoning")] Reasoning { id: String, #[serde(skip_serializing_if = "Vec::is_empty")] summary: Vec, content: Vec, #[serde(skip_serializing_if = "Option::is_none")] status: Option, }, #[serde(rename = "function_tool_call")] FunctionToolCall { id: String, name: String, arguments: String, #[serde(skip_serializing_if = "Option::is_none")] output: Option, status: String, }, } // ============= Service Tier ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub enum ServiceTier { Auto, Default, Flex, Scale, Priority, } impl Default for ServiceTier { fn default() -> Self { Self::Auto } } // ============= Truncation ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub enum Truncation { Auto, Disabled, } impl Default for Truncation { fn default() -> Self { Self::Disabled } } // ============= Response Status ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub enum ResponseStatus { Queued, InProgress, Completed, Failed, Cancelled, } // ============= Include Fields ============= #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub enum IncludeField { #[serde(rename = "code_interpreter_call.outputs")] CodeInterpreterCallOutputs, #[serde(rename = "computer_call_output.output.image_url")] ComputerCallOutputImageUrl, #[serde(rename = "file_search_call.results")] FileSearchCallResults, #[serde(rename = "message.input_image.image_url")] MessageInputImageUrl, #[serde(rename = "message.output_text.logprobs")] MessageOutputTextLogprobs, #[serde(rename = "reasoning.encrypted_content")] ReasoningEncryptedContent, } // ============= Usage Info ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct UsageInfo { pub prompt_tokens: u32, pub completion_tokens: u32, pub total_tokens: u32, #[serde(skip_serializing_if = "Option::is_none")] pub reasoning_tokens: Option, #[serde(skip_serializing_if = "Option::is_none")] pub prompt_tokens_details: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct PromptTokenUsageInfo { pub cached_tokens: u32, } // ============= Response Usage Format ============= /// OpenAI Responses API usage format (different from standard UsageInfo) #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ResponseUsage { pub input_tokens: u32, pub output_tokens: u32, pub total_tokens: u32, #[serde(skip_serializing_if = "Option::is_none")] pub input_tokens_details: Option, #[serde(skip_serializing_if = "Option::is_none")] pub output_tokens_details: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct InputTokensDetails { pub cached_tokens: u32, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct OutputTokensDetails { pub reasoning_tokens: u32, } impl UsageInfo { /// Convert to OpenAI Responses API format pub fn to_response_usage(&self) -> ResponseUsage { ResponseUsage { input_tokens: self.prompt_tokens, output_tokens: self.completion_tokens, total_tokens: self.total_tokens, input_tokens_details: self.prompt_tokens_details.as_ref().map(|details| { InputTokensDetails { cached_tokens: details.cached_tokens, } }), output_tokens_details: self.reasoning_tokens.map(|tokens| OutputTokensDetails { reasoning_tokens: tokens, }), } } } impl From for ResponseUsage { fn from(usage: UsageInfo) -> Self { usage.to_response_usage() } } impl ResponseUsage { /// Convert back to standard UsageInfo format pub fn to_usage_info(&self) -> UsageInfo { UsageInfo { prompt_tokens: self.input_tokens, completion_tokens: self.output_tokens, total_tokens: self.total_tokens, reasoning_tokens: self .output_tokens_details .as_ref() .map(|details| details.reasoning_tokens), prompt_tokens_details: self.input_tokens_details.as_ref().map(|details| { PromptTokenUsageInfo { cached_tokens: details.cached_tokens, } }), } } } fn generate_request_id() -> String { format!("resp_{}", uuid::Uuid::new_v4().simple()) } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ResponsesRequest { // ============= Core OpenAI API fields ============= /// Run the request in the background #[serde(default)] pub background: bool, /// Fields to include in the response #[serde(skip_serializing_if = "Option::is_none")] pub include: Option>, /// Input content - can be string or structured items pub input: ResponseInput, /// System instructions for the model #[serde(skip_serializing_if = "Option::is_none")] pub instructions: Option, /// Maximum number of output tokens #[serde(skip_serializing_if = "Option::is_none")] pub max_output_tokens: Option, /// Maximum number of tool calls #[serde(skip_serializing_if = "Option::is_none")] pub max_tool_calls: Option, /// Additional metadata #[serde(skip_serializing_if = "Option::is_none")] pub metadata: Option>, /// Model to use (optional to match vLLM) #[serde(skip_serializing_if = "Option::is_none")] pub model: Option, /// Whether to enable parallel tool calls #[serde(default = "default_true")] pub parallel_tool_calls: bool, /// ID of previous response to continue from #[serde(skip_serializing_if = "Option::is_none")] pub previous_response_id: Option, /// Reasoning configuration #[serde(skip_serializing_if = "Option::is_none")] pub reasoning: Option, /// Service tier #[serde(default)] pub service_tier: ServiceTier, /// Whether to store the response #[serde(default = "default_true")] pub store: bool, /// Whether to stream the response #[serde(default)] pub stream: bool, /// Temperature for sampling #[serde(skip_serializing_if = "Option::is_none")] pub temperature: Option, /// Tool choice behavior #[serde(default)] pub tool_choice: ToolChoice, /// Available tools #[serde(default)] pub tools: Vec, /// Number of top logprobs to return #[serde(default)] pub top_logprobs: u32, /// Top-p sampling parameter #[serde(skip_serializing_if = "Option::is_none")] pub top_p: Option, /// Truncation behavior #[serde(default)] pub truncation: Truncation, /// User identifier #[serde(skip_serializing_if = "Option::is_none")] pub user: Option, // ============= SGLang Extensions ============= /// Request ID #[serde(default = "generate_request_id")] pub request_id: String, /// Request priority #[serde(default)] pub priority: i32, /// Frequency penalty #[serde(default)] pub frequency_penalty: f32, /// Presence penalty #[serde(default)] pub presence_penalty: f32, /// Stop sequences #[serde(skip_serializing_if = "Option::is_none")] pub stop: Option, /// Top-k sampling parameter #[serde(default = "default_top_k")] pub top_k: i32, /// Min-p sampling parameter #[serde(default)] pub min_p: f32, /// Repetition penalty #[serde(default = "default_repetition_penalty")] pub repetition_penalty: f32, } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum ResponseInput { Text(String), Items(Vec), } fn default_top_k() -> i32 { -1 } fn default_repetition_penalty() -> f32 { 1.0 } impl ResponsesRequest { /// Default sampling parameters const DEFAULT_TEMPERATURE: f32 = 0.7; const DEFAULT_TOP_P: f32 = 1.0; /// Convert to sampling parameters for generation pub fn to_sampling_params( &self, default_max_tokens: u32, default_params: Option>, ) -> HashMap { let mut params = HashMap::new(); // Use max_output_tokens if available let max_tokens = if let Some(max_output) = self.max_output_tokens { std::cmp::min(max_output, default_max_tokens) } else { default_max_tokens }; // Avoid exceeding context length by minus 1 token let max_tokens = max_tokens.saturating_sub(1); // Temperature let temperature = self.temperature.unwrap_or_else(|| { default_params .as_ref() .and_then(|p| p.get("temperature")) .and_then(|v| v.as_f64()) .map(|v| v as f32) .unwrap_or(Self::DEFAULT_TEMPERATURE) }); // Top-p let top_p = self.top_p.unwrap_or_else(|| { default_params .as_ref() .and_then(|p| p.get("top_p")) .and_then(|v| v.as_f64()) .map(|v| v as f32) .unwrap_or(Self::DEFAULT_TOP_P) }); params.insert( "max_new_tokens".to_string(), serde_json::Value::Number(serde_json::Number::from(max_tokens)), ); params.insert( "temperature".to_string(), serde_json::Value::Number(serde_json::Number::from_f64(temperature as f64).unwrap()), ); params.insert( "top_p".to_string(), serde_json::Value::Number(serde_json::Number::from_f64(top_p as f64).unwrap()), ); params.insert( "frequency_penalty".to_string(), serde_json::Value::Number( serde_json::Number::from_f64(self.frequency_penalty as f64).unwrap(), ), ); params.insert( "presence_penalty".to_string(), serde_json::Value::Number( serde_json::Number::from_f64(self.presence_penalty as f64).unwrap(), ), ); params.insert( "top_k".to_string(), serde_json::Value::Number(serde_json::Number::from(self.top_k)), ); params.insert( "min_p".to_string(), serde_json::Value::Number(serde_json::Number::from_f64(self.min_p as f64).unwrap()), ); params.insert( "repetition_penalty".to_string(), serde_json::Value::Number( serde_json::Number::from_f64(self.repetition_penalty as f64).unwrap(), ), ); if let Some(ref stop) = self.stop { match serde_json::to_value(stop) { Ok(value) => params.insert("stop".to_string(), value), Err(_) => params.insert("stop".to_string(), serde_json::Value::Null), }; } // Apply any additional default parameters if let Some(default_params) = default_params { for (key, value) in default_params { params.entry(key).or_insert(value); } } params } } impl GenerationRequest for ResponsesRequest { fn is_stream(&self) -> bool { self.stream } fn get_model(&self) -> Option<&str> { self.model.as_deref() } fn extract_text_for_routing(&self) -> String { match &self.input { ResponseInput::Text(text) => text.clone(), ResponseInput::Items(items) => items .iter() .filter_map(|item| match item { ResponseInputOutputItem::Message { content, .. } => { let texts: Vec = content .iter() .map(|part| match part { ResponseContentPart::OutputText { text, .. } => text.clone(), }) .collect(); if texts.is_empty() { None } else { Some(texts.join(" ")) } } ResponseInputOutputItem::Reasoning { content, .. } => { let texts: Vec = content .iter() .map(|part| match part { ResponseReasoningContent::ReasoningText { text } => text.clone(), }) .collect(); if texts.is_empty() { None } else { Some(texts.join(" ")) } } ResponseInputOutputItem::FunctionToolCall { arguments, .. } => { Some(arguments.clone()) } }) .collect::>() .join(" "), } } } fn generate_response_id() -> String { format!("resp_{}", uuid::Uuid::new_v4().simple()) } fn current_timestamp() -> i64 { std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap_or_else(|_| std::time::Duration::from_secs(0)) .as_secs() as i64 } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ResponsesResponse { /// Response ID #[serde(default = "generate_response_id")] pub id: String, /// Object type #[serde(default = "default_object_type")] pub object: String, /// Creation timestamp #[serde(default = "current_timestamp")] pub created_at: i64, /// Model name pub model: String, /// Output items #[serde(default)] pub output: Vec, /// Response status pub status: ResponseStatus, /// Usage information #[serde(skip_serializing_if = "Option::is_none")] pub usage: Option, /// Whether parallel tool calls are enabled #[serde(default = "default_true")] pub parallel_tool_calls: bool, /// Tool choice setting #[serde(default = "default_tool_choice")] pub tool_choice: String, /// Available tools #[serde(default)] pub tools: Vec, } fn default_object_type() -> String { "response".to_string() } fn default_tool_choice() -> String { "auto".to_string() } impl ResponsesResponse { /// Create a response from a request #[allow(clippy::too_many_arguments)] pub fn from_request( request: &ResponsesRequest, _sampling_params: &HashMap, model_name: String, created_time: i64, output: Vec, status: ResponseStatus, usage: Option, ) -> Self { Self { id: request.request_id.clone(), object: "response".to_string(), created_at: created_time, model: model_name, output, status, usage, parallel_tool_calls: request.parallel_tool_calls, tool_choice: match &request.tool_choice { ToolChoice::Value(ToolChoiceValue::Auto) => "auto".to_string(), ToolChoice::Value(ToolChoiceValue::Required) => "required".to_string(), ToolChoice::Value(ToolChoiceValue::None) => "none".to_string(), ToolChoice::Function { .. } => "function".to_string(), }, tools: request.tools.clone(), } } /// Create a new response with default values pub fn new(request_id: String, model: String, status: ResponseStatus) -> Self { Self { id: request_id, object: "response".to_string(), created_at: current_timestamp(), model, output: Vec::new(), status, usage: None, parallel_tool_calls: true, tool_choice: "auto".to_string(), tools: Vec::new(), } } /// Add an output item to the response pub fn add_output(&mut self, item: ResponseOutputItem) { self.output.push(item); } /// Set the usage information pub fn set_usage(&mut self, usage: UsageInfo) { self.usage = Some(usage); } /// Update the status pub fn set_status(&mut self, status: ResponseStatus) { self.status = status; } /// Check if the response is complete pub fn is_complete(&self) -> bool { matches!(self.status, ResponseStatus::Completed) } /// Check if the response is in progress pub fn is_in_progress(&self) -> bool { matches!(self.status, ResponseStatus::InProgress) } /// Check if the response failed pub fn is_failed(&self) -> bool { matches!(self.status, ResponseStatus::Failed) } /// Check if the response was cancelled pub fn is_cancelled(&self) -> bool { matches!(self.status, ResponseStatus::Cancelled) } /// Check if the response is queued pub fn is_queued(&self) -> bool { matches!(self.status, ResponseStatus::Queued) } /// Convert usage to OpenAI Responses API format pub fn usage_in_response_format(&self) -> Option { self.usage.as_ref().map(|usage| usage.to_response_usage()) } /// Get the response as a JSON value with usage in response format pub fn to_response_format(&self) -> serde_json::Value { let mut response = serde_json::to_value(self).unwrap_or(serde_json::Value::Null); // Convert usage to response format if present if let Some(usage) = &self.usage { if let Ok(usage_value) = serde_json::to_value(usage.to_response_usage()) { response["usage"] = usage_value; } } response } } // ============= Helper Functions ============= impl ResponseOutputItem { /// Create a new message output item pub fn new_message( id: String, role: String, content: Vec, status: String, ) -> Self { Self::Message { id, role, content, status, } } /// Create a new reasoning output item pub fn new_reasoning( id: String, summary: Vec, content: Vec, status: Option, ) -> Self { Self::Reasoning { id, summary, content, status, } } /// Create a new function tool call output item pub fn new_function_tool_call( id: String, name: String, arguments: String, output: Option, status: String, ) -> Self { Self::FunctionToolCall { id, name, arguments, output, status, } } } impl ResponseContentPart { /// Create a new text content part pub fn new_text( text: String, annotations: Vec, logprobs: Option, ) -> Self { Self::OutputText { text, annotations, logprobs, } } } impl ResponseReasoningContent { /// Create a new reasoning text content pub fn new_reasoning_text(text: String) -> Self { Self::ReasoningText { text } } } impl UsageInfo { /// Create a new usage info with token counts pub fn new(prompt_tokens: u32, completion_tokens: u32, reasoning_tokens: Option) -> Self { Self { prompt_tokens, completion_tokens, total_tokens: prompt_tokens + completion_tokens, reasoning_tokens, prompt_tokens_details: None, } } /// Create usage info with cached token details pub fn new_with_cached( prompt_tokens: u32, completion_tokens: u32, reasoning_tokens: Option, cached_tokens: u32, ) -> Self { Self { prompt_tokens, completion_tokens, total_tokens: prompt_tokens + completion_tokens, reasoning_tokens, prompt_tokens_details: Some(PromptTokenUsageInfo { cached_tokens }), } } } // ================================================================== // = OPENAI SPEC - Common = // ================================================================== // ============= Shared Request Components ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct StreamOptions { #[serde(skip_serializing_if = "Option::is_none")] pub include_usage: Option, } // ============= Tool Choice Types ============= /// Tool choice value for simple string options #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub enum ToolChoiceValue { Auto, Required, None, } /// Tool choice for both Chat Completion and Responses APIs #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum ToolChoice { Value(ToolChoiceValue), Function { #[serde(rename = "type")] tool_type: String, // "function" function: FunctionChoice, }, } impl Default for ToolChoice { fn default() -> Self { Self::Value(ToolChoiceValue::Auto) } } /// Function choice specification for ToolChoice::Function #[derive(Debug, Clone, Deserialize, Serialize)] pub struct FunctionChoice { pub name: String, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct Tool { #[serde(rename = "type")] pub tool_type: String, // "function" pub function: Function, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct Function { pub name: String, #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, pub parameters: Value, // JSON Schema } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ToolCall { pub id: String, #[serde(rename = "type")] pub tool_type: String, // "function" pub function: FunctionCallResponse, } #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum FunctionCall { None, Auto, Function { name: String }, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct FunctionCallResponse { pub name: String, #[serde(default)] pub arguments: Option, // JSON string } // ============= Usage Tracking ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct Usage { pub prompt_tokens: u32, pub completion_tokens: u32, pub total_tokens: u32, #[serde(skip_serializing_if = "Option::is_none")] pub completion_tokens_details: Option, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct CompletionTokensDetails { pub reasoning_tokens: Option, } // ============= Logprobs Types ============= #[derive(Debug, Clone, Deserialize, Serialize)] pub struct LogProbs { pub tokens: Vec, pub token_logprobs: Vec>, pub top_logprobs: Vec>>, pub text_offset: Vec, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatLogProbs { pub content: Option>, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ChatLogProbsContent { pub token: String, pub logprob: f32, pub bytes: Option>, pub top_logprobs: Vec, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct TopLogProb { pub token: String, pub logprob: f32, pub bytes: Option>, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ErrorResponse { pub error: ErrorDetail, } #[derive(Debug, Clone, Deserialize, Serialize)] pub struct ErrorDetail { pub message: String, #[serde(rename = "type")] pub error_type: String, #[serde(skip_serializing_if = "Option::is_none")] pub param: Option, #[serde(skip_serializing_if = "Option::is_none")] pub code: Option, } // ================================================================== // = SGLANG SPEC - GENERATE API = // ================================================================== #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum InputIds { Single(Vec), Batch(Vec>), } #[derive(Debug, Clone, Deserialize, Serialize, Default)] pub struct GenerateParameters { #[serde(skip_serializing_if = "Option::is_none")] pub best_of: Option, #[serde(skip_serializing_if = "Option::is_none")] pub decoder_input_details: Option, #[serde(skip_serializing_if = "Option::is_none")] pub details: Option, #[serde(skip_serializing_if = "Option::is_none")] pub do_sample: Option, #[serde(skip_serializing_if = "Option::is_none")] pub max_new_tokens: Option, #[serde(skip_serializing_if = "Option::is_none")] pub repetition_penalty: Option, #[serde(skip_serializing_if = "Option::is_none")] pub return_full_text: Option, #[serde(skip_serializing_if = "Option::is_none")] pub seed: Option, #[serde(skip_serializing_if = "Option::is_none")] pub stop: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub temperature: Option, #[serde(skip_serializing_if = "Option::is_none")] pub top_k: Option, #[serde(skip_serializing_if = "Option::is_none")] pub top_p: Option, #[serde(skip_serializing_if = "Option::is_none")] pub truncate: Option, #[serde(skip_serializing_if = "Option::is_none")] pub typical_p: Option, #[serde(skip_serializing_if = "Option::is_none")] pub watermark: Option, } #[derive(Debug, Clone, Deserialize, Serialize, Default)] pub struct SamplingParams { #[serde(skip_serializing_if = "Option::is_none")] pub temperature: Option, #[serde(skip_serializing_if = "Option::is_none")] pub max_new_tokens: Option, #[serde(skip_serializing_if = "Option::is_none")] pub top_p: Option, #[serde(skip_serializing_if = "Option::is_none")] pub top_k: Option, #[serde(skip_serializing_if = "Option::is_none")] pub frequency_penalty: Option, #[serde(skip_serializing_if = "Option::is_none")] pub presence_penalty: Option, #[serde(skip_serializing_if = "Option::is_none")] pub repetition_penalty: Option, #[serde(skip_serializing_if = "Option::is_none")] pub stop: Option, #[serde(skip_serializing_if = "Option::is_none")] pub ignore_eos: Option, #[serde(skip_serializing_if = "Option::is_none")] pub skip_special_tokens: Option, #[serde(skip_serializing_if = "Option::is_none")] pub json_schema: Option, #[serde(skip_serializing_if = "Option::is_none")] pub regex: Option, #[serde(skip_serializing_if = "Option::is_none")] pub ebnf: Option, #[serde(skip_serializing_if = "Option::is_none")] pub min_p: Option, #[serde(skip_serializing_if = "Option::is_none")] pub min_tokens: Option, #[serde(skip_serializing_if = "Option::is_none")] pub stop_token_ids: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub no_stop_trim: Option, } #[derive(Clone, Debug, Serialize, Deserialize)] pub struct GenerateRequest { /// The prompt to generate from (OpenAI style) #[serde(skip_serializing_if = "Option::is_none")] pub prompt: Option, /// Text input - SGLang native format #[serde(skip_serializing_if = "Option::is_none")] pub text: Option, /// Input IDs for tokenized input #[serde(skip_serializing_if = "Option::is_none")] pub input_ids: Option, /// Generation parameters #[serde(default, skip_serializing_if = "Option::is_none")] pub parameters: Option, /// Sampling parameters (sglang style) #[serde(skip_serializing_if = "Option::is_none")] pub sampling_params: Option, /// Whether to stream the response #[serde(default)] pub stream: bool, /// Whether to return logprobs #[serde(default)] pub return_logprob: bool, // ============= SGLang Extensions ============= /// Path to LoRA adapter(s) for model customization #[serde(skip_serializing_if = "Option::is_none")] pub lora_path: Option, /// Session parameters for continual prompting #[serde(skip_serializing_if = "Option::is_none")] pub session_params: Option>, /// Return model hidden states #[serde(default)] pub return_hidden_states: bool, /// Request ID for tracking #[serde(skip_serializing_if = "Option::is_none")] pub rid: Option, } impl GenerationRequest for GenerateRequest { fn is_stream(&self) -> bool { self.stream } fn get_model(&self) -> Option<&str> { // Generate requests typically don't have a model field None } fn extract_text_for_routing(&self) -> String { // Check fields in priority order: text, prompt, inputs if let Some(ref text) = self.text { return text.clone(); } if let Some(ref prompt) = self.prompt { return match prompt { StringOrArray::String(s) => s.clone(), StringOrArray::Array(v) => v.join(" "), }; } if let Some(ref input_ids) = self.input_ids { return match input_ids { InputIds::Single(ids) => ids .iter() .map(|&id| id.to_string()) .collect::>() .join(" "), InputIds::Batch(batches) => batches .iter() .flat_map(|batch| batch.iter().map(|&id| id.to_string())) .collect::>() .join(" "), }; } // No text input found String::new() } } // ================================================================== // = COMMON = // ================================================================== /// Helper function for serde default value pub fn default_true() -> bool { true } /// Common trait for all generation requests across different APIs pub trait GenerationRequest: Send + Sync { /// Check if the request is for streaming fn is_stream(&self) -> bool; /// Get the model name if specified fn get_model(&self) -> Option<&str>; /// Extract text content for routing decisions fn extract_text_for_routing(&self) -> String; } /// Helper type for string or array of strings #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum StringOrArray { String(String), Array(Vec), } impl StringOrArray { /// Get the number of items in the StringOrArray pub fn len(&self) -> usize { match self { StringOrArray::String(_) => 1, StringOrArray::Array(arr) => arr.len(), } } /// Check if the StringOrArray is empty pub fn is_empty(&self) -> bool { match self { StringOrArray::String(s) => s.is_empty(), StringOrArray::Array(arr) => arr.is_empty(), } } /// Convert to a vector of strings pub fn to_vec(&self) -> Vec { match self { StringOrArray::String(s) => vec![s.clone()], StringOrArray::Array(arr) => arr.clone(), } } } /// LoRA adapter path - can be single path or batch of paths (SGLang extension) #[derive(Debug, Clone, Deserialize, Serialize)] #[serde(untagged)] pub enum LoRAPath { Single(Option), Batch(Vec>), }