[router][grpc] Implement tool_choice support for Responses API (#12668)

9f5e7018 · Chang Su · GitHub · cbf23dbb · 9f5e7018 · 9f5e7018
Unverified Commit 9f5e7018 authored Nov 04, 2025 by Chang Su Committed by GitHub Nov 04, 2025
14 changed files
--- a/sgl-router/src/grpc_client/sglang_scheduler.rs
+++ b/sgl-router/src/grpc_client/sglang_scheduler.rs
@@ -303,6 +303,9 @@ impl SglangSchedulerClient {
    }

    /// Build a GenerateRequest from ResponsesRequest (OpenAI Responses API)
+    ///
+    /// NOTE: This is used by the Harmony router only. The Regular router uses
+    /// responses_to_chat() conversion and goes through the chat pipeline.
    pub fn build_generate_request_from_responses(
        &self,
        request_id: String,
@@ -310,9 +313,11 @@ impl SglangSchedulerClient {
        processed_text: String,
        token_ids: Vec<u32>,
        harmony_stop_ids: Option<Vec<u32>>,
+        tool_call_constraint: Option<(String, String)>,
    ) -> Result<proto::GenerateRequest, String> {
        // Build sampling params from ResponsesRequest
-        let mut sampling_params = self.build_grpc_sampling_params_from_responses(body)?;
+        let mut sampling_params =
+            self.build_grpc_sampling_params_from_responses(body, tool_call_constraint)?;

        // Inject Harmony stop token IDs if provided
        if let Some(stop_ids) = harmony_stop_ids {
@@ -441,9 +446,10 @@ impl SglangSchedulerClient {
    fn build_grpc_sampling_params_from_responses(
        &self,
        request: &ResponsesRequest,
+        tool_call_constraint: Option<(String, String)>,
    ) -> Result<proto::SamplingParams, String> {
        // ResponsesRequest doesn't have stop sequences in the same way
-        // Tools are handled externally by MCP loop, not via constraints
+        // For Harmony router: Tools are handled via structural_tag constraints

        let max_new_tokens = request.max_output_tokens.map(|v| v as i32);

@@ -462,12 +468,36 @@ impl SglangSchedulerClient {
            spaces_between_special_tokens: true,
            ignore_eos: false,
            no_stop_trim: false,
-            n: 1,             // Responses API doesn't support n>1
-            constraint: None, // No constraints - tools handled by MCP
+            n: 1, // Responses API doesn't support n>1
+            constraint: self.build_constraint_for_responses(tool_call_constraint)?,
            ..Default::default()
        })
    }

+    /// Translate an optional tool-call constraint into a gRPC sampling constraint.
+    ///
+    /// Unlike the Chat API's build_constraint, no request-level `response_format`,
+    /// `ebnf`, or `regex` fields are consulted here; the only input is
+    /// `tool_call_constraint`, a `(kind, payload)` pair.
+    ///
+    /// Recognized kinds are `structural_tag`, `json_schema`, `ebnf`, and `regex`.
+    /// Returns `Ok(None)` when no constraint was supplied and `Err` with a
+    /// descriptive message when the kind is not recognized.
+    fn build_constraint_for_responses(
+        &self,
+        tool_call_constraint: Option<(String, String)>,
+    ) -> Result<Option<proto::sampling_params::Constraint>, String> {
+        // Nothing requested: leave sampling unconstrained.
+        let Some((kind, payload)) = tool_call_constraint else {
+            return Ok(None);
+        };
+
+        // The payload is moved into whichever variant the kind selects.
+        let constraint = match kind.as_str() {
+            "structural_tag" => proto::sampling_params::Constraint::StructuralTag(payload),
+            "json_schema" => proto::sampling_params::Constraint::JsonSchema(payload),
+            "ebnf" => proto::sampling_params::Constraint::EbnfGrammar(payload),
+            "regex" => proto::sampling_params::Constraint::Regex(payload),
+            _ => return Err(format!("Unknown constraint type: {}", kind)),
+        };
+
+        Ok(Some(constraint))
+    }
+
    fn build_single_constraint_from_plain(
        params: &GenerateSamplingParams,
    ) -> Result<Option<proto::sampling_params::Constraint>, String> {

--- a/sgl-router/src/protocols/chat.rs
+++ b/sgl-router/src/protocols/chat.rs
@@ -457,21 +457,43 @@ fn validate_chat_cross_parameters(
                        return Err(e);
                    }

-                    // Validate that all referenced tool names exist in tools
+                    // Validate that all ToolReferences are Function type (Chat API only supports function tools)
                    for tool_ref in allowed_tools {
-                        let tool_exists = tools.iter().any(|tool| {
-                            tool.tool_type == tool_ref.tool_type
-                                && tool.function.name == tool_ref.name
-                        });
-
-                        if !tool_exists {
-                            let mut e =
-                                validator::ValidationError::new("tool_choice_tool_not_found");
-                            e.message = Some(format!(
-                                "Invalid value for 'tool_choice.tools': tool '{}' not found in 'tools'.",
-                                tool_ref.name
-                            ).into());
-                            return Err(e);
+                        match tool_ref {
+                            ToolReference::Function { name } => {
+                                // Validate that the function exists in tools array
+                                let tool_exists = tools.iter().any(|tool| {
+                                    tool.tool_type == "function" && tool.function.name == *name
+                                });
+
+                                if !tool_exists {
+                                    let mut e = validator::ValidationError::new(
+                                        "tool_choice_tool_not_found",
+                                    );
+                                    e.message = Some(
+                                        format!(
+                                            "Invalid value for 'tool_choice.tools': tool '{}' not found in 'tools'.",
+                                            name
+                                        )
+                                        .into(),
+                                    );
+                                    return Err(e);
+                                }
+                            }
+                            _ => {
+                                // Chat Completion API only supports function tools in tool_choice
+                                let mut e = validator::ValidationError::new(
+                                    "tool_choice_invalid_tool_type",
+                                );
+                                e.message = Some(
+                                    format!(
+                                        "Invalid value for 'tool_choice.tools': Chat Completion API only supports function tools, got '{}'.",
+                                        tool_ref.identifier()
+                                    )
+                                    .into(),
+                                );
+                                return Err(e);
+                            }
                        }
                    }
                }

--- a/sgl-router/src/protocols/common.rs
+++ b/sgl-router/src/protocols/common.rs
@@ -183,6 +183,18 @@ impl Default for ToolChoice {
    }
 }

+impl ToolChoice {
+    /// Render an optional tool_choice as the string stored on ResponsesResponse.
+    ///
+    /// * `None` yields the literal `auto`.
+    /// * `Some(choice)` yields the `serde_json` encoding of `choice`; if encoding
+    ///   fails, the result falls back to `auto` as well.
+    ///
+    /// NOTE(review): for variants that serialize as a plain JSON string the
+    /// encoded output presumably includes the surrounding JSON quotes, whereas
+    /// the fallback is the bare word `auto` - confirm this is what consumers expect.
+    pub fn serialize_to_string(tool_choice: &Option<ToolChoice>) -> String {
+        match tool_choice {
+            Some(choice) => match serde_json::to_string(choice) {
+                Ok(encoded) => encoded,
+                Err(_) => "auto".to_string(),
+            },
+            None => "auto".to_string(),
+        }
+    }
+}
+
 /// Function choice specification for ToolChoice::Function
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct FunctionChoice {
@@ -190,11 +202,73 @@ pub struct FunctionChoice {
 }

 /// Tool reference for ToolChoice::AllowedTools
+///
+/// Represents a reference to a specific tool in the allowed_tools array.
+/// Different tool types have different required fields.
+///
+/// Serialized as an internally tagged object: the `type` field selects the
+/// variant (for example `{"type": "function", "name": "..."}`), and the
+/// hosted-tool variants carry no fields beyond `type`.
 #[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct ToolReference {
-    #[serde(rename = "type")]
-    pub tool_type: String, // "function"
-    pub name: String,
+#[serde(tag = "type")]
+// Every variant below also has an explicit `rename`; those values match what
+// snake_case renaming of the variant names produces.
+#[serde(rename_all = "snake_case")]
+pub enum ToolReference {
+    /// Reference to a function tool
+    // `name` is the function name the reference points at.
+    #[serde(rename = "function")]
+    Function { name: String },
+
+    /// Reference to an MCP tool
+    // `server_label` is always present; `name` is optional and is omitted
+    // from the serialized form when it is `None`.
+    #[serde(rename = "mcp")]
+    Mcp {
+        server_label: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        name: Option<String>,
+    },
+
+    /// File search hosted tool
+    #[serde(rename = "file_search")]
+    FileSearch,
+
+    /// Web search preview hosted tool
+    #[serde(rename = "web_search_preview")]
+    WebSearchPreview,
+
+    /// Computer use preview hosted tool
+    #[serde(rename = "computer_use_preview")]
+    ComputerUsePreview,
+
+    /// Code interpreter hosted tool
+    #[serde(rename = "code_interpreter")]
+    CodeInterpreter,
+
+    /// Image generation hosted tool
+    #[serde(rename = "image_generation")]
+    ImageGeneration,
+}
+
+impl ToolReference {
+    /// Build a string identifier for this tool reference.
+    ///
+    /// Function references render as `function:<name>`, MCP references as
+    /// `mcp:<server_label>` with `:<name>` appended when a tool name is set,
+    /// and hosted tools as their snake_case type name.
+    pub fn identifier(&self) -> String {
+        match self {
+            Self::Function { name } => format!("function:{}", name),
+            // MCP references may or may not pin a specific tool on the server.
+            Self::Mcp {
+                server_label,
+                name: Some(tool),
+            } => format!("mcp:{}:{}", server_label, tool),
+            Self::Mcp {
+                server_label,
+                name: None,
+            } => format!("mcp:{}", server_label),
+            Self::FileSearch => String::from("file_search"),
+            Self::WebSearchPreview => String::from("web_search_preview"),
+            Self::ComputerUsePreview => String::from("computer_use_preview"),
+            Self::CodeInterpreter => String::from("code_interpreter"),
+            Self::ImageGeneration => String::from("image_generation"),
+        }
+    }
+
+    /// Return the function name for a `Function` reference, `None` for any other variant.
+    pub fn function_name(&self) -> Option<&str> {
+        if let Self::Function { name } = self {
+            Some(name.as_str())
+        } else {
+            None
+        }
+    }
 }

 #[derive(Debug, Clone, Deserialize, Serialize)]

--- a/sgl-router/src/protocols/responses.rs
+++ b/sgl-router/src/protocols/responses.rs
@@ -447,6 +447,7 @@ fn default_top_p() -> Option<f32> {
 // ============================================================================

 #[derive(Debug, Clone, Deserialize, Serialize, Validate)]
+#[validate(schema(function = "validate_responses_cross_parameters"))]
 pub struct ResponsesRequest {
    /// Run the request in the background
    #[serde(skip_serializing_if = "Option::is_none")]
@@ -721,6 +722,83 @@ pub fn validate_conversation_id(conv_id: &str) -> Result<(), validator::Validati
    Ok(())
 }

+/// Cross-field validation between `tools` and `tool_choice` on a ResponsesRequest.
+///
+/// Checks are only performed when both fields are present:
+/// - a specific-function tool_choice must name a function tool listed in `tools`;
+/// - an allowed-tools tool_choice must use mode `auto` or `required`, and every
+///   function reference in it must name a function tool listed in `tools`.
+///
+/// Non-function references (MCP and hosted tools) are not checked for existence here.
+fn validate_responses_cross_parameters(
+    request: &ResponsesRequest,
+) -> Result<(), validator::ValidationError> {
+    use super::common::{ToolChoice, ToolReference};
+
+    // Build a ValidationError with the given code and human-readable message.
+    fn reject(code: &'static str, message: String) -> validator::ValidationError {
+        let mut err = validator::ValidationError::new(code);
+        err.message = Some(message.into());
+        err
+    }
+
+    // Nothing to cross-check unless both fields were supplied.
+    let (Some(tools), Some(tool_choice)) = (&request.tools, &request.tool_choice) else {
+        return Ok(());
+    };
+
+    // Names of the function-type tools declared on the request.
+    let declared_functions: Vec<&str> = tools
+        .iter()
+        .filter(|t| matches!(t.r#type, ResponseToolType::Function))
+        .filter_map(|t| t.function.as_ref().map(|f| f.name.as_str()))
+        .collect();
+    let is_declared = |name: &str| declared_functions.iter().any(|known| *known == name);
+
+    match tool_choice {
+        ToolChoice::Function { function, .. } if !is_declared(function.name.as_str()) => {
+            Err(reject(
+                "tool_choice_function_not_found",
+                format!(
+                    "Invalid value for 'tool_choice': function '{}' not found in 'tools'.",
+                    function.name
+                ),
+            ))
+        }
+        ToolChoice::AllowedTools {
+            mode,
+            tools: allowed_tools,
+            ..
+        } => {
+            // Only "auto" and "required" are accepted modes.
+            if !(mode == "auto" || mode == "required") {
+                return Err(reject(
+                    "tool_choice_invalid_mode",
+                    format!(
+                        "Invalid value for 'tool_choice.mode': must be 'auto' or 'required', got '{}'.",
+                        mode
+                    ),
+                ));
+            }
+
+            // First function reference (in order) that is not declared in `tools`.
+            // MCP and hosted tool references are skipped: they are resolved at runtime.
+            let missing = allowed_tools.iter().find_map(|tool_ref| match tool_ref {
+                ToolReference::Function { name } if !is_declared(name.as_str()) => Some(name),
+                _ => None,
+            });
+
+            match missing {
+                Some(name) => Err(reject(
+                    "tool_choice_tool_not_found",
+                    format!(
+                        "Invalid value for 'tool_choice.tools': tool '{}' not found in 'tools'.",
+                        name
+                    ),
+                )),
+                None => Ok(()),
+            }
+        }
+        _ => Ok(()),
+    }
+}
+
 /// Normalize a SimpleInputMessage to a proper Message item
 ///
 /// This helper converts SimpleInputMessage (which can have flexible content)

--- a/sgl-router/src/routers/grpc/common/responses/utils.rs
+++ b/sgl-router/src/routers/grpc/common/responses/utils.rs
@@ -11,7 +11,10 @@ use serde_json::json;
 use crate::{
    core::WorkerRegistry,
    mcp::McpManager,
-    protocols::responses::{ResponseTool, ResponseToolType},
+    protocols::{
+        common::Tool,
+        responses::{ResponseTool, ResponseToolType},
+    },
    routers::{grpc::error, openai::mcp::ensure_request_mcp_client},
 };

@@ -76,3 +79,47 @@ pub fn validate_worker_availability(

    None
 }
+
+/// Collect the tools that carry a function schema out of a ResponseTool list.
+///
+/// Both routers share this helper, with different settings:
+///
+/// - **Harmony router** passes `include_mcp: true`, because MCP schemas are
+///   populated by convert_mcp_tools_to_response_tools() before the pipeline runs.
+///   The resulting tools feed structural-constraint generation in the Harmony
+///   preparation stage.
+///
+/// - **Regular router** passes `include_mcp: false` while converting a
+///   ResponsesRequest into a ChatCompletionRequest. MCP tools are merged later by
+///   the tool loop, and tool_choice constraints are then generated for ALL tools
+///   (function + MCP combined) in the chat pipeline.
+///
+/// Every returned `Tool` has `tool_type` set to `"function"`. Entries without a
+/// function schema, and hosted tool types, are skipped. `None` input yields an
+/// empty vector.
+pub fn extract_tools_from_response_tools(
+    response_tools: Option<&[ResponseTool]>,
+    include_mcp: bool,
+) -> Vec<Tool> {
+    response_tools
+        .into_iter()
+        .flatten()
+        // Keep function tools always, MCP tools only on request, nothing else.
+        .filter(|rt| match rt.r#type {
+            ResponseToolType::Function => true,
+            ResponseToolType::Mcp => include_mcp,
+            _ => false,
+        })
+        // A tool without a schema cannot be turned into a chat Tool.
+        .filter_map(|rt| rt.function.as_ref())
+        .map(|schema| Tool {
+            tool_type: "function".to_string(),
+            function: schema.clone(),
+        })
+        .collect()
+}
--- a/sgl-router/src/routers/grpc/harmony/responses.rs
+++ b/sgl-router/src/routers/grpc/harmony/responses.rs
@@ -52,7 +52,7 @@ use crate::{
    data_connector::{ResponseId, ResponseStorage},
    mcp::{self, McpManager},
    protocols::{
-        common::{Function, ToolCall, Usage},
+        common::{Function, ToolCall, ToolChoice, ToolChoiceValue, Usage},
        responses::{
            McpToolInfo, ResponseContentPart, ResponseInput, ResponseInputOutputItem,
            ResponseOutputItem, ResponseReasoningContent, ResponseStatus, ResponseTool,
@@ -467,15 +467,6 @@ async fn execute_without_mcp_loop(
 /// - Calls `streaming::process_responses_iteration_stream()` for per-iteration events
 /// - Emits `response.completed` at end
 /// - Handles errors with `response.failed`
-///
-/// # Arguments
-///
-/// * `ctx` - Harmony responses context with pipeline and dependencies
-/// * `request` - Responses API request
-///
-/// # Returns
-///
-/// SSE stream response with proper headers
 pub async fn serve_harmony_responses_stream(
    ctx: &HarmonyResponsesContext,
    request: ResponsesRequest,
@@ -1189,6 +1180,11 @@ fn build_next_request_with_tools(
    // Update request with new items
    request.input = ResponseInput::Items(items);

+    // Switch tool_choice to "auto" for subsequent iterations
+    // This prevents infinite loops when original tool_choice was "required" or specific function
+    // After receiving tool results, the model should be free to decide whether to call more tools or finish
+    request.tool_choice = Some(ToolChoice::Value(ToolChoiceValue::Auto));
+
    Ok(request)
 }

@@ -1214,14 +1210,6 @@ struct ToolResult {
 ///
 /// Converts MCP Tool entries (from rmcp SDK) to ResponseTool format so the model
 /// knows about available MCP tools when making tool calls.
-///
-/// # Arguments
-///
-/// * `mcp_tools` - MCP tools from the MCP manager inventory (rmcp::model::Tool)
-///
-/// # Returns
-///
-/// Vector of ResponseTool entries in MCP format
 pub fn convert_mcp_tools_to_response_tools(mcp_tools: &[mcp::Tool]) -> Vec<ResponseTool> {
    mcp_tools
        .iter()

--- a/sgl-router/src/routers/grpc/harmony/stages/preparation.rs
+++ b/sgl-router/src/routers/grpc/harmony/stages/preparation.rs
@@ -12,7 +12,7 @@ use crate::{
        responses::ResponsesRequest,
    },
    routers::grpc::{
-        common::stages::PipelineStage,
+        common::{responses::utils::extract_tools_from_response_tools, stages::PipelineStage},
        context::{PreparationOutput, RequestContext, RequestType},
        error, utils,
    },
@@ -84,7 +84,7 @@ impl HarmonyPreparationStage {
        }

        // Step 1: Filter tools if needed
-        let body_ref = utils::filter_tools_for_request(request);
+        let body_ref = utils::filter_chat_request_by_tool_choice(request);

        // Step 2: Build tool constraints
        let tool_constraints = if let Some(tools) = body_ref.tools.as_ref() {
@@ -128,18 +128,37 @@ impl HarmonyPreparationStage {
        ctx: &mut RequestContext,
        request: &ResponsesRequest,
    ) -> Result<Option<Response>, Response> {
-        // Build via Harmony from responses API request
+        // Step 1: Extract function and MCP tools with schemas from ResponseTools
+        let mut function_tools = extract_tools_from_response_tools(request.tools.as_deref(), true);
+
+        // Step 2: Filter tools based on tool_choice (AllowedTools or Function)
+        // Note: Tool existence is already validated in ResponsesRequest::validate()
+        if let Some(filtered) =
+            utils::filter_tools_by_tool_choice(&function_tools, &request.tool_choice)
+        {
+            function_tools = filtered;
+        }
+
+        // Step 3: Generate Harmony structural tags from filtered tools
+        let tool_constraints = if !function_tools.is_empty() {
+            Self::generate_harmony_structural_tag(&function_tools, &request.tool_choice)
+                .map_err(|e| *e)?
+        } else {
+            None
+        };
+
+        // Step 4: Build via Harmony from responses API request
        let build_output = self
            .builder
            .build_from_responses(request)
            .map_err(|e| error::bad_request(format!("Harmony build failed: {}", e)))?;

-        // Store results in preparation output
+        // Step 5: Store results with tool_constraints
        ctx.state.preparation = Some(PreparationOutput {
            original_text: None,
            token_ids: build_output.input_ids,
            processed_messages: None,
-            tool_constraints: None,
+            tool_constraints,
            filtered_request: None,
            harmony_mode: true,
            selection_text: Some(build_output.selection_text),

--- a/sgl-router/src/routers/grpc/harmony/stages/request_building.rs
+++ b/sgl-router/src/routers/grpc/harmony/stages/request_building.rs
@@ -84,6 +84,7 @@ impl PipelineStage for HarmonyRequestBuildingStage {
                    placeholder_processed_text,
                    prep.token_ids.clone(),
                    prep.harmony_stop_ids.clone(),
+                    prep.tool_constraints.clone(),
                )
                .map_err(|e| error::bad_request(format!("Invalid request parameters: {}", e)))?,
            _ => unreachable!(),

--- a/sgl-router/src/routers/grpc/regular/responses/conversions.rs
+++ b/sgl-router/src/routers/grpc/regular/responses/conversions.rs
@@ -7,14 +7,17 @@
 //! This allows the gRPC router to reuse the existing chat pipeline infrastructure
 //! without requiring Python backend changes.

-use crate::protocols::{
-    chat::{ChatCompletionRequest, ChatCompletionResponse, ChatMessage, UserMessageContent},
-    common::{FunctionCallResponse, StreamOptions, ToolCall, UsageInfo},
-    responses::{
-        ResponseContentPart, ResponseInput, ResponseInputOutputItem, ResponseOutputItem,
-        ResponseReasoningContent::ReasoningText, ResponseStatus, ResponsesRequest,
-        ResponsesResponse, ResponsesUsage, StringOrContentParts,
+use crate::{
+    protocols::{
+        chat::{ChatCompletionRequest, ChatCompletionResponse, ChatMessage, UserMessageContent},
+        common::{FunctionCallResponse, StreamOptions, ToolCall, ToolChoice, UsageInfo},
+        responses::{
+            ResponseContentPart, ResponseInput, ResponseInputOutputItem, ResponseOutputItem,
+            ResponseReasoningContent::ReasoningText, ResponseStatus, ResponsesRequest,
+            ResponsesResponse, ResponsesUsage, StringOrContentParts,
+        },
    },
+    routers::grpc::common::responses::utils::extract_tools_from_response_tools,
 };

 /// Convert a ResponsesRequest to ChatCompletionRequest for processing through the chat pipeline
@@ -23,7 +26,8 @@ use crate::protocols::{
 /// - `input` (text/items) → `messages` (chat messages)
 /// - `instructions` → system message (prepended)
 /// - `max_output_tokens` → `max_completion_tokens`
-/// - Tool-related fields are passed through
+/// - `tools` → function tools extracted from ResponseTools
+/// - `tool_choice` → passed through from request
 /// - Response-specific fields (previous_response_id, conversation) are handled by router
 pub fn responses_to_chat(req: &ResponsesRequest) -> Result<ChatCompletionRequest, String> {
    let mut messages = Vec::new();
@@ -68,69 +72,13 @@ pub fn responses_to_chat(req: &ResponsesRequest) -> Result<ChatCompletionRequest
                            }
                        };

-                        match role.as_str() {
-                            "user" => {
-                                messages.push(ChatMessage::User {
-                                    content: UserMessageContent::Text(text),
-                                    name: None,
-                                });
-                            }
-                            "assistant" => {
-                                messages.push(ChatMessage::Assistant {
-                                    content: Some(text),
-                                    name: None,
-                                    tool_calls: None,
-                                    reasoning_content: None,
-                                });
-                            }
-                            "system" => {
-                                messages.push(ChatMessage::System {
-                                    content: text,
-                                    name: None,
-                                });
-                            }
-                            _ => {
-                                // Unknown role, treat as user message
-                                messages.push(ChatMessage::User {
-                                    content: UserMessageContent::Text(text),
-                                    name: None,
-                                });
-                            }
-                        }
+                        messages.push(role_to_chat_message(role.as_str(), text));
                    }
                    ResponseInputOutputItem::Message { role, content, .. } => {
                        // Extract text from content parts
                        let text = extract_text_from_content(content);

-                        match role.as_str() {
-                            "user" => {
-                                messages.push(ChatMessage::User {
-                                    content: UserMessageContent::Text(text),
-                                    name: None,
-                                });
-                            }
-                            "assistant" => {
-                                messages.push(ChatMessage::Assistant {
-                                    content: Some(text),
-                                    name: None,
-                                    tool_calls: None,
-                                    reasoning_content: None,
-                                });
-                            }
-                            "system" => {
-                                messages.push(ChatMessage::System {
-                                    content: text,
-                                    name: None,
-                                });
-                            }
-                            _ => {
-                                // Unknown role, treat as user message
-                                messages.push(ChatMessage::User {
-                                    content: UserMessageContent::Text(text),
-                                    name: None,
-                                });
-                            }
-                        }
+                        messages.push(role_to_chat_message(role.as_str(), text));
                    }
                    ResponseInputOutputItem::FunctionToolCall {
                        id,
@@ -203,7 +151,18 @@ pub fn responses_to_chat(req: &ResponsesRequest) -> Result<ChatCompletionRequest
        return Err("Request must contain at least one message".to_string());
    }

-    // 3. Build ChatCompletionRequest
+    // 3. Extract function tools from ResponseTools
+    // Only function tools are extracted here (include_mcp: false).
+    // MCP tools are merged later by the tool loop (see tool_loop.rs:prepare_chat_tools_and_choice)
+    // before the chat pipeline, where tool_choice constraints are applied to ALL tools combined.
+    let function_tools = extract_tools_from_response_tools(req.tools.as_deref(), false);
+    let tools = if function_tools.is_empty() {
+        None
+    } else {
+        Some(function_tools)
+    };
+
+    // 4. Build ChatCompletionRequest
    let is_streaming = req.stream.unwrap_or(false);

    Ok(ChatCompletionRequest {
@@ -227,9 +186,8 @@ pub fn responses_to_chat(req: &ResponsesRequest) -> Result<ChatCompletionRequest
        top_logprobs: req.top_logprobs,
        top_p: req.top_p,
        skip_special_tokens: true,
-        // Note: tools and tool_choice will be handled separately for MCP transformation
-        tools: None,       // Will be set by caller if needed
-        tool_choice: None, // Will be set by caller if needed
+        tools,
+        tool_choice: req.tool_choice.clone(),
        ..Default::default()
    })
 }
@@ -247,6 +205,33 @@ fn extract_text_from_content(content: &[ResponseContentPart]) -> String {
        .join("")
 }

+/// Build the ChatMessage variant that corresponds to a role string.
+///
+/// `"assistant"` and `"system"` map to their own variants; `"user"` and any
+/// unrecognized role both produce a user message carrying `text`.
+fn role_to_chat_message(role: &str, text: String) -> ChatMessage {
+    match role {
+        "assistant" => ChatMessage::Assistant {
+            content: Some(text),
+            name: None,
+            tool_calls: None,
+            reasoning_content: None,
+        },
+        "system" => ChatMessage::System {
+            content: text,
+            name: None,
+        },
+        // "user" and every unknown role are treated as a user message.
+        _ => ChatMessage::User {
+            content: UserMessageContent::Text(text),
+            name: None,
+        },
+    }
+}
+
 /// Convert a ChatCompletionResponse to ResponsesResponse
 ///
 /// # Conversion Logic
@@ -354,7 +339,7 @@ pub fn chat_to_responses(
        store: original_req.store.unwrap_or(true),
        temperature: original_req.temperature,
        text: None,
-        tool_choice: "auto".to_string(), // TODO: Map from original request
+        tool_choice: ToolChoice::serialize_to_string(&original_req.tool_choice),
        tools: original_req.tools.clone().unwrap_or_default(),
        top_p: original_req.top_p,
        truncation: None,

--- a/sgl-router/src/routers/grpc/regular/responses/handlers.rs
+++ b/sgl-router/src/routers/grpc/regular/responses/handlers.rs
@@ -58,7 +58,7 @@ use crate::{
    },
    protocols::{
        chat::{self, ChatCompletionStreamResponse},
-        common,
+        common::{self, ToolChoice},
        responses::{
            self, ResponseContentPart, ResponseInput, ResponseInputOutputItem, ResponseOutputItem,
            ResponseReasoningContent, ResponseStatus, ResponsesRequest, ResponsesResponse,
@@ -657,7 +657,7 @@ impl StreamingResponseAccumulator {
            store: self.original_request.store.unwrap_or(true),
            temperature: self.original_request.temperature,
            text: None,
-            tool_choice: "auto".to_string(),
+            tool_choice: ToolChoice::serialize_to_string(&self.original_request.tool_choice),
            tools: self.original_request.tools.clone().unwrap_or_default(),
            top_p: self.original_request.top_p,
            truncation: None,

--- a/sgl-router/src/routers/grpc/regular/responses/tool_loop.rs
+++ b/sgl-router/src/routers/grpc/regular/responses/tool_loop.rs
@@ -13,7 +13,7 @@ use axum::{
 };
 use bytes::Bytes;
 use futures_util::StreamExt;
-use serde_json::json;
+use serde_json::{json, Value};
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, warn};
@@ -24,7 +24,8 @@ use crate::{
    mcp::{self, McpManager},
    protocols::{
        chat::{
-            ChatChoice, ChatCompletionMessage, ChatCompletionResponse, ChatCompletionStreamResponse,
+            ChatChoice, ChatCompletionMessage, ChatCompletionRequest, ChatCompletionResponse,
+            ChatCompletionStreamResponse,
        },
        common::{Function, FunctionCallResponse, Tool, ToolCall, ToolChoice, ToolChoiceValue},
        responses::{
@@ -66,6 +67,30 @@ fn extract_function_call_from_chat(
    None
 }

+/// Append MCP tools after the request's own tools and set this iteration's tool_choice (mutates in place)
+fn prepare_chat_tools_and_choice(
+    chat_request: &mut ChatCompletionRequest,
+    mcp_chat_tools: &[Tool],
+    iteration: usize,
+) {
+    // Request tools come first, MCP tools are appended; `tools` ends up `Some` even if both are empty
+    let mut all_tools = chat_request.tools.clone().unwrap_or_default();
+    all_tools.extend(mcp_chat_tools.iter().cloned());
+    chat_request.tools = Some(all_tools);
+
+    // Choose tool_choice from the loop iteration index
+    // - Iteration 0: Keep the tool_choice already on the request, or default to auto if unset
+    // - Iteration 1+: Always use auto (overrides the request's value) to avoid infinite loops
+    chat_request.tool_choice = if iteration == 0 {
+        chat_request
+            .tool_choice
+            .clone()
+            .or(Some(ToolChoice::Value(ToolChoiceValue::Auto)))
+    } else {
+        Some(ToolChoice::Value(ToolChoiceValue::Auto))
+    };
+}
+
 /// Extract all tool calls from chat response (for parallel tool call support)
 fn extract_all_tool_calls_from_chat(
    response: &ChatCompletionResponse,
@@ -166,16 +191,13 @@ fn build_mcp_list_tools_item(mcp: &Arc<McpManager>, server_label: &str) -> Respo
    let tools = mcp.list_tools();
    let tools_info: Vec<McpToolInfo> = tools
        .iter()
-        .map(|t| {
-            use serde_json::Value;
-            McpToolInfo {
-                name: t.name.to_string(),
-                description: t.description.as_ref().map(|d| d.to_string()),
-                input_schema: Value::Object((*t.input_schema).clone()),
-                annotations: Some(json!({
-                    "read_only": false
-                })),
-            }
+        .map(|t| McpToolInfo {
+            name: t.name.to_string(),
+            description: t.description.as_ref().map(|d| d.to_string()),
+            input_schema: Value::Object((*t.input_schema).clone()),
+            annotations: Some(json!({
+                "read_only": false
+            })),
        })
        .collect();

@@ -247,17 +269,19 @@ pub(super) async fn execute_tool_loop(

    // Get MCP tools and convert to chat format (do this once before loop)
    let mcp_tools = ctx.mcp_manager.list_tools();
-    let chat_tools = convert_mcp_tools_to_chat_tools(&mcp_tools);
-    debug!("Converted {} MCP tools to chat format", chat_tools.len());
+    let mcp_chat_tools = convert_mcp_tools_to_chat_tools(&mcp_tools);
+    debug!(
+        "Converted {} MCP tools to chat format",
+        mcp_chat_tools.len()
+    );

    loop {
        // Convert to chat request
        let mut chat_request = conversions::responses_to_chat(&current_request)
            .map_err(|e| error::bad_request(format!("Failed to convert request: {}", e)))?;

-        // Add MCP tools to chat request so LLM knows about them
-        chat_request.tools = Some(chat_tools.clone());
-        chat_request.tool_choice = Some(ToolChoice::Value(ToolChoiceValue::Auto));
+        // Prepare tools and tool_choice for this iteration
+        prepare_chat_tools_and_choice(&mut chat_request, &mcp_chat_tools, state.iteration);

        // Execute chat pipeline (errors already have proper HTTP status codes)
        let chat_response = ctx
@@ -555,10 +579,10 @@ async fn execute_tool_loop_streaming_internal(

    // Get MCP tools and convert to chat format (do this once before loop)
    let mcp_tools = ctx.mcp_manager.list_tools();
-    let chat_tools = convert_mcp_tools_to_chat_tools(&mcp_tools);
+    let mcp_chat_tools = convert_mcp_tools_to_chat_tools(&mcp_tools);
    debug!(
        "Streaming: Converted {} MCP tools to chat format",
-        chat_tools.len()
+        mcp_chat_tools.len()
    );

    // Flag to track if mcp_list_tools has been emitted
@@ -584,7 +608,6 @@ async fn execute_tool_loop_streaming_internal(
            let tool_items: Vec<_> = mcp_tools
                .iter()
                .map(|t| {
-                    use serde_json::Value;
                    json!({
                        "name": t.name,
                        "description": t.description,
@@ -635,9 +658,8 @@ async fn execute_tool_loop_streaming_internal(
        let mut chat_request = conversions::responses_to_chat(&current_request)
            .map_err(|e| format!("Failed to convert request: {}", e))?;

-        // Add MCP tools to chat request so LLM knows about them
-        chat_request.tools = Some(chat_tools.clone());
-        chat_request.tool_choice = Some(ToolChoice::Value(ToolChoiceValue::Auto));
+        // Prepare tools and tool_choice for this iteration (same logic as non-streaming)
+        prepare_chat_tools_and_choice(&mut chat_request, &mcp_chat_tools, state.iteration);

        // Execute chat streaming
        let response = ctx
@@ -913,7 +935,6 @@ async fn execute_tool_loop_streaming_internal(

 /// Convert MCP tools to Chat API tool format
 fn convert_mcp_tools_to_chat_tools(mcp_tools: &[mcp::Tool]) -> Vec<Tool> {
-    use serde_json::Value;
    mcp_tools
        .iter()
        .map(|tool_info| Tool {

--- a/sgl-router/src/routers/grpc/regular/stages/chat/preparation.rs
+++ b/sgl-router/src/routers/grpc/regular/stages/chat/preparation.rs
@@ -40,7 +40,7 @@ impl ChatPreparationStage {
        request: &ChatCompletionRequest,
    ) -> Result<(), Response> {
        // Step 1: Filter tools if needed
-        let body_ref = utils::filter_tools_for_request(request);
+        let body_ref = utils::filter_chat_request_by_tool_choice(request);

        // Step 2: Process messages and apply chat template
        let processed_messages =

--- a/sgl-router/src/routers/grpc/utils.rs
+++ b/sgl-router/src/routers/grpc/utils.rs
@@ -9,7 +9,6 @@ use tracing::{error, warn};
 use uuid::Uuid;

 use super::{error, ProcessedMessages};
-pub use crate::tokenizer::StopSequenceDecoder;
 use crate::{
    core::Worker,
    grpc_client::{proto, sglang_scheduler::AbortOnDropStream, SglangSchedulerClient},
@@ -28,8 +27,9 @@ use crate::{
    tokenizer::{
        cache::CachedTokenizer,
        chat_template::{ChatTemplateContentFormat, ChatTemplateParams},
+        stop::StopSequenceDecoderBuilder,
        traits::Tokenizer,
-        HuggingFaceTokenizer,
+        HuggingFaceTokenizer, StopSequenceDecoder,
    },
    tool_parser::{
        ParserFactory as ToolParserFactory, PooledParser as ToolPooledParser, ToolParser,
@@ -273,39 +273,57 @@ fn build_required_array_schema(tools: &[Tool]) -> Result<String, String> {
        .map_err(|e| format!("Failed to serialize tool schema: {}", e))
 }

-/// Filter tools based on tool_choice (shared by both routers)
-/// Returns a reference to the original body if no filtering needed,
-/// otherwise returns a cloned and filtered body
-pub fn filter_tools_for_request(
-    body: &ChatCompletionRequest,
-) -> std::borrow::Cow<'_, ChatCompletionRequest> {
-    match &body.tool_choice {
-        Some(ToolChoice::AllowedTools { tools: allowed, .. }) if body.tools.is_some() => {
-            let mut filtered_body = body.clone();
-            let all_tools = filtered_body.tools.as_ref().unwrap();
+/// Narrow `tools` to the subset selected by `tool_choice` (generic helper)
+///
+/// Returns `Some(subset)` for `AllowedTools` / `Function` choices (may be empty), otherwise None.
+/// Used by both Chat API and Responses API (Harmony) for constraint generation.
+pub fn filter_tools_by_tool_choice(
+    tools: &[Tool],
+    tool_choice: &Option<ToolChoice>,
+) -> Option<Vec<Tool>> {
+    match tool_choice {
+        Some(ToolChoice::AllowedTools { tools: allowed, .. }) => {
            let allowed_names: std::collections::HashSet<&str> =
-                allowed.iter().map(|t| t.name.as_str()).collect();
-            let filtered_tools: Vec<Tool> = all_tools
+                allowed.iter().filter_map(|t| t.function_name()).collect();
+            let filtered: Vec<Tool> = tools
                .iter()
                .filter(|t| allowed_names.contains(t.function.name.as_str()))
                .cloned()
                .collect();
-            filtered_body.tools = Some(filtered_tools);
-            std::borrow::Cow::Owned(filtered_body)
+            Some(filtered)
        }
-        Some(ToolChoice::Function { function, .. }) if body.tools.is_some() => {
-            let mut filtered_body = body.clone();
-            let all_tools = filtered_body.tools.as_ref().unwrap();
-            let filtered_tools: Vec<Tool> = all_tools
+        Some(ToolChoice::Function { function, .. }) => {
+            let filtered: Vec<Tool> = tools
                .iter()
                .filter(|t| t.function.name == function.name)
                .cloned()
                .collect();
+            Some(filtered)
+        }
+        _ => None, // Every other choice, or no choice at all: no filtering
+    }
+}
+
+/// Filter ChatCompletionRequest by tool_choice
+///
+/// Returns `Cow::Borrowed(body)` when `body.tools` is `None` or `tool_choice` needs no filtering,
+/// otherwise returns `Cow::Owned` holding a clone of the request with only the selected tools.
+///
+/// Note: Tool existence is validated earlier in ChatCompletionRequest::validate(),
+/// so this function assumes tool_choice references valid tools.
+pub fn filter_chat_request_by_tool_choice(
+    body: &ChatCompletionRequest,
+) -> std::borrow::Cow<'_, ChatCompletionRequest> {
+    if let Some(tools) = &body.tools {
+        if let Some(filtered_tools) = filter_tools_by_tool_choice(tools, &body.tool_choice) {
+            let mut filtered_body = body.clone();
            filtered_body.tools = Some(filtered_tools);
-            std::borrow::Cow::Owned(filtered_body)
+            return std::borrow::Cow::Owned(filtered_body);
        }
-        _ => std::borrow::Cow::Borrowed(body), // No filtering needed, use original
    }
+
+    // No tools on the request, or tool_choice does not restrict them - borrow the original
+    std::borrow::Cow::Borrowed(body)
 }

 /// Process chat messages and apply template (shared by both routers)
@@ -438,8 +456,6 @@ pub fn create_stop_decoder(
    skip_special_tokens: bool,
    no_stop_trim: bool,
 ) -> StopSequenceDecoder {
-    use crate::tokenizer::stop::StopSequenceDecoderBuilder;
-
    // Extract stop sequences
    let stop_sequences: Vec<String> = match stop {
        Some(StringOrArray::String(s)) => vec![s.clone()],

--- a/sgl-router/tests/spec/chat_completion.rs
+++ b/sgl-router/tests/spec/chat_completion.rs
@@ -349,8 +349,7 @@ fn test_tool_choice_allowed_tools_invalid_mode() {
        }]),
        tool_choice: Some(ToolChoice::AllowedTools {
            mode: "invalid_mode".to_string(),
-            tools: vec![ToolReference {
-                tool_type: "function".to_string(),
+            tools: vec![ToolReference::Function {
                name: "get_weather".to_string(),
            }],
            tool_type: "function".to_string(),
@@ -387,8 +386,7 @@ fn test_tool_choice_allowed_tools_valid_mode_auto() {
        }]),
        tool_choice: Some(ToolChoice::AllowedTools {
            mode: "auto".to_string(),
-            tools: vec![ToolReference {
-                tool_type: "function".to_string(),
+            tools: vec![ToolReference::Function {
                name: "get_weather".to_string(),
            }],
            tool_type: "function".to_string(),
@@ -419,8 +417,7 @@ fn test_tool_choice_allowed_tools_valid_mode_required() {
        }]),
        tool_choice: Some(ToolChoice::AllowedTools {
            mode: "required".to_string(),
-            tools: vec![ToolReference {
-                tool_type: "function".to_string(),
+            tools: vec![ToolReference::Function {
                name: "get_weather".to_string(),
            }],
            tool_type: "function".to_string(),
@@ -451,8 +448,7 @@ fn test_tool_choice_allowed_tools_tool_not_found() {
        }]),
        tool_choice: Some(ToolChoice::AllowedTools {
            mode: "auto".to_string(),
-            tools: vec![ToolReference {
-                tool_type: "function".to_string(),
+            tools: vec![ToolReference::Function {
                name: "nonexistent_tool".to_string(),
            }],
            tool_type: "function".to_string(),
@@ -501,12 +497,10 @@ fn test_tool_choice_allowed_tools_multiple_tools_valid() {
        tool_choice: Some(ToolChoice::AllowedTools {
            mode: "auto".to_string(),
            tools: vec![
-                ToolReference {
-                    tool_type: "function".to_string(),
+                ToolReference::Function {
                    name: "get_weather".to_string(),
                },
-                ToolReference {
-                    tool_type: "function".to_string(),
+                ToolReference::Function {
                    name: "get_time".to_string(),
                },
            ],
@@ -550,12 +544,10 @@ fn test_tool_choice_allowed_tools_one_invalid_among_valid() {
        tool_choice: Some(ToolChoice::AllowedTools {
            mode: "auto".to_string(),
            tools: vec![
-                ToolReference {
-                    tool_type: "function".to_string(),
+                ToolReference::Function {
                    name: "get_weather".to_string(),
                },
-                ToolReference {
-                    tool_type: "function".to_string(),
+                ToolReference::Function {
                    name: "nonexistent_tool".to_string(),
                },
            ],