[router][grpc] Fix model validation, tool call check, streaming logic and misc...

[router][grpc] Fix model validation, tool call check, streaming logic and misc in responses (#12616)

[router][grpc] Fix model validation, tool call check, streaming logic and misc...
[router][grpc] Fix model validation, tool call check, streaming logic and misc in responses (#12616)
0e82fd3d · Chang Su · GitHub · b7d70411 · 0e82fd3d · 0e82fd3d
Unverified Commit 0e82fd3d authored Nov 04, 2025 by Chang Su Committed by GitHub Nov 04, 2025
11 changed files
--- a/sgl-router/src/routers/grpc/common/responses/mod.rs
+++ b/sgl-router/src/routers/grpc/common/responses/mod.rs
@@ -2,6 +2,8 @@

 pub mod handlers;
 pub mod streaming;
+pub mod utils;

 pub use handlers::{cancel_response_impl, get_response_impl};
-pub use streaming::{OutputItemType, ResponseStreamEventEmitter};
+pub use streaming::{build_sse_response, OutputItemType, ResponseStreamEventEmitter};
+pub use utils::ensure_mcp_connection;
--- a/sgl-router/src/routers/grpc/common/responses/streaming.rs
+++ b/sgl-router/src/routers/grpc/common/responses/streaming.rs
@@ -2,9 +2,11 @@

 use std::collections::HashMap;

+use axum::{body::Body, http::StatusCode, response::Response};
 use bytes::Bytes;
 use serde_json::json;
 use tokio::sync::mpsc;
+use tokio_stream::wrappers::UnboundedReceiverStream;
 use uuid::Uuid;

 use crate::{mcp, protocols::chat::ChatCompletionStreamResponse};
@@ -13,6 +15,7 @@ pub enum OutputItemType {
    Message,
    McpListTools,
    McpCall,
+    FunctionCall,
    Reasoning,
 }

@@ -342,6 +345,40 @@ impl ResponseStreamEventEmitter {
        })
    }

+    // ========================================================================
+    // Function Call Event Emission Methods
+    // ========================================================================
+
+    /// Build a `response.function_call_arguments.delta` event.
+    ///
+    /// Returns the event as a JSON value carrying `output_index`, `item_id`, the
+    /// argument fragment `delta`, and a `sequence_number` taken from
+    /// `self.next_sequence()`. The event is only constructed here; sending it is
+    /// left to the caller.
+    pub fn emit_function_call_arguments_delta(
+        &mut self,
+        output_index: usize,
+        item_id: &str,
+        delta: &str,
+    ) -> serde_json::Value {
+        json!({
+            "type": "response.function_call_arguments.delta",
+            "sequence_number": self.next_sequence(),
+            "output_index": output_index,
+            "item_id": item_id,
+            "delta": delta
+        })
+    }
+
+    /// Build a `response.function_call_arguments.done` event.
+    ///
+    /// Returns the event as a JSON value carrying `output_index`, `item_id`, the
+    /// `arguments` string, and a `sequence_number` taken from
+    /// `self.next_sequence()`. The event is only constructed here; sending it is
+    /// left to the caller.
+    pub fn emit_function_call_arguments_done(
+        &mut self,
+        output_index: usize,
+        item_id: &str,
+        arguments: &str,
+    ) -> serde_json::Value {
+        json!({
+            "type": "response.function_call_arguments.done",
+            "sequence_number": self.next_sequence(),
+            "output_index": output_index,
+            "item_id": item_id,
+            "arguments": arguments
+        })
+    }
+
    // ========================================================================
    // Output Item Wrapper Events
    // ========================================================================
@@ -387,6 +424,7 @@ impl ResponseStreamEventEmitter {
        let id_prefix = match &item_type {
            OutputItemType::McpListTools => "mcpl",
            OutputItemType::McpCall => "mcp",
+            OutputItemType::FunctionCall => "fc",
            OutputItemType::Message => "msg",
            OutputItemType::Reasoning => "rs",
        };
@@ -582,4 +620,40 @@ impl ResponseStreamEventEmitter {
            }
        }
    }
+
+    /// Emit an error event
+    ///
+    /// Builds an event of type `"error"` (with `code`, `message`, a null `param`
+    /// and a `sequence_number` from `self.next_sequence()`) and sends it on `tx`
+    /// as a single SSE `data:` frame.
+    /// Uses OpenAI's error event format.
+    /// Use this for terminal errors that should abort the streaming response.
+    ///
+    /// `error_code` falls back to `"internal_error"` when it is `None`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if serializing the event with `serde_json::to_string` fails.
+    pub fn emit_error(
+        &mut self,
+        error_msg: &str,
+        error_code: Option<&str>,
+        tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
+    ) {
+        let event = json!({
+            "type": "error",
+            "code": error_code.unwrap_or("internal_error"),
+            "message": error_msg,
+            "param": null,
+            "sequence_number": self.next_sequence()
+        });
+        // SSE framing: a `data:` line terminated by a blank line.
+        let sse_data = format!("data: {}\n\n", serde_json::to_string(&event).unwrap());
+        // The send result is deliberately discarded, so a failed send is not reported.
+        let _ = tx.send(Ok(Bytes::from(sse_data)));
+    }
+}
+
+/// Build a Server-Sent Events (SSE) response
+///
+/// Creates a Response with proper SSE headers and streaming body.
+///
+/// The receiver `rx` is wrapped in an `UnboundedReceiverStream` and used as the
+/// response body. The response has status 200 OK and the headers
+/// `Content-Type: text/event-stream`, `Cache-Control: no-cache` and
+/// `Connection: keep-alive`.
+///
+/// # Panics
+///
+/// Panics if the response builder returns an error.
+pub fn build_sse_response(rx: mpsc::UnboundedReceiver<Result<Bytes, std::io::Error>>) -> Response {
+    let stream = UnboundedReceiverStream::new(rx);
+    Response::builder()
+        .status(StatusCode::OK)
+        .header("Content-Type", "text/event-stream")
+        .header("Cache-Control", "no-cache")
+        .header("Connection", "keep-alive")
+        .body(Body::from_stream(stream))
+        .unwrap()
 }
--- a/sgl-router/src/routers/grpc/common/responses/utils.rs
+++ b/sgl-router/src/routers/grpc/common/responses/utils.rs
+//! Utility functions for /v1/responses endpoint
+
+use std::sync::Arc;
+
+use axum::{
+    http::StatusCode,
+    response::{IntoResponse, Response},
+};
+use serde_json::json;
+
+use crate::{
+    core::WorkerRegistry,
+    mcp::McpManager,
+    protocols::responses::{ResponseTool, ResponseToolType},
+    routers::{grpc::error, openai::mcp::ensure_request_mcp_client},
+};
+
+/// Ensure MCP connection succeeds if MCP tools are declared
+///
+/// Checks if request declares MCP tools, and if so, validates that
+/// the MCP client can be created and connected.
+///
+/// # Returns
+///
+/// * `Ok(true)` - at least one tool has type `ResponseToolType::Mcp` and
+///   `ensure_request_mcp_client` returned a client.
+/// * `Ok(false)` - `tools` is `None` or contains no MCP tool; no connection is attempted.
+/// * `Err(response)` - MCP tools are declared but `ensure_request_mcp_client`
+///   returned `None`; the error is the response built by `error::failed_dependency`.
+pub async fn ensure_mcp_connection(
+    mcp_manager: &Arc<McpManager>,
+    tools: Option<&[ResponseTool]>,
+) -> Result<bool, Response> {
+    // A missing tool list counts as "no MCP tools".
+    let has_mcp_tools = tools
+        .map(|t| {
+            t.iter()
+                .any(|tool| matches!(tool.r#type, ResponseToolType::Mcp))
+        })
+        .unwrap_or(false);
+
+    if has_mcp_tools {
+        // `has_mcp_tools` can only be true when `tools` is `Some`, so this
+        // `if let` just unwraps the slice.
+        if let Some(tools) = tools {
+            if ensure_request_mcp_client(mcp_manager, tools)
+                .await
+                .is_none()
+            {
+                return Err(error::failed_dependency(
+                    "Failed to connect to MCP server. Check server_url and authorization.",
+                ));
+            }
+        }
+    }
+
+    Ok(has_mcp_tools)
+}
+
+/// Validate that workers are available for the requested model
+///
+/// Looks `model` up in the list returned by `worker_registry.get_models()`.
+///
+/// # Returns
+///
+/// * `None` - `model` is in the list.
+/// * `Some(response)` - `model` is not in the list; the response is a
+///   503 Service Unavailable whose JSON error body names the requested model and
+///   lists the available models.
+pub fn validate_worker_availability(
+    worker_registry: &Arc<WorkerRegistry>,
+    model: &str,
+) -> Option<Response> {
+    let available_models = worker_registry.get_models();
+
+    // `to_string` allocates a temporary `String` from `model` for the `contains` comparison.
+    if !available_models.contains(&model.to_string()) {
+        return Some(
+            (
+                StatusCode::SERVICE_UNAVAILABLE,
+                axum::Json(json!({
+                    "error": {
+                        "message": format!(
+                            "No workers available for model '{}'. Available models: {}",
+                            model,
+                            available_models.join(", ")
+                        ),
+                        "type": "service_unavailable",
+                        "param": "model",
+                        "code": "no_available_workers"
+                    }
+                })),
+            )
+                .into_response(),
+        );
+    }
+
+    None
+}
--- a/sgl-router/src/routers/grpc/error.rs
+++ b/sgl-router/src/routers/grpc/error.rs
@@ -107,6 +107,30 @@ pub fn service_unavailable(message: impl Into<String>) -> Response {
        .into_response()
 }

+/// Create a 424 Failed Dependency response
+///
+/// Use this when an external dependency (like MCP server) fails.
+///
+/// The message is logged with `warn!` before the response is built. The JSON
+/// body has the shape `{"error": {"message", "type", "code"}}`, with `type` set
+/// to `"external_connector_error"` and `code` set to the number `424`.
+///
+/// # Example
+/// ```ignore
+/// return Err(failed_dependency("Failed to connect to MCP server"));
+/// ```
+pub fn failed_dependency(message: impl Into<String>) -> Response {
+    let msg = message.into();
+    warn!("{}", msg);
+    (
+        StatusCode::FAILED_DEPENDENCY,
+        Json(json!({
+            "error": {
+                "message": msg,
+                "type": "external_connector_error",
+                // Emitted as a JSON number, not a string.
+                "code": 424
+            }
+        })),
+    )
+        .into_response()
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;

--- a/sgl-router/src/routers/grpc/harmony/processor.rs
+++ b/sgl-router/src/routers/grpc/harmony/processor.rs
@@ -124,8 +124,10 @@ pub enum ResponsesIterationResult {
    /// Tool calls found in commentary channel - continue MCP loop
    ToolCallsFound {
        tool_calls: Vec<ToolCall>,
-        analysis: Option<String>, // For streaming emission
-        partial_text: String,     // For streaming emission
+        analysis: Option<String>, // For streaming emission or reasoning output
+        partial_text: String,     // For streaming emission or message output
+        usage: Usage,             // Token usage from this iteration
+        request_id: String,       // Request ID from dispatch
    },
    /// No tool calls - return final ResponsesResponse
    Completed {
@@ -206,6 +208,9 @@ impl HarmonyResponseProcessor {
            );
        }

+        // Build usage (needed for both ToolCallsFound and Completed)
+        let usage = response_formatting::build_usage(std::slice::from_ref(complete));
+
        // Check for tool calls in commentary channel
        if let Some(tool_calls) = parsed.commentary {
            // Tool calls found - return for MCP loop execution
@@ -213,6 +218,8 @@ impl HarmonyResponseProcessor {
                tool_calls,
                analysis: parsed.analysis,
                partial_text: parsed.final_text,
+                usage,
+                request_id: dispatch.request_id.clone(),
            });
        }

@@ -245,9 +252,6 @@ impl HarmonyResponseProcessor {
            output.push(message_item);
        }

-        // Build usage
-        let usage = response_formatting::build_usage(std::slice::from_ref(complete));
-
        // Build ResponsesResponse with all required fields
        let response = ResponsesResponse {
            id: dispatch.request_id.clone(),

--- a/sgl-router/src/routers/grpc/harmony/responses.rs
+++ b/sgl-router/src/routers/grpc/harmony/responses.rs
@@ -37,16 +37,14 @@
 //! for complete architecture, rationale, and implementation details.

 use std::{
-    io,
    sync::Arc,
    time::{SystemTime, UNIX_EPOCH},
 };

-use axum::{body::Body, http::StatusCode, response::Response};
+use axum::response::Response;
 use bytes::Bytes;
 use serde_json::{from_str, from_value, json, to_string, to_value, Value};
 use tokio::sync::mpsc;
-use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, warn};
 use uuid::Uuid;

@@ -54,22 +52,23 @@ use crate::{
    data_connector::{ResponseId, ResponseStorage},
    mcp::{self, McpManager},
    protocols::{
-        common::{Function, ToolCall},
+        common::{Function, ToolCall, Usage},
        responses::{
            McpToolInfo, ResponseContentPart, ResponseInput, ResponseInputOutputItem,
-            ResponseOutputItem, ResponseReasoningContent, ResponseTool, ResponseToolType,
-            ResponsesRequest, ResponsesResponse, StringOrContentParts,
+            ResponseOutputItem, ResponseReasoningContent, ResponseStatus, ResponseTool,
+            ResponseToolType, ResponseUsage, ResponsesRequest, ResponsesResponse, ResponsesUsage,
+            StringOrContentParts,
        },
    },
-    routers::{
-        grpc::{
-            common::responses::streaming::{OutputItemType, ResponseStreamEventEmitter},
-            context::SharedComponents,
-            error,
-            harmony::processor::ResponsesIterationResult,
-            pipeline::RequestPipeline,
+    routers::grpc::{
+        common::responses::{
+            build_sse_response, ensure_mcp_connection,
+            streaming::{OutputItemType, ResponseStreamEventEmitter},
        },
-        openai::mcp::ensure_request_mcp_client,
+        context::SharedComponents,
+        error,
+        harmony::{processor::ResponsesIterationResult, streaming::HarmonyStreamingProcessor},
+        pipeline::RequestPipeline,
    },
 };

@@ -239,48 +238,47 @@ pub async fn serve_harmony_responses(
    request: ResponsesRequest,
 ) -> Result<ResponsesResponse, Response> {
    // Load previous conversation history if previous_response_id is set
-    let mut current_request = load_previous_messages(ctx, request).await?;
-    let mut iteration_count = 0;
+    let current_request = load_previous_messages(ctx, request).await?;

-    let has_mcp_tools = current_request
-        .tools
-        .as_ref()
-        .map(|tools| {
-            tools
-                .iter()
-                .any(|t| matches!(t.r#type, ResponseToolType::Mcp))
-        })
-        .unwrap_or(false);
+    // Check MCP connection and get whether MCP tools are present
+    let has_mcp_tools =
+        ensure_mcp_connection(&ctx.mcp_manager, current_request.tools.as_deref()).await?;

-    // Initialize MCP call tracking (will be passed to processor for final response)
-    let mut mcp_tracking = if has_mcp_tools {
-        Some(McpCallTracking::new("sglang-mcp".to_string()))
+    if has_mcp_tools {
+        execute_with_mcp_loop(ctx, current_request).await
    } else {
-        None
-    };
+        // No MCP tools - execute pipeline once (may have function tools or no tools)
+        execute_without_mcp_loop(ctx, current_request).await
+    }
+}

-    if has_mcp_tools {
-        // Ensure dynamic MCP client is registered for request-scoped tools
-        if let Some(tools) = &current_request.tools {
-            ensure_request_mcp_client(&ctx.mcp_manager, tools).await;
-        }
+/// Execute Harmony Responses with MCP tool loop
+///
+/// Automatically executes MCP tools in a loop until no more tool calls or max iterations
+async fn execute_with_mcp_loop(
+    ctx: &HarmonyResponsesContext,
+    mut current_request: ResponsesRequest,
+) -> Result<ResponsesResponse, Response> {
+    let mut iteration_count = 0;
+    let mut mcp_tracking = McpCallTracking::new("sglang-mcp".to_string());

-        // Add static MCP tools from inventory to the request
-        // (similar to non-Harmony pipeline pattern)
-        let mcp_tools = ctx.mcp_manager.list_tools();
-        if !mcp_tools.is_empty() {
-            let mcp_response_tools = convert_mcp_tools_to_response_tools(&mcp_tools);
+    // Extract user's max_tool_calls limit (if set)
+    let max_tool_calls = current_request.max_tool_calls.map(|n| n as usize);

-            let mut all_tools = current_request.tools.clone().unwrap_or_default();
-            all_tools.extend(mcp_response_tools);
-            current_request.tools = Some(all_tools);
+    // Add static MCP tools from inventory to the request
+    let mcp_tools = ctx.mcp_manager.list_tools();
+    if !mcp_tools.is_empty() {
+        let mcp_response_tools = convert_mcp_tools_to_response_tools(&mcp_tools);

-            debug!(
-                mcp_tool_count = mcp_tools.len(),
-                total_tool_count = current_request.tools.as_ref().map(|t| t.len()).unwrap_or(0),
-                "Request has MCP tools - added static MCP tools to Harmony Responses request"
-            );
-        }
+        let mut all_tools = current_request.tools.clone().unwrap_or_default();
+        all_tools.extend(mcp_response_tools);
+        current_request.tools = Some(all_tools);
+
+        debug!(
+            mcp_tool_count = mcp_tools.len(),
+            total_tool_count = current_request.tools.as_ref().map(|t| t.len()).unwrap_or(0),
+            "MCP client available - added static MCP tools to Harmony Responses request"
+        );
    }

    loop {
@@ -317,30 +315,60 @@ pub async fn serve_harmony_responses(
                tool_calls,
                analysis,
                partial_text,
+                usage,
+                request_id,
            } => {
                debug!(
                    tool_call_count = tool_calls.len(),
                    has_analysis = analysis.is_some(),
                    partial_text_len = partial_text.len(),
-                    "Tool calls found in commentary channel"
+                    "Tool calls found - checking limits before executing MCP tools"
                );

-                // TODO: Streaming support - emit intermediate chunks
-                // if let Some(tx) = &ctx.stream_tx {
-                //     emit_intermediate_chunks(tx, &analysis, &partial_text, iteration_count).await?;
-                // }
-
-                // Execute MCP tools via MCP manager
-                // If tools don't exist, call_tool() will return error naturally
-                let tool_results = if let Some(ref mut tracking) = mcp_tracking {
-                    execute_mcp_tools(&ctx.mcp_manager, &tool_calls, tracking).await?
-                } else {
-                    // Should never happen (we only get tool_calls when has_mcp_tools=true)
-                    return Err(error::internal_error(
-                        "Tool calls found but MCP tracking not initialized",
-                    ));
+                // Check combined limit (user's max_tool_calls vs safety limit)
+                let effective_limit = match max_tool_calls {
+                    Some(user_max) => user_max.min(MAX_TOOL_ITERATIONS),
+                    None => MAX_TOOL_ITERATIONS,
                };

+                // Check if we would exceed the limit with these new tool calls
+                let total_calls_after = mcp_tracking.total_calls() + tool_calls.len();
+                if total_calls_after > effective_limit {
+                    warn!(
+                        current_calls = mcp_tracking.total_calls(),
+                        new_calls = tool_calls.len(),
+                        total_after = total_calls_after,
+                        effective_limit = effective_limit,
+                        user_max = ?max_tool_calls,
+                        "Reached tool call limit - returning incomplete response"
+                    );
+
+                    // Build response with incomplete status
+                    let mut response = build_function_tool_response(
+                        tool_calls,
+                        analysis,
+                        partial_text,
+                        usage,
+                        request_id,
+                        Arc::new(current_request),
+                    );
+
+                    // Mark as completed with incomplete_details
+                    response.status = ResponseStatus::Completed;
+                    response.incomplete_details = Some(json!({ "reason": "max_tool_calls" }));
+
+                    // Inject MCP metadata if any calls were executed
+                    if mcp_tracking.total_calls() > 0 {
+                        inject_mcp_metadata(&mut response, &mcp_tracking, &ctx.mcp_manager);
+                    }
+
+                    return Ok(response);
+                }
+
+                // Execute MCP tools
+                let tool_results =
+                    execute_mcp_tools(&ctx.mcp_manager, &tool_calls, &mut mcp_tracking).await?;
+
                // Build next request with appended history
                current_request = build_next_request_with_tools(
                    current_request,
@@ -361,30 +389,71 @@ pub async fn serve_harmony_responses(
                    output_items = response.output.len(),
                    input_tokens = usage.prompt_tokens,
                    output_tokens = usage.completion_tokens,
-                    has_mcp_tracking = mcp_tracking.is_some(),
-                    "Harmony Responses serving completed - no more tool calls"
+                    "MCP loop completed - no more tool calls"
                );

-                // Inject MCP output items if MCP tools were available
-                // (even if no tools were called, we still list available tools)
-                if let Some(tracking) = mcp_tracking {
-                    inject_mcp_metadata(&mut response, &tracking, &ctx.mcp_manager);
+                // Inject MCP metadata into final response
+                inject_mcp_metadata(&mut response, &mcp_tracking, &ctx.mcp_manager);

-                    debug!(
-                        mcp_calls = tracking.total_calls(),
-                        output_items_after = response.output.len(),
-                        "Injected MCP metadata into final response"
-                    );
-                }
+                debug!(
+                    mcp_calls = mcp_tracking.total_calls(),
+                    output_items_after = response.output.len(),
+                    "Injected MCP metadata into final response"
+                );

                // No tool calls - this is the final response
-                // TODO: Accumulate usage across all iterations if needed
                return Ok(*response);
            }
        }
    }
 }

+/// Execute Harmony Responses without MCP loop (single execution)
+///
+/// For function tools or no tools - executes pipeline once and returns
+///
+/// # Returns
+///
+/// * On `ResponsesIterationResult::ToolCallsFound`, the response produced by
+///   `build_function_tool_response`; the tool calls are not executed here.
+/// * On `ResponsesIterationResult::Completed`, the unboxed response.
+///
+/// # Errors
+///
+/// Propagates the `Response` error returned by
+/// `ctx.pipeline.execute_harmony_responses`.
+async fn execute_without_mcp_loop(
+    ctx: &HarmonyResponsesContext,
+    current_request: ResponsesRequest,
+) -> Result<ResponsesResponse, Response> {
+    debug!("Executing Harmony Responses without MCP loop");
+
+    // Execute pipeline once
+    let iteration_result = ctx
+        .pipeline
+        .execute_harmony_responses(&current_request, ctx)
+        .await?;
+
+    match iteration_result {
+        ResponsesIterationResult::ToolCallsFound {
+            tool_calls,
+            analysis,
+            partial_text,
+            usage,
+            request_id,
+        } => {
+            // Function tool calls found - return to caller for execution
+            debug!(
+                tool_call_count = tool_calls.len(),
+                "Function tool calls found - returning to caller"
+            );
+
+            // The request is moved into an `Arc` here; it is not used again afterwards.
+            Ok(build_function_tool_response(
+                tool_calls,
+                analysis,
+                partial_text,
+                usage,
+                request_id,
+                Arc::new(current_request),
+            ))
+        }
+        // The separate `usage` field of `Completed` is ignored in this path.
+        ResponsesIterationResult::Completed { response, usage: _ } => {
+            // No tool calls - return completed response
+            debug!("No tool calls - returning completed response");
+            Ok(*response)
+        }
+    }
+}
+
 /// Serve Harmony Responses API with streaming (SSE)
 ///
 /// This is the streaming equivalent of `serve_harmony_responses()`.
@@ -412,14 +481,20 @@ pub async fn serve_harmony_responses_stream(
    request: ResponsesRequest,
 ) -> Response {
    // Load previous conversation history if previous_response_id is set
-    let mut current_request = match load_previous_messages(ctx, request).await {
+    let current_request = match load_previous_messages(ctx, request).await {
        Ok(req) => req,
        Err(err_response) => return err_response,
    };

+    // Check MCP connection BEFORE starting stream and get whether MCP tools are present
+    let has_mcp_tools =
+        match ensure_mcp_connection(&ctx.mcp_manager, current_request.tools.as_deref()).await {
+            Ok(has_mcp) => has_mcp,
+            Err(response) => return response,
+        };
+
    // Create SSE channel
    let (tx, rx) = mpsc::unbounded_channel();
-    let stream = UnboundedReceiverStream::new(rx);

    // Create response event emitter
    let response_id = format!("resp_{}", Uuid::new_v4());
@@ -437,24 +512,6 @@ pub async fn serve_harmony_responses_stream(
    tokio::spawn(async move {
        let ctx = &ctx_clone;

-        // Clone response_id for closure to avoid borrow conflicts
-        let response_id_for_error = response_id.clone();
-
-        // Helper to emit error and return
-        let emit_error = |tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>, error_msg: &str| {
-            // Create error event manually since emit_failed doesn't exist
-            let event = json!({
-                "type": "response.failed",
-                "response_id": response_id_for_error,
-                "error": {
-                    "message": error_msg,
-                    "type": "internal_error"
-                }
-            });
-            let sse_data = format!("data: {}\n\n", to_string(&event).unwrap());
-            let _ = tx.send(Ok(Bytes::from(sse_data)));
-        };
-
        // Emit initial response.created and response.in_progress events
        let event = emitter.emit_created();
        if emitter.send_event(&event, &tx).is_err() {
@@ -465,240 +522,413 @@ pub async fn serve_harmony_responses_stream(
            return;
        }

-        // Check if request has MCP tools
-        let has_mcp_tools = current_request
-            .tools
-            .as_ref()
-            .map(|tools| {
-                tools
-                    .iter()
-                    .any(|t| matches!(t.r#type, ResponseToolType::Mcp))
-            })
-            .unwrap_or(false);
-
-        // Initialize MCP call tracking
-        let mut mcp_tracking = if has_mcp_tools {
-            Some(McpCallTracking::new("sglang-mcp".to_string()))
+        if has_mcp_tools {
+            execute_mcp_tool_loop_streaming(ctx, current_request, &mut emitter, &tx).await;
        } else {
-            None
-        };
+            execute_without_mcp_streaming(ctx, &current_request, &mut emitter, &tx).await;
+        }
+    });

-        // Setup MCP tools if needed
-        if has_mcp_tools {
-            // Ensure dynamic MCP client is registered
-            if let Some(tools) = &current_request.tools {
-                ensure_request_mcp_client(&ctx.mcp_manager, tools).await;
-            }
+    // Return SSE stream response
+    build_sse_response(rx)
+}

-            // Add static MCP tools from inventory
-            let mcp_tools = ctx.mcp_manager.list_tools();
-            if !mcp_tools.is_empty() {
-                let mcp_response_tools = convert_mcp_tools_to_response_tools(&mcp_tools);
-                let mut all_tools = current_request.tools.clone().unwrap_or_default();
-                all_tools.extend(mcp_response_tools);
-                current_request.tools = Some(all_tools);
+/// Execute MCP tool loop with streaming
+///
+/// Handles the full MCP workflow:
+/// - Adds static MCP tools to request
+/// - Emits mcp_list_tools events
+/// - Loops through tool execution iterations
+/// - Emits final response.completed event
+async fn execute_mcp_tool_loop_streaming(
+    ctx: &HarmonyResponsesContext,
+    mut current_request: ResponsesRequest,
+    emitter: &mut ResponseStreamEventEmitter,
+    tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
+) {
+    // Initialize MCP call tracking
+    let mut mcp_tracking = McpCallTracking::new("sglang-mcp".to_string());

-                debug!(
-                    mcp_tool_count = mcp_tools.len(),
-                    total_tool_count = current_request.tools.as_ref().map(|t| t.len()).unwrap_or(0),
-                    "Added static MCP tools to Harmony Responses streaming request"
-                );
-            }
-        }
+    // Extract user's max_tool_calls limit (if set)
+    let max_tool_calls = current_request.max_tool_calls.map(|n| n as usize);

-        // Emit mcp_list_tools on first iteration (only if MCP tools available)
-        if has_mcp_tools {
-            let mcp_tools = ctx.mcp_manager.list_tools();
-            let (output_index, item_id) =
-                emitter.allocate_output_index(OutputItemType::McpListTools);
-
-            // Build tools list for item structure
-            let tool_items: Vec<_> = mcp_tools
-                .iter()
-                .map(|t| {
-                    json!({
-                        "name": t.name,
-                        "description": t.description,
-                        "input_schema": Value::Object((*t.input_schema).clone())
-                    })
-                })
-                .collect();
-
-            // Emit output_item.added
-            let item = json!({
-                "id": item_id,
-                "type": "mcp_list_tools",
-                "server_label": "sglang-mcp",
-                "status": "in_progress",
-                "tools": []
-            });
-            let event = emitter.emit_output_item_added(output_index, &item);
-            if emitter.send_event(&event, &tx).is_err() {
-                return;
-            }
+    // Add static MCP tools from inventory
+    let mcp_tools = ctx.mcp_manager.list_tools();
+    if !mcp_tools.is_empty() {
+        let mcp_response_tools = convert_mcp_tools_to_response_tools(&mcp_tools);
+        let mut all_tools = current_request.tools.clone().unwrap_or_default();
+        all_tools.extend(mcp_response_tools);
+        current_request.tools = Some(all_tools);

-            // Emit mcp_list_tools.in_progress
-            let event = emitter.emit_mcp_list_tools_in_progress(output_index);
-            if emitter.send_event(&event, &tx).is_err() {
-                return;
-            }
+        debug!(
+            mcp_tool_count = mcp_tools.len(),
+            total_tool_count = current_request.tools.as_ref().map(|t| t.len()).unwrap_or(0),
+            "MCP client available - added static MCP tools to Harmony Responses streaming request"
+        );
+    }

-            // Emit mcp_list_tools.completed
-            let event = emitter.emit_mcp_list_tools_completed(output_index, &mcp_tools);
-            if emitter.send_event(&event, &tx).is_err() {
-                return;
-            }
+    // Emit mcp_list_tools on first iteration
+    let (output_index, item_id) = emitter.allocate_output_index(OutputItemType::McpListTools);

-            // Emit output_item.done
-            let item_done = json!({
-                "id": item_id,
-                "type": "mcp_list_tools",
-                "server_label": "sglang-mcp",
-                "status": "completed",
-                "tools": tool_items
-            });
-            let event = emitter.emit_output_item_done(output_index, &item_done);
-            if emitter.send_event(&event, &tx).is_err() {
-                return;
-            }
+    // Build tools list for item structure
+    let tool_items: Vec<_> = mcp_tools
+        .iter()
+        .map(|t| {
+            json!({
+                "name": t.name,
+                "description": t.description,
+                "input_schema": Value::Object((*t.input_schema).clone())
+            })
+        })
+        .collect();

-            emitter.complete_output_item(output_index);
+    // Emit output_item.added
+    let item = json!({
+        "id": item_id,
+        "type": "mcp_list_tools",
+        "server_label": "sglang-mcp",
+        "status": "in_progress",
+        "tools": []
+    });
+    let event = emitter.emit_output_item_added(output_index, &item);
+    if emitter.send_event(&event, tx).is_err() {
+        return;
+    }

-            debug!(
-                tool_count = mcp_tools.len(),
-                "Emitted mcp_list_tools on first iteration"
+    // Emit mcp_list_tools.in_progress
+    let event = emitter.emit_mcp_list_tools_in_progress(output_index);
+    if emitter.send_event(&event, tx).is_err() {
+        return;
+    }
+
+    // Emit mcp_list_tools.completed
+    let event = emitter.emit_mcp_list_tools_completed(output_index, &mcp_tools);
+    if emitter.send_event(&event, tx).is_err() {
+        return;
+    }
+
+    // Emit output_item.done
+    let item_done = json!({
+        "id": item_id,
+        "type": "mcp_list_tools",
+        "server_label": "sglang-mcp",
+        "status": "completed",
+        "tools": tool_items
+    });
+    let event = emitter.emit_output_item_done(output_index, &item_done);
+    if emitter.send_event(&event, tx).is_err() {
+        return;
+    }
+
+    emitter.complete_output_item(output_index);
+
+    debug!(
+        tool_count = mcp_tools.len(),
+        "Emitted mcp_list_tools on first iteration"
+    );
+
+    // MCP tool loop (max 10 iterations)
+    let mut iteration_count = 0;
+    loop {
+        iteration_count += 1;
+
+        // Safety check: prevent infinite loops
+        if iteration_count > MAX_TOOL_ITERATIONS {
+            emitter.emit_error(
+                &format!("Maximum tool iterations ({}) exceeded", MAX_TOOL_ITERATIONS),
+                Some("max_iterations_exceeded"),
+                tx,
            );
+            return;
        }

-        // Tool loop (max 10 iterations)
-        let mut iteration_count = 0;
-        loop {
-            iteration_count += 1;
+        debug!(
+            iteration = iteration_count,
+            "Harmony Responses streaming iteration"
+        );

-            // Safety check: prevent infinite loops
-            if iteration_count > MAX_TOOL_ITERATIONS {
-                let error_msg =
-                    format!("Maximum tool iterations ({}) exceeded", MAX_TOOL_ITERATIONS);
-                emit_error(&tx, &error_msg);
+        // Execute pipeline and get stream
+        let execution_result = match ctx
+            .pipeline
+            .execute_harmony_responses_streaming(&current_request, ctx)
+            .await
+        {
+            Ok(result) => result,
+            Err(err_response) => {
+                emitter.emit_error(
+                    &format!("Pipeline execution failed: {:?}", err_response),
+                    Some("pipeline_error"),
+                    tx,
+                );
                return;
            }
+        };

-            debug!(
-                iteration = iteration_count,
-                "Harmony Responses streaming iteration"
-            );
-
-            // Execute through pipeline and get raw stream
-            let execution_result = match ctx
-                .pipeline
-                .execute_harmony_responses_streaming(&current_request, ctx)
-                .await
-            {
-                Ok(result) => result,
-                Err(err_response) => {
-                    let error_msg = format!("Pipeline execution failed: {:?}", err_response);
-                    emit_error(&tx, &error_msg);
-                    return;
-                }
-            };
-
-            // Process stream with token-level streaming using HarmonyStreamingProcessor
-            let iteration_result = match super::streaming::HarmonyStreamingProcessor::process_responses_iteration_stream(
+        // Process stream with token-level streaming (MCP path - emits mcp_call.* events)
+        let iteration_result =
+            match HarmonyStreamingProcessor::process_responses_iteration_stream_mcp(
                execution_result,
-                &mut emitter,
-                &tx,
+                emitter,
+                tx,
            )
            .await
            {
                Ok(result) => result,
                Err(err_msg) => {
-                    emit_error(&tx, &err_msg);
+                    emitter.emit_error(&err_msg, Some("processing_error"), tx);
                    return;
                }
            };

-            // Handle iteration result (tool calls or completion)
-            match iteration_result {
-                ResponsesIterationResult::ToolCallsFound {
-                    tool_calls,
-                    analysis,
-                    partial_text,
-                } => {
-                    debug!(
-                        tool_call_count = tool_calls.len(),
-                        has_analysis = analysis.is_some(),
-                        partial_text_len = partial_text.len(),
-                        "Tool calls found in commentary channel"
-                    );
-
-                    // Execute MCP tools
-                    let tool_results = if let Some(ref mut tracking) = mcp_tracking {
-                        match execute_mcp_tools(&ctx.mcp_manager, &tool_calls, tracking).await {
-                            Ok(results) => results,
-                            Err(err_response) => {
-                                let error_msg =
-                                    format!("MCP tool execution failed: {:?}", err_response);
-                                emit_error(&tx, &error_msg);
-                                return;
-                            }
-                        }
-                    } else {
-                        let error_msg = "Tool calls found but MCP tracking not initialized";
-                        emit_error(&tx, error_msg);
-                        return;
-                    };
+        // Handle iteration result (tool calls or completion)
+        match iteration_result {
+            ResponsesIterationResult::ToolCallsFound {
+                tool_calls,
+                analysis,
+                partial_text,
+                usage,
+                request_id: _,
+            } => {
+                debug!(
+                    tool_call_count = tool_calls.len(),
+                    has_analysis = analysis.is_some(),
+                    partial_text_len = partial_text.len(),
+                    "MCP tool calls found in commentary channel - checking limits"
+                );

-                    // Build next request with appended history
-                    current_request = match build_next_request_with_tools(
-                        current_request,
-                        tool_calls,
-                        tool_results,
-                        analysis,
-                        partial_text,
-                    ) {
-                        Ok(req) => req,
-                        Err(e) => {
-                            let error_msg = format!("Failed to build next request: {:?}", e);
-                            emit_error(&tx, &error_msg);
-                            return;
-                        }
-                    };
+                // Check combined limit (user's max_tool_calls vs safety limit)
+                let effective_limit = match max_tool_calls {
+                    Some(user_max) => user_max.min(MAX_TOOL_ITERATIONS),
+                    None => MAX_TOOL_ITERATIONS,
+                };

-                    // Continue loop
-                }
-                ResponsesIterationResult::Completed { response, usage } => {
-                    debug!(
-                        output_items = response.output.len(),
-                        input_tokens = usage.prompt_tokens,
-                        output_tokens = usage.completion_tokens,
-                        "Harmony Responses streaming completed - no more tool calls"
+                // Check if we would exceed the limit with these new tool calls
+                let total_calls_after = mcp_tracking.total_calls() + tool_calls.len();
+                if total_calls_after > effective_limit {
+                    warn!(
+                        current_calls = mcp_tracking.total_calls(),
+                        new_calls = tool_calls.len(),
+                        total_after = total_calls_after,
+                        effective_limit = effective_limit,
+                        user_max = ?max_tool_calls,
+                        "Reached tool call limit in streaming - emitting completion with incomplete_details"
                    );

-                    // Emit response.completed with usage
+                    // Emit response.completed with incomplete_details and usage
+                    let incomplete_details = json!({ "reason": "max_tool_calls" });
                    let usage_json = json!({
                        "prompt_tokens": usage.prompt_tokens,
                        "completion_tokens": usage.completion_tokens,
                        "total_tokens": usage.total_tokens,
+                        "incomplete_details": incomplete_details,
                    });
                    let event = emitter.emit_completed(Some(&usage_json));
-                    emitter.send_event_best_effort(&event, &tx);
-
-                    // Close channel
-                    drop(tx);
+                    emitter.send_event_best_effort(&event, tx);
                    return;
                }
+
+                // Execute MCP tools and continue loop
+                let tool_results =
+                    match execute_mcp_tools(&ctx.mcp_manager, &tool_calls, &mut mcp_tracking).await
+                    {
+                        Ok(results) => results,
+                        Err(err_response) => {
+                            emitter.emit_error(
+                                &format!("MCP tool execution failed: {:?}", err_response),
+                                Some("mcp_tool_error"),
+                                tx,
+                            );
+                            return;
+                        }
+                    };
+
+                // Build next request with appended history
+                current_request = match build_next_request_with_tools(
+                    current_request,
+                    tool_calls,
+                    tool_results,
+                    analysis,
+                    partial_text,
+                ) {
+                    Ok(req) => req,
+                    Err(e) => {
+                        emitter.emit_error(
+                            &format!("Failed to build next request: {:?}", e),
+                            Some("request_building_error"),
+                            tx,
+                        );
+                        return;
+                    }
+                };
+
+                // Continue loop
+            }
+            ResponsesIterationResult::Completed { response, usage } => {
+                debug!(
+                    output_items = response.output.len(),
+                    input_tokens = usage.prompt_tokens,
+                    output_tokens = usage.completion_tokens,
+                    "Harmony Responses streaming completed - no more tool calls"
+                );
+
+                // Emit response.completed with usage
+                let usage_json = json!({
+                    "prompt_tokens": usage.prompt_tokens,
+                    "completion_tokens": usage.completion_tokens,
+                    "total_tokens": usage.total_tokens,
+                });
+                let event = emitter.emit_completed(Some(&usage_json));
+                emitter.send_event_best_effort(&event, tx);
+                return;
            }
        }
-    });
+    }
+}

-    // Return SSE stream response
-    Response::builder()
-        .status(StatusCode::OK)
-        .header("Content-Type", "text/event-stream")
-        .header("Cache-Control", "no-cache")
-        .header("Connection", "keep-alive")
-        .body(Body::from_stream(stream))
+/// Execute a single streaming pass through the pipeline, without the MCP tool loop.
+///
+/// For function tools or no tools - executes pipeline once and emits completion. If the pipeline
+/// or the stream processor fails, a `pipeline_error` / `processing_error` is reported via `emitter` and the function returns early.
+async fn execute_without_mcp_streaming(
+    ctx: &HarmonyResponsesContext,
+    current_request: &ResponsesRequest,
+    emitter: &mut ResponseStreamEventEmitter,
+    tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
+) {
+    debug!("No MCP tools - executing single iteration");
+
+    // Execute pipeline and get stream; on failure report `pipeline_error` and stop
+    let execution_result = match ctx
+        .pipeline
+        .execute_harmony_responses_streaming(current_request, ctx)
+        .await
+    {
+        Ok(result) => result,
+        Err(err_response) => {
+            emitter.emit_error(
+                &format!("Pipeline execution failed: {:?}", err_response),
+                Some("pipeline_error"),
+                tx,
+            );
+            return;
+        }
+    };
+
+    // Process stream (emits all output items during streaming - function tool path emits function_call_arguments.* events)
+    if let Err(err_msg) = HarmonyStreamingProcessor::process_responses_iteration_stream_function(
+        execution_result,
+        emitter,
+        tx,
+    )
+    .await
+    {
+        emitter.emit_error(&err_msg, Some("processing_error"), tx);
+        return;
+    }
+
+    // Emit response.completed with no usage payload - NOTE(review): the processor's Ok value is discarded above; confirm omitting usage is intended
+    let event = emitter.emit_completed(None);
+    emitter.send_event_best_effort(&event, tx);
+}
+
+/// Build a `ResponsesResponse` that hands function tool calls back to the caller for execution.
+///
+/// When tool calls are found but no MCP client is available (function tools only),
+/// this builds a response with status=Completed whose output holds, in order: an optional reasoning item (from `analysis`), an optional assistant message (from a non-empty `partial_text`), and one `FunctionToolCall` per entry of `tool_calls` without output field.
+/// The absence of output signals the caller should execute tools and resume. Request-level fields are copied from `responses_request`; token counts come from `usage`.
+///
+/// TODO: Refactor to use builder pattern
+fn build_function_tool_response(
+    tool_calls: Vec<ToolCall>,
+    analysis: Option<String>,
+    partial_text: String,
+    usage: Usage,
+    request_id: String,
+    responses_request: Arc<ResponsesRequest>,
+) -> ResponsesResponse {
+    let mut output: Vec<ResponseOutputItem> = Vec::new();
+
+    // Add reasoning output item if analysis exists
+    if let Some(analysis_text) = analysis {
+        output.push(ResponseOutputItem::Reasoning {
+            id: format!("reasoning_{}", request_id),
+            summary: vec![],
+            content: vec![ResponseReasoningContent::ReasoningText {
+                text: analysis_text,
+            }],
+            status: Some("completed".to_string()),
+        });
+    }
+
+    // Add message output item if partial text exists (an empty string produces no message item)
+    if !partial_text.is_empty() {
+        output.push(ResponseOutputItem::Message {
+            id: format!("msg_{}", request_id),
+            role: "assistant".to_string(),
+            content: vec![ResponseContentPart::OutputText {
+                text: partial_text,
+                annotations: vec![],
+                logprobs: None,
+            }],
+            status: "completed".to_string(),
+        });
+    }
+
+    // Add function tool calls as completed output items (no output field = needs execution)
+    for tool_call in tool_calls {
+        output.push(ResponseOutputItem::FunctionToolCall {
+            id: tool_call.id.clone(),
+            call_id: tool_call.id.clone(), // item id and call_id both reuse the tool call's id
+            name: tool_call.function.name.clone(),
+            arguments: tool_call.function.arguments.clone().unwrap_or_default(), // missing arguments become an empty string
+            output: None, // No output = tool needs execution by caller
+            status: "completed".to_string(),
+        });
+    }
+
+    // Build ResponsesResponse with Completed status
+    // The presence of FunctionToolCall items without output signals tool execution needed
+    let created_at = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
         .unwrap() // panics only if the system clock reads earlier than the Unix epoch
+        .as_secs() as i64;
+
+    ResponsesResponse {
+        id: request_id,
+        object: "response".to_string(),
+        created_at,
+        status: ResponseStatus::Completed,
+        error: None,
+        incomplete_details: None,
+        instructions: responses_request.instructions.clone(),
+        max_output_tokens: responses_request.max_output_tokens,
+        model: responses_request.model.clone(),
+        output,
+        parallel_tool_calls: responses_request.parallel_tool_calls.unwrap_or(true),
+        previous_response_id: responses_request.previous_response_id.clone(),
+        reasoning: None,
+        store: responses_request.store.unwrap_or(true),
+        temperature: responses_request.temperature,
+        text: None,
+        tool_choice: responses_request
+            .tool_choice
+            .as_ref()
+            .map(|tc| to_string(tc).unwrap_or_else(|_| "auto".to_string())) // falls back to "auto" if `to_string` fails
+            .unwrap_or_else(|| "auto".to_string()), // also "auto" when the request has no tool_choice
+        tools: responses_request.tools.clone().unwrap_or_default(),
+        top_p: responses_request.top_p,
+        truncation: None,
+        usage: Some(ResponsesUsage::Modern(ResponseUsage {
+            input_tokens: usage.prompt_tokens,
+            output_tokens: usage.completion_tokens,
+            total_tokens: usage.total_tokens,
+            input_tokens_details: None,
+            output_tokens_details: None,
+        })),
+        user: None, // NOTE(review): the request's `user` is surfaced as `safety_identifier` below while `user` stays None - confirm intended
+        safety_identifier: responses_request.user.clone(),
+        metadata: responses_request.metadata.clone().unwrap_or_default(),
+    }
 }

 /// Execute MCP tools and collect results
@@ -758,8 +988,7 @@ async fn execute_mcp_tools(

                // Extract content from MCP result
                let output = if let Some(content) = mcp_result.content.first() {
-                    // TODO: Handle different content types (text, image, resource)
-                    // For now, serialize the entire content item
+                    // Serialize the entire content item
                    to_value(content)
                        .unwrap_or_else(|_| json!({"error": "Failed to serialize tool result"}))
                } else {

--- a/sgl-router/src/routers/grpc/harmony/stages/preparation.rs
+++ b/sgl-router/src/routers/grpc/harmony/stages/preparation.rs
@@ -214,7 +214,7 @@ impl HarmonyPreparationStage {
            let params_schema = &tool.function.parameters;

            tags.push(json!({
-                "begin": format!("<|channel|>commentary to=functions.{}<|constrain|>json<|message|>", tool_name),
+                "begin": format!("<|start|>assistant<|channel|>commentary to=functions.{}<|constrain|>json<|message|>", tool_name),
                "content": {
                    "type": "json_schema",
                    "json_schema": params_schema
@@ -228,7 +228,7 @@ impl HarmonyPreparationStage {
        let structural_tag = json!({
            "format": {
                "type": "triggered_tags",
-                "triggers": ["<|channel|>commentary"],
+                "triggers": ["<|start|>assistant"],
                "tags": tags,
                "at_least_one": true,
                "stop_after_first": stop_after_first

--- a/sgl-router/src/routers/grpc/harmony/streaming.rs
+++ b/sgl-router/src/routers/grpc/harmony/streaming.rs
@@ -35,6 +35,71 @@ use crate::{
        context,
    },
 };
+
+/// Mode for tool call event emission: selects between the `mcp_call` and `function_call` event families
+#[derive(Debug, Clone, Copy)]
+enum ToolCallMode {
+    /// MCP tool calls (emit .in_progress and .completed status events in addition to arguments events)
+    Mcp,
+    /// Function tool calls (no status events, only arguments streaming)
+    Function,
+}
+
+impl ToolCallMode {
+    /// Output item type to allocate for a tool call in this mode (`McpCall` or `FunctionCall`)
+    fn output_item_type(&self) -> OutputItemType {
+        match self {
+            Self::Mcp => OutputItemType::McpCall,
+            Self::Function => OutputItemType::FunctionCall,
+        }
+    }
+
+    /// Type string for JSON output: "mcp_call" or "function_call"
+    fn type_str(&self) -> &'static str {
+        match self {
+            Self::Mcp => "mcp_call",
+            Self::Function => "function_call",
+        }
+    }
+
+    /// Whether this mode emits status events (.in_progress, .completed); true only for `Mcp`
+    fn emits_status_events(&self) -> bool {
+        matches!(self, Self::Mcp)
+    }
+
+    /// Build the arguments delta event via the emitter method matching this mode; the returned value is not sent here
+    fn emit_arguments_delta(
+        &self,
+        emitter: &mut ResponseStreamEventEmitter,
+        output_index: usize,
+        item_id: &str,
+        delta: &str,
+    ) -> serde_json::Value {
+        match self {
+            Self::Mcp => emitter.emit_mcp_call_arguments_delta(output_index, item_id, delta),
+            Self::Function => {
+                emitter.emit_function_call_arguments_delta(output_index, item_id, delta)
+            }
+        }
+    }
+
+    /// Build the arguments done event via the emitter method matching this mode; the returned value is not sent here
+    fn emit_arguments_done(
+        &self,
+        emitter: &mut ResponseStreamEventEmitter,
+        output_index: usize,
+        item_id: &str,
+        arguments: &str,
+    ) -> serde_json::Value {
+        match self {
+            Self::Mcp => emitter.emit_mcp_call_arguments_done(output_index, item_id, arguments),
+            Self::Function => {
+                emitter.emit_function_call_arguments_done(output_index, item_id, arguments)
+            }
+        }
+    }
+}
+
 /// Processor for streaming Harmony responses
 ///
 /// Returns an SSE stream that parses Harmony tokens incrementally and
@@ -531,14 +596,14 @@ impl HarmonyStreamingProcessor {
        Ok(())
    }

-    /// Common decode stream processing logic for both single and dual stream modes
+    /// Decode stream processing for tool loops
    ///
-    /// This helper function contains the shared logic for processing the decode stream,
-    /// parsing Harmony tokens, emitting SSE events, and tracking state.
-    async fn process_decode_stream_common(
+    /// Emits tool call events based on the mode (MCP or Function).
+    async fn process_decode_stream(
        mut decode_stream: AbortOnDropStream,
        emitter: &mut ResponseStreamEventEmitter,
        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
+        mode: ToolCallMode,
    ) -> Result<ResponsesIterationResult, String> {
        // Initialize Harmony parser for this iteration
        let mut parser =
@@ -555,12 +620,14 @@ impl HarmonyStreamingProcessor {
        let mut message_item_id: Option<String> = None;
        let mut has_emitted_content_part_added = false;

-        // MCP tool call tracking (call_index -> (output_index, item_id))
-        let mut mcp_call_tracking: HashMap<usize, (usize, String)> = HashMap::new();
+        // Tool call tracking (call_index -> (output_index, item_id))
+        let mut tool_call_tracking: HashMap<usize, (usize, String)> = HashMap::new();

        // Metadata from Complete message
        let mut finish_reason = String::from("stop");
        let mut matched_stop: Option<serde_json::Value> = None;
+        let mut prompt_tokens: u32 = 0;
+        let mut completion_tokens: u32 = 0;

        // Process stream
        let mut chunk_count = 0;
@@ -646,29 +713,53 @@ impl HarmonyStreamingProcessor {
                            }
                        }

-                        // Commentary channel → MCP tool call streaming
+                        // Commentary channel → Tool call streaming
                        if let Some(tc_delta) = &delta.commentary_delta {
                            let call_index = tc_delta.index;

                            // Check if this is a new tool call (has id and name)
                            if tc_delta.id.is_some() {
-                                // NEW MCP CALL: Allocate output item and emit in_progress
+                                // NEW TOOL CALL: Allocate output item
                                let (output_index, item_id) =
-                                    emitter.allocate_output_index(OutputItemType::McpCall);
+                                    emitter.allocate_output_index(mode.output_item_type());

                                // Store tracking info
-                                mcp_call_tracking
+                                tool_call_tracking
                                    .insert(call_index, (output_index, item_id.clone()));

-                                // Emit mcp_call.in_progress
-                                let event =
-                                    emitter.emit_mcp_call_in_progress(output_index, &item_id);
+                                // Get tool name
+                                let tool_name = tc_delta
+                                    .function
+                                    .as_ref()
+                                    .and_then(|f| f.name.as_ref())
+                                    .map(|n| n.as_str())
+                                    .unwrap_or("");
+
+                                // Emit output_item.added wrapper event
+                                let call_id = tc_delta.id.as_ref().unwrap();
+                                let item = json!({
+                                    "id": item_id,
+                                    "type": mode.type_str(),
+                                    "name": tool_name,
+                                    "call_id": call_id,
+                                    "arguments": "",
+                                    "status": "in_progress"
+                                });
+                                let event = emitter.emit_output_item_added(output_index, &item);
                                emitter.send_event_best_effort(&event, tx);

-                                // If we have function name, emit initial mcp_call_arguments.delta
+                                // Emit status event if mode supports it (MCP only)
+                                if mode.emits_status_events() {
+                                    let event =
+                                        emitter.emit_mcp_call_in_progress(output_index, &item_id);
+                                    emitter.send_event_best_effort(&event, tx);
+                                }
+
+                                // If we have function name, emit initial arguments delta
                                if let Some(func) = &tc_delta.function {
                                    if func.name.is_some() {
-                                        let event = emitter.emit_mcp_call_arguments_delta(
+                                        let event = mode.emit_arguments_delta(
+                                            emitter,
                                            output_index,
                                            &item_id,
                                            "",
@@ -677,9 +768,9 @@ impl HarmonyStreamingProcessor {
                                    }
                                }
                            } else {
-                                // CONTINUING MCP CALL: Emit arguments delta
+                                // CONTINUING TOOL CALL: Emit arguments delta
                                if let Some((output_index, item_id)) =
-                                    mcp_call_tracking.get(&call_index)
+                                    tool_call_tracking.get(&call_index)
                                {
                                    if let Some(args) = tc_delta
                                        .function
@@ -687,7 +778,8 @@ impl HarmonyStreamingProcessor {
                                        .and_then(|f| f.arguments.as_ref())
                                        .filter(|a| !a.is_empty())
                                    {
-                                        let event = emitter.emit_mcp_call_arguments_delta(
+                                        let event = mode.emit_arguments_delta(
+                                            emitter,
                                            *output_index,
                                            item_id,
                                            args,
@@ -704,12 +796,14 @@ impl HarmonyStreamingProcessor {
                    finish_reason = complete.finish_reason.clone();
                    matched_stop = complete.matched_stop.as_ref().map(|m| match m {
                        MatchedTokenId(id) => {
-                            serde_json::json!(id)
+                            json!(id)
                        }
                        MatchedStopStr(s) => {
-                            serde_json::json!(s)
+                            json!(s)
                        }
                    });
+                    prompt_tokens = complete.prompt_tokens as u32;
+                    completion_tokens = complete.completion_tokens as u32;

                    // Finalize parser and get complete output
                    let final_output = parser
@@ -719,23 +813,42 @@ impl HarmonyStreamingProcessor {
                    // Store finalized tool calls
                    accumulated_tool_calls = final_output.commentary.clone();

-                    // Complete all MCP tool calls if we have commentary
+                    // Complete all tool calls if we have commentary
                    if let Some(ref tool_calls) = accumulated_tool_calls {
                        for (call_idx, tool_call) in tool_calls.iter().enumerate() {
-                            if let Some((output_index, item_id)) = mcp_call_tracking.get(&call_idx)
+                            if let Some((output_index, item_id)) = tool_call_tracking.get(&call_idx)
                            {
-                                // Emit mcp_call_arguments.done with final arguments
+                                let tool_name = &tool_call.function.name;
+
+                                // Emit arguments done with final arguments
                                let args_str =
                                    tool_call.function.arguments.as_deref().unwrap_or("");
-                                let event = emitter.emit_mcp_call_arguments_done(
+
+                                let event = mode.emit_arguments_done(
+                                    emitter,
                                    *output_index,
                                    item_id,
                                    args_str,
                                );
                                emitter.send_event_best_effort(&event, tx);

-                                // Emit mcp_call.completed
-                                let event = emitter.emit_mcp_call_completed(*output_index, item_id);
+                                // Emit status event if mode supports it (MCP only)
+                                if mode.emits_status_events() {
+                                    let event =
+                                        emitter.emit_mcp_call_completed(*output_index, item_id);
+                                    emitter.send_event_best_effort(&event, tx);
+                                }
+
+                                // Emit output_item.done wrapper event
+                                let item = json!({
+                                    "id": item_id,
+                                    "type": mode.type_str(),
+                                    "name": tool_name,
+                                    "call_id": &tool_call.id,
+                                    "arguments": args_str,
+                                    "status": "completed"
+                                });
+                                let event = emitter.emit_output_item_done(*output_index, &item);
                                emitter.send_event_best_effort(&event, tx);

                                // Mark output item as completed
@@ -811,19 +924,21 @@ impl HarmonyStreamingProcessor {
                final_text_extracted.len()
            );

-            // Complete any pending MCP tool calls with data from completed messages
+            // Complete any pending tool calls with data from completed messages
            if let Some(ref tool_calls) = accumulated_tool_calls {
                for (call_idx, tool_call) in tool_calls.iter().enumerate() {
-                    if let Some((output_index, item_id)) = mcp_call_tracking.get(&call_idx) {
-                        // Emit mcp_call_arguments.done with final arguments
+                    if let Some((output_index, item_id)) = tool_call_tracking.get(&call_idx) {
+                        // Emit arguments done with final arguments
                        let args_str = tool_call.function.arguments.as_deref().unwrap_or("");
                        let event =
-                            emitter.emit_mcp_call_arguments_done(*output_index, item_id, args_str);
+                            mode.emit_arguments_done(emitter, *output_index, item_id, args_str);
                        emitter.send_event_best_effort(&event, tx);

-                        // Emit mcp_call.completed
-                        let event = emitter.emit_mcp_call_completed(*output_index, item_id);
-                        emitter.send_event_best_effort(&event, tx);
+                        // Emit status event if mode supports it (MCP only)
+                        if mode.emits_status_events() {
+                            let event = emitter.emit_mcp_call_completed(*output_index, item_id);
+                            emitter.send_event_best_effort(&event, tx);
+                        }
                    }
                }
            }
@@ -848,6 +963,13 @@ impl HarmonyStreamingProcessor {
                    tool_calls,
                    analysis: analysis_content,
                    partial_text: accumulated_final_text,
+                    usage: Usage {
+                        prompt_tokens,
+                        completion_tokens,
+                        total_tokens: prompt_tokens + completion_tokens,
+                        completion_tokens_details: None,
+                    },
+                    request_id: emitter.response_id.clone(),
                });
            }
        }
@@ -857,7 +979,7 @@ impl HarmonyStreamingProcessor {
        // Return a placeholder Completed result (caller ignores these fields in streaming mode)
        Ok(ResponsesIterationResult::Completed {
            response: Box::new(ResponsesResponse {
-                id: String::new(),
+                id: emitter.response_id.clone(),
                object: "response".to_string(),
                created_at: 0,
                status: ResponseStatus::Completed,
@@ -881,76 +1003,136 @@ impl HarmonyStreamingProcessor {
                safety_identifier: None,
                metadata: HashMap::new(),
                usage: Some(ResponsesUsage::Modern(ResponseUsage {
-                    input_tokens: 0,
-                    output_tokens: 0,
-                    total_tokens: 0,
+                    input_tokens: prompt_tokens,
+                    output_tokens: completion_tokens,
+                    total_tokens: prompt_tokens + completion_tokens,
                    input_tokens_details: None,
                    output_tokens_details: None,
                })),
            }),
            usage: Usage {
-                prompt_tokens: 0,
-                completion_tokens: 0,
-                total_tokens: 0,
+                prompt_tokens,
+                completion_tokens,
+                total_tokens: prompt_tokens + completion_tokens,
                completion_tokens_details: None,
            },
        })
    }

-    /// Process streaming chunks for Responses API iteration
+    /// Process streaming chunks for Responses API iteration - MCP loop
    ///
-    /// Returns ResponsesIterationResult indicating whether tool calls were found
-    /// (requiring MCP loop continuation) or if the iteration is complete.
-    pub async fn process_responses_iteration_stream(
+    /// Emits mcp_call.* events for all tool calls
+    pub async fn process_responses_iteration_stream_mcp(
        execution_result: context::ExecutionResult,
        emitter: &mut ResponseStreamEventEmitter,
        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
    ) -> Result<ResponsesIterationResult, String> {
        match execution_result {
            context::ExecutionResult::Single { stream } => {
-                debug!("Processing Responses API single stream mode");
-                Self::process_responses_single_stream(stream, emitter, tx).await
+                debug!("Processing Responses API single stream mode (MCP)");
+                Self::process_responses_single_stream_mcp(stream, emitter, tx).await
            }
            context::ExecutionResult::Dual { prefill, decode } => {
-                debug!("Processing Responses API dual stream mode");
-                Self::process_responses_dual_stream(prefill, *decode, emitter, tx).await
+                debug!("Processing Responses API dual stream mode (MCP)");
+                Self::process_responses_dual_stream_mcp(prefill, *decode, emitter, tx).await
            }
        }
    }

-    /// Process streaming chunks from a single stream (Responses API)
-    async fn process_responses_single_stream(
+    /// Process streaming chunks for Responses API iteration - Function tools
+    ///
+    /// Emits function_call_arguments.* events for all tool calls
+    pub async fn process_responses_iteration_stream_function(
+        execution_result: context::ExecutionResult,
+        emitter: &mut ResponseStreamEventEmitter,
+        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
+    ) -> Result<ResponsesIterationResult, String> {
+        match execution_result {
+            context::ExecutionResult::Single { stream } => {
+                debug!("Processing Responses API single stream mode (Function)");
+                Self::process_responses_single_stream_function(stream, emitter, tx).await
+            }
+            context::ExecutionResult::Dual { prefill, decode } => {
+                debug!("Processing Responses API dual stream mode (Function)");
+                Self::process_responses_dual_stream_function(prefill, *decode, emitter, tx).await
+            }
+        }
+    }
+
+    /// Process streaming chunks from a single stream - MCP loop
+    async fn process_responses_single_stream_mcp(
+        grpc_stream: AbortOnDropStream,
+        emitter: &mut ResponseStreamEventEmitter,
+        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
+    ) -> Result<ResponsesIterationResult, String> {
+        Self::process_decode_stream(grpc_stream, emitter, tx, ToolCallMode::Mcp).await
+    }
+
+    /// Process streaming chunks from a single stream - Function tools
+    async fn process_responses_single_stream_function(
        grpc_stream: AbortOnDropStream,
        emitter: &mut ResponseStreamEventEmitter,
        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
    ) -> Result<ResponsesIterationResult, String> {
-        // Delegate to common helper
-        Self::process_decode_stream_common(grpc_stream, emitter, tx).await
+        Self::process_decode_stream(grpc_stream, emitter, tx, ToolCallMode::Function).await
    }

-    /// Process streaming chunks from dual streams (Responses API)
+    /// Process streaming chunks from dual streams (common implementation)
    async fn process_responses_dual_stream(
        mut prefill_stream: AbortOnDropStream,
        decode_stream: AbortOnDropStream,
        emitter: &mut ResponseStreamEventEmitter,
        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
+        mode: ToolCallMode,
    ) -> Result<ResponsesIterationResult, String> {
        // Phase 1: Process prefill stream (collect metadata, no output)
        while let Some(result) = prefill_stream.next().await {
            let _response = result.map_err(|e| format!("Prefill stream error: {}", e))?;
-            // No-op for prefill in Responses API (just metadata collection)
        }

        // Phase 2: Process decode stream using common helper
-        let result = Self::process_decode_stream_common(decode_stream, emitter, tx).await;
+        let result = Self::process_decode_stream(decode_stream, emitter, tx, mode).await;

        // Mark prefill stream as completed AFTER decode completes successfully
        // This ensures that if client disconnects during decode, BOTH streams send abort
        prefill_stream.mark_completed();
-
        result
    }

+    /// Process streaming chunks from dual streams - MCP loop
+    async fn process_responses_dual_stream_mcp(
+        prefill_stream: AbortOnDropStream,
+        decode_stream: AbortOnDropStream,
+        emitter: &mut ResponseStreamEventEmitter,
+        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
+    ) -> Result<ResponsesIterationResult, String> {
+        Self::process_responses_dual_stream(
+            prefill_stream,
+            decode_stream,
+            emitter,
+            tx,
+            ToolCallMode::Mcp,
+        )
+        .await
+    }
+
+    /// Process streaming chunks from dual streams - Function tools
+    async fn process_responses_dual_stream_function(
+        prefill_stream: AbortOnDropStream,
+        decode_stream: AbortOnDropStream,
+        emitter: &mut ResponseStreamEventEmitter,
+        tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
+    ) -> Result<ResponsesIterationResult, String> {
+        Self::process_responses_dual_stream(
+            prefill_stream,
+            decode_stream,
+            emitter,
+            tx,
+            ToolCallMode::Function,
+        )
+        .await
+    }
+
    /// Build SSE response from receiver
    fn build_sse_response(rx: mpsc::UnboundedReceiver<Result<Bytes, io::Error>>) -> Response {
        let stream = UnboundedReceiverStream::new(rx);

--- a/sgl-router/src/routers/grpc/regular/responses/conversions.rs
+++ b/sgl-router/src/routers/grpc/regular/responses/conversions.rs
@@ -226,7 +226,7 @@ pub fn responses_to_chat(req: &ResponsesRequest) -> Result<ChatCompletionRequest
        parallel_tool_calls: req.parallel_tool_calls,
        top_logprobs: req.top_logprobs,
        top_p: req.top_p,
-        skip_special_tokens: true, // Always skip special tokens // TODO: except for gpt-oss
+        skip_special_tokens: true,
        // Note: tools and tool_choice will be handled separately for MCP transformation
        tools: None,       // Will be set by caller if needed
        tool_choice: None, // Will be set by caller if needed

--- a/sgl-router/src/routers/grpc/regular/responses/handlers.rs
+++ b/sgl-router/src/routers/grpc/regular/responses/handlers.rs
@@ -36,14 +36,13 @@ use std::sync::Arc;

 use axum::{
    body::Body,
-    http::{self, header, StatusCode},
+    http::{self, StatusCode},
    response::{IntoResponse, Response},
 };
 use bytes::Bytes;
 use futures_util::StreamExt;
 use serde_json::json;
 use tokio::sync::mpsc;
-use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{debug, warn};
 use uuid::Uuid;
 use validator::Validate;
@@ -67,8 +66,13 @@ use crate::{
        },
    },
    routers::{
-        grpc::{common::responses::streaming::ResponseStreamEventEmitter, error},
-        openai::{conversations::persist_conversation_items, mcp::ensure_request_mcp_client},
+        grpc::{
+            common::responses::{
+                build_sse_response, ensure_mcp_connection, streaming::ResponseStreamEventEmitter,
+            },
+            error,
+        },
+        openai::conversations::persist_conversation_items,
    },
 };

@@ -81,33 +85,6 @@ pub async fn route_responses(
    headers: Option<http::HeaderMap>,
    model_id: Option<String>,
 ) -> Response {
-    // 0. Fast worker validation (fail-fast before expensive operations)
-    let requested_model: Option<&str> = model_id.as_deref().or(Some(request.model.as_str()));
-
-    if let Some(model) = requested_model {
-        // Check if any workers support this model
-        let available_models = ctx.worker_registry.get_models();
-
-        if !available_models.contains(&model.to_string()) {
-            return (
-                StatusCode::SERVICE_UNAVAILABLE,
-                axum::Json(json!({
-                    "error": {
-                        "message": format!(
-                            "No workers available for model '{}'. Available models: {}",
-                            model,
-                            available_models.join(", ")
-                        ),
-                        "type": "service_unavailable",
-                        "param": "model",
-                        "code": "no_available_workers"
-                    }
-                })),
-            )
-                .into_response();
-        }
-    }
-
    // 1. Validate request (includes conversation ID format)
    if let Err(validation_errors) = request.validate() {
        // Extract the first error message for conversation field
@@ -171,7 +148,10 @@ pub async fn route_responses(
    if is_streaming {
        route_responses_streaming(ctx, request, headers, model_id).await
    } else {
-        route_responses_sync(ctx, request, headers, model_id, None).await
+        // Generate response ID for synchronous execution
+        // TODO: this may be removed once responses are constructed via a builder pattern
+        let response_id = Some(format!("resp_{}", Uuid::new_v4()));
+        route_responses_sync(ctx, request, headers, model_id, response_id).await
    }
 }

@@ -211,40 +191,24 @@ async fn route_responses_internal(
    // 1. Load conversation history and build modified request
    let modified_request = load_conversation_history(ctx, &request).await?;

-    // 2. Check if request has MCP tools - if so, use tool loop
-    let responses_response = if let Some(tools) = &request.tools {
-        // Ensure dynamic MCP client is registered for request-scoped tools
-        if ensure_request_mcp_client(&ctx.mcp_manager, tools)
-            .await
-            .is_some()
-        {
-            debug!("MCP tools detected, using tool loop");
+    // 2. Ensure the MCP connection and determine whether MCP tools are present
+    let has_mcp_tools = ensure_mcp_connection(&ctx.mcp_manager, request.tools.as_deref()).await?;

-            // Execute with MCP tool loop
-            execute_tool_loop(
-                ctx,
-                modified_request,
-                &request,
-                headers,
-                model_id,
-                response_id.clone(),
-            )
-            .await?
-        } else {
-            debug!("Failed to create MCP client from request tools");
-            // Fall through to non-MCP execution
-            execute_without_mcp(
-                ctx,
-                &modified_request,
-                &request,
-                headers,
-                model_id,
-                response_id.clone(),
-            )
-            .await?
-        }
+    let responses_response = if has_mcp_tools {
+        debug!("MCP tools detected, using tool loop");
+
+        // Execute with MCP tool loop
+        execute_tool_loop(
+            ctx,
+            modified_request,
+            &request,
+            headers,
+            model_id,
+            response_id.clone(),
+        )
+        .await?
    } else {
-        // No tools, execute normally
+        // No MCP tools - execute without MCP (may have function tools or no tools)
        execute_without_mcp(
            ctx,
            &modified_request,
@@ -289,18 +253,18 @@ async fn route_responses_streaming(
        Err(response) => return response, // Already a Response with proper status code
    };

-    // 2. Check if request has MCP tools - if so, use streaming tool loop
-    if let Some(tools) = &request.tools {
-        // Ensure dynamic MCP client is registered for request-scoped tools
-        if ensure_request_mcp_client(&ctx.mcp_manager, tools)
-            .await
-            .is_some()
-        {
-            debug!("MCP tools detected in streaming mode, using streaming tool loop");
+    // 2. Ensure the MCP connection and determine whether MCP tools are present
+    let has_mcp_tools =
+        match ensure_mcp_connection(&ctx.mcp_manager, request.tools.as_deref()).await {
+            Ok(has_mcp) => has_mcp,
+            Err(response) => return response,
+        };

-            return execute_tool_loop_streaming(ctx, modified_request, &request, headers, model_id)
-                .await;
-        }
+    if has_mcp_tools {
+        debug!("MCP tools detected in streaming mode, using streaming tool loop");
+
+        return execute_tool_loop_streaming(ctx, modified_request, &request, headers, model_id)
+            .await;
    }

    // 3. Convert ResponsesRequest → ChatCompletionRequest
@@ -352,8 +316,8 @@ async fn convert_chat_stream_to_responses_stream(
        )
        .await;

-    // Extract body and headers from chat response
-    let (parts, body) = chat_response.into_parts();
+    // Extract body from chat response
+    let (_parts, body) = chat_response.into_parts();

    // Create channel for transformed SSE events
    let (tx, rx) = mpsc::unbounded_channel::<Result<Bytes, std::io::Error>>();
@@ -392,29 +356,7 @@ async fn convert_chat_stream_to_responses_stream(
    });

    // Build SSE response with transformed stream
-    let stream = UnboundedReceiverStream::new(rx);
-    let body = Body::from_stream(stream);
-
-    let mut response = Response::builder().status(parts.status).body(body).unwrap();
-
-    // Copy headers from original chat response
-    *response.headers_mut() = parts.headers;
-
-    // Ensure SSE headers are set
-    response.headers_mut().insert(
-        header::CONTENT_TYPE,
-        header::HeaderValue::from_static("text/event-stream"),
-    );
-    response.headers_mut().insert(
-        header::CACHE_CONTROL,
-        header::HeaderValue::from_static("no-cache"),
-    );
-    response.headers_mut().insert(
-        header::CONNECTION,
-        header::HeaderValue::from_static("keep-alive"),
-    );
-
-    response
+    build_sse_response(rx)
 }

 /// Process chat SSE stream and transform to responses format

--- a/sgl-router/src/routers/grpc/router.rs
+++ b/sgl-router/src/routers/grpc/router.rs
@@ -10,7 +10,10 @@ use axum::{
 use tracing::debug;

 use super::{
-    common::responses::handlers::{cancel_response_impl, get_response_impl},
+    common::responses::{
+        handlers::{cancel_response_impl, get_response_impl},
+        utils::validate_worker_availability,
+    },
    context::SharedComponents,
    harmony::{
        serve_harmony_responses, serve_harmony_responses_stream, HarmonyDetector,
@@ -191,14 +194,18 @@ impl GrpcRouter {
        body: &ResponsesRequest,
        model_id: Option<&str>,
    ) -> Response {
+        // 0. Fast worker validation (fail-fast before expensive operations)
+        let requested_model: Option<&str> = model_id.or(Some(body.model.as_str()));
+
+        if let Some(error_response) = requested_model
+            .and_then(|model| validate_worker_availability(&self.worker_registry, model))
+        {
+            return error_response;
+        }
+
        // Choose implementation based on Harmony model detection
        let is_harmony = HarmonyDetector::is_harmony_model(&body.model);

-        debug!(
-            "Processing responses request for model: {:?}, using_harmony={}",
-            model_id, is_harmony
-        );
-
        if is_harmony {
            debug!(
                "Processing Harmony responses request for model: {:?}, streaming: {:?}",