Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
700daa34
Unverified
Commit
700daa34
authored
Oct 30, 2025
by
Simo Lin
Committed by
GitHub
Oct 30, 2025
Browse files
[router] harmony responses api streaming support (#12395)
parent
39cee0fe
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
984 additions
and
122 deletions
+984
-122
sgl-router/src/routers/grpc/harmony/mod.rs
sgl-router/src/routers/grpc/harmony/mod.rs
+3
-1
sgl-router/src/routers/grpc/harmony/parser.rs
sgl-router/src/routers/grpc/harmony/parser.rs
+46
-1
sgl-router/src/routers/grpc/harmony/responses.rs
sgl-router/src/routers/grpc/harmony/responses.rs
+356
-56
sgl-router/src/routers/grpc/harmony/stages/response_processing.rs
...er/src/routers/grpc/harmony/stages/response_processing.rs
+34
-19
sgl-router/src/routers/grpc/harmony/streaming.rs
sgl-router/src/routers/grpc/harmony/streaming.rs
+429
-5
sgl-router/src/routers/grpc/pipeline.rs
sgl-router/src/routers/grpc/pipeline.rs
+49
-0
sgl-router/src/routers/grpc/responses/streaming.rs
sgl-router/src/routers/grpc/responses/streaming.rs
+52
-33
sgl-router/src/routers/grpc/router.rs
sgl-router/src/routers/grpc/router.rs
+15
-7
No files found.
sgl-router/src/routers/grpc/harmony/mod.rs
View file @
700daa34
...
...
@@ -43,7 +43,9 @@ pub use builder::HarmonyBuilder;
pub
use
detector
::
HarmonyDetector
;
pub
use
parser
::
HarmonyParserAdapter
;
pub
use
processor
::{
HarmonyResponseProcessor
,
ResponsesIterationResult
};
pub
use
responses
::{
serve_harmony_responses
,
HarmonyResponsesContext
};
pub
use
responses
::{
serve_harmony_responses
,
serve_harmony_responses_stream
,
HarmonyResponsesContext
,
};
pub
use
stages
::{
HarmonyPreparationStage
,
HarmonyRequestBuildingStage
,
HarmonyResponseProcessingStage
,
};
...
...
sgl-router/src/routers/grpc/harmony/parser.rs
View file @
700daa34
...
...
@@ -103,7 +103,7 @@ impl HarmonyParserAdapter {
/// # Returns
///
/// Tuple of (analysis, commentary, final_text)
fn
parse_messages
(
pub
fn
parse_messages
(
messages
:
&
[
openai_harmony
::
chat
::
Message
],
)
->
(
Option
<
String
>
,
Option
<
Vec
<
ToolCall
>>
,
String
)
{
let
mut
analysis
=
None
;
...
...
@@ -260,6 +260,51 @@ impl HarmonyParserAdapter {
self
.parser
.messages
()
.to_vec
()
}
/// Extract incomplete commentary content from parser state
///
/// When the stream ends, there may be incomplete commentary content in the parser
/// that hasn't been finalized into a completed message. This method reads the
/// parser's current channel, recipient and content and, when they describe a
/// function call, wraps the content in a single [`ToolCall`].
///
/// # Returns
///
/// `Some(vec![tool_call])` when all of the following hold, otherwise `None`:
///
/// - the current channel is `"commentary"`
/// - the current recipient exists and starts with `"functions."`
/// - the current content can be read and is non-empty
///
/// The tool call gets a freshly generated `call_<uuid>` id, the recipient with the
/// `"functions."` prefix removed as its function name, and the raw (possibly
/// partial) content as its arguments.
pub fn extract_incomplete_commentary(&self) -> Option<Vec<ToolCall>> {
    // Only the commentary channel is treated as a tool call here
    if self.parser.current_channel().as_deref() != Some("commentary") {
        return None;
    }

    // The recipient must look like "functions.{name}"; `strip_prefix` both
    // validates the prefix and yields the function name, so no unwrap is needed.
    let recipient = self.parser.current_recipient()?;
    let function_name = recipient.strip_prefix("functions.")?;

    // A parser error while reading the content is treated the same as "nothing found"
    let content = self.parser.current_content().ok()?;
    if content.is_empty() {
        return None;
    }

    // Create tool call from the incomplete content
    let tool_call = ToolCall {
        id: format!("call_{}", Uuid::new_v4()),
        tool_type: "function".to_string(),
        function: FunctionCallResponse {
            name: function_name.to_string(),
            arguments: Some(content),
        },
    };

    Some(vec![tool_call])
}
/// Parse streaming chunk
///
/// Parses incremental token IDs and returns a delta with any new content
...
...
sgl-router/src/routers/grpc/harmony/responses.rs
View file @
700daa34
...
...
@@ -36,10 +36,17 @@
//! See `/Users/simolin/workspace/sglang/.claude/docs/harmony_pipeline/tool_loop_design.md`
//! for complete architecture, rationale, and implementation details.
use
std
::
sync
::
Arc
;
use
std
::{
sync
::
Arc
,
time
::{
SystemTime
,
UNIX_EPOCH
},
};
use
axum
::
response
::
Response
;
use
axum
::
{
body
::
Body
,
http
::
StatusCode
,
response
::
Response
}
;
use
serde_json
::
Value
as
JsonValue
;
use
tokio
::
sync
::
mpsc
;
use
tokio_stream
::
wrappers
::
UnboundedReceiverStream
;
use
tracing
::{
debug
,
warn
};
use
uuid
::
Uuid
;
use
crate
::{
data_connector
::{
ResponseId
,
ResponseStorage
},
...
...
@@ -47,13 +54,19 @@ use crate::{
protocols
::{
common
::{
Function
,
ToolCall
},
responses
::{
ResponseInput
,
ResponseInputOutputItem
,
ResponseTool
,
Response
sRequest
,
ResponsesResponse
,
StringOrContentParts
,
ResponseInput
,
ResponseInputOutputItem
,
ResponseTool
,
Response
ToolType
,
ResponsesRequest
,
ResponsesResponse
,
StringOrContentParts
,
},
},
routers
::
grpc
::{
context
::
SharedComponents
,
harmony
::
processor
::
ResponsesIterationResult
,
pipeline
::
RequestPipeline
,
utils
,
routers
::{
grpc
::{
context
::
SharedComponents
,
harmony
::
processor
::
ResponsesIterationResult
,
pipeline
::
RequestPipeline
,
responses
::
streaming
::{
OutputItemType
,
ResponseStreamEventEmitter
},
utils
,
},
openai
::
mcp
::
ensure_request_mcp_client
,
},
};
...
...
@@ -93,7 +106,7 @@ struct McpCallTracking {
}
impl
McpCallTracking
{
fn
new
(
server_label
:
String
)
->
Self
{
pub
fn
new
(
server_label
:
String
)
->
Self
{
Self
{
server_label
,
tool_calls
:
Vec
::
new
(),
...
...
@@ -143,7 +156,7 @@ pub struct HarmonyResponsesContext {
pub
response_storage
:
Arc
<
dyn
ResponseStorage
>
,
/// Optional streaming sender (for future streaming support)
pub
stream_tx
:
Option
<
tokio
::
sync
::
mpsc
::
UnboundedSender
<
Result
<
String
,
String
>>>
,
pub
stream_tx
:
Option
<
mpsc
::
UnboundedSender
<
Result
<
String
,
String
>>>
,
}
impl
HarmonyResponsesContext
{
...
...
@@ -169,7 +182,7 @@ impl HarmonyResponsesContext {
components
:
Arc
<
SharedComponents
>
,
mcp_manager
:
Arc
<
McpManager
>
,
response_storage
:
Arc
<
dyn
ResponseStorage
>
,
stream_tx
:
tokio
::
sync
::
mpsc
::
UnboundedSender
<
Result
<
String
,
String
>>
,
stream_tx
:
mpsc
::
UnboundedSender
<
Result
<
String
,
String
>>
,
)
->
Self
{
Self
{
pipeline
,
...
...
@@ -226,12 +239,6 @@ pub async fn serve_harmony_responses(
let
mut
current_request
=
load_previous_messages
(
ctx
,
request
)
.await
?
;
let
mut
iteration_count
=
0
;
// Check if request has MCP tools - if so, ensure dynamic client is registered
// and add static MCP tools to the request
use
crate
::{
protocols
::
responses
::
ResponseToolType
,
routers
::
openai
::
mcp
::
ensure_request_mcp_client
,
};
let
has_mcp_tools
=
current_request
.tools
.as_ref
()
...
...
@@ -265,7 +272,7 @@ pub async fn serve_harmony_responses(
all_tools
.extend
(
mcp_response_tools
);
current_request
.tools
=
Some
(
all_tools
);
tracing
::
debug!
(
debug!
(
mcp_tool_count
=
mcp_tools
.len
(),
total_tool_count
=
current_request
.tools
.as_ref
()
.map
(|
t
|
t
.len
())
.unwrap_or
(
0
),
"Request has MCP tools - added static MCP tools to Harmony Responses request"
...
...
@@ -284,7 +291,7 @@ pub async fn serve_harmony_responses(
)));
}
tracing
::
debug!
(
debug!
(
iteration
=
iteration_count
,
"Harmony Responses serving iteration"
);
...
...
@@ -308,7 +315,7 @@ pub async fn serve_harmony_responses(
analysis
,
partial_text
,
}
=>
{
tracing
::
debug!
(
debug!
(
tool_call_count
=
tool_calls
.len
(),
has_analysis
=
analysis
.is_some
(),
partial_text_len
=
partial_text
.len
(),
...
...
@@ -347,7 +354,7 @@ pub async fn serve_harmony_responses(
mut
response
,
usage
,
}
=>
{
tracing
::
debug!
(
debug!
(
output_items
=
response
.output
.len
(),
input_tokens
=
usage
.prompt_tokens
,
output_tokens
=
usage
.completion_tokens
,
...
...
@@ -360,7 +367,7 @@ pub async fn serve_harmony_responses(
if
let
Some
(
tracking
)
=
mcp_tracking
{
inject_mcp_metadata
(
&
mut
response
,
&
tracking
,
&
ctx
.mcp_manager
);
tracing
::
debug!
(
debug!
(
mcp_calls
=
tracking
.total_calls
(),
output_items_after
=
response
.output
.len
(),
"Injected MCP metadata into final response"
...
...
@@ -375,6 +382,327 @@ pub async fn serve_harmony_responses(
}
}
/// Serve Harmony Responses API with streaming (SSE)
///
/// This is the streaming equivalent of `serve_harmony_responses()`.
/// Emits SSE events for lifecycle, MCP list_tools, and per-iteration streaming.
///
/// # Architecture
///
/// - Loads previous conversation history; a failure there is returned directly
///   as the HTTP response (no stream is started)
/// - Spawns a background task that writes SSE events into an unbounded channel,
///   and immediately returns a response whose body reads from that channel
/// - Emits `response.created` and `response.in_progress` at start
/// - Emits `mcp_list_tools` events before the first iteration (only if the request
///   declares MCP tools)
/// - Loops through tool execution iterations (at most `MAX_TOOL_ITERATIONS`)
/// - Calls `HarmonyStreamingProcessor::process_responses_iteration_stream()` for
///   per-iteration events
/// - Emits `response.completed` (with usage) at end
/// - Reports errors inside the task as a `response.failed` event and stops
///
/// # Arguments
///
/// * `ctx` - Harmony responses context with pipeline and dependencies
/// * `request` - Responses API request
///
/// # Returns
///
/// SSE stream response with proper headers
pub async fn serve_harmony_responses_stream(
    ctx: &HarmonyResponsesContext,
    request: ResponsesRequest,
) -> Response {
    use std::io;

    use bytes::Bytes;

    // Load previous conversation history if previous_response_id is set
    let mut current_request = match load_previous_messages(ctx, request).await {
        Ok(req) => req,
        Err(err_response) => return err_response,
    };

    // Create SSE channel
    let (tx, rx) = mpsc::unbounded_channel();
    let stream = UnboundedReceiverStream::new(rx);

    // Create response event emitter
    let response_id = format!("resp_{}", Uuid::new_v4());
    let model = current_request.model.clone();
    // A system clock set before the Unix epoch must not panic the request
    // handler; fall back to 0 in that case.
    let created_at = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|elapsed| elapsed.as_secs())
        .unwrap_or(0);
    let mut emitter = ResponseStreamEventEmitter::new(response_id.clone(), model, created_at);

    // Clone context for spawned task
    let ctx_clone = ctx.clone();

    // Spawn async task to handle streaming
    tokio::spawn(async move {
        let ctx = &ctx_clone;

        // Clone response_id for closure to avoid borrow conflicts
        let response_id_for_error = response_id.clone();

        // Helper to emit a `response.failed` event; send failures are ignored
        // because the receiver going away means nobody is listening anymore.
        let emit_error = |tx: &mpsc::UnboundedSender<Result<Bytes, io::Error>>,
                          error_msg: &str| {
            // Create error event manually since emit_failed doesn't exist
            let event = serde_json::json!({
                "type": "response.failed",
                "response_id": response_id_for_error,
                "error": {
                    "message": error_msg,
                    "type": "internal_error"
                }
            });
            // `Value`'s Display impl renders compact JSON and cannot fail
            let sse_data = format!("data: {}\n\n", event);
            let _ = tx.send(Ok(Bytes::from(sse_data)));
        };

        // Emit initial response.created and response.in_progress events
        let event = emitter.emit_created();
        if emitter.send_event(&event, &tx).is_err() {
            return;
        }
        let event = emitter.emit_in_progress();
        if emitter.send_event(&event, &tx).is_err() {
            return;
        }

        // Check if request has MCP tools
        let has_mcp_tools = current_request
            .tools
            .as_ref()
            .map(|tools| {
                tools
                    .iter()
                    .any(|t| matches!(t.r#type, ResponseToolType::Mcp))
            })
            .unwrap_or(false);

        // Initialize MCP call tracking
        let mut mcp_tracking = if has_mcp_tools {
            Some(McpCallTracking::new("sglang-mcp".to_string()))
        } else {
            None
        };

        // Setup MCP tools and announce them to the client
        if has_mcp_tools {
            // Ensure dynamic MCP client is registered
            if let Some(tools) = &current_request.tools {
                ensure_request_mcp_client(&ctx.mcp_manager, tools).await;
            }

            // Take one snapshot of the tool inventory and use it both for the
            // request and for the mcp_list_tools events, so the two always agree.
            let mcp_tools = ctx.mcp_manager.list_tools();

            // Add static MCP tools from inventory
            if !mcp_tools.is_empty() {
                let mcp_response_tools = convert_mcp_tools_to_response_tools(&mcp_tools);

                let mut all_tools = current_request.tools.clone().unwrap_or_default();
                all_tools.extend(mcp_response_tools);
                current_request.tools = Some(all_tools);

                debug!(
                    mcp_tool_count = mcp_tools.len(),
                    total_tool_count = current_request
                        .tools
                        .as_ref()
                        .map(|t| t.len())
                        .unwrap_or(0),
                    "Added static MCP tools to Harmony Responses streaming request"
                );
            }

            // Emit mcp_list_tools (also when the inventory is empty)
            let (output_index, item_id) =
                emitter.allocate_output_index(OutputItemType::McpListTools);

            // Build tools list for item structure
            let tool_items: Vec<_> = mcp_tools
                .iter()
                .map(|t| {
                    use serde_json::{json, Value};
                    json!({
                        "name": t.name,
                        "description": t.description,
                        "input_schema": Value::Object((*t.input_schema).clone())
                    })
                })
                .collect();

            // Emit output_item.added
            let item = serde_json::json!({
                "id": item_id,
                "type": "mcp_list_tools",
                "server_label": "sglang-mcp",
                "status": "in_progress",
                "tools": []
            });
            let event = emitter.emit_output_item_added(output_index, &item);
            if emitter.send_event(&event, &tx).is_err() {
                return;
            }

            // Emit mcp_list_tools.in_progress
            let event = emitter.emit_mcp_list_tools_in_progress(output_index);
            if emitter.send_event(&event, &tx).is_err() {
                return;
            }

            // Emit mcp_list_tools.completed
            let event = emitter.emit_mcp_list_tools_completed(output_index, &mcp_tools);
            if emitter.send_event(&event, &tx).is_err() {
                return;
            }

            // Emit output_item.done
            let item_done = serde_json::json!({
                "id": item_id,
                "type": "mcp_list_tools",
                "server_label": "sglang-mcp",
                "status": "completed",
                "tools": tool_items
            });
            let event = emitter.emit_output_item_done(output_index, &item_done);
            if emitter.send_event(&event, &tx).is_err() {
                return;
            }

            emitter.complete_output_item(output_index);

            debug!(
                tool_count = mcp_tools.len(),
                "Emitted mcp_list_tools on first iteration"
            );
        }

        // Tool loop (bounded by MAX_TOOL_ITERATIONS)
        let mut iteration_count = 0;
        loop {
            iteration_count += 1;

            // Safety check: prevent infinite loops
            if iteration_count > MAX_TOOL_ITERATIONS {
                let error_msg = format!(
                    "Maximum tool iterations ({}) exceeded",
                    MAX_TOOL_ITERATIONS
                );
                emit_error(&tx, &error_msg);
                return;
            }

            debug!(
                iteration = iteration_count,
                "Harmony Responses streaming iteration"
            );

            // Execute through pipeline and get raw stream
            let execution_result = match ctx
                .pipeline
                .execute_harmony_responses_streaming(&current_request, ctx)
                .await
            {
                Ok(result) => result,
                Err(err_response) => {
                    let error_msg = format!("Pipeline execution failed: {:?}", err_response);
                    emit_error(&tx, &error_msg);
                    return;
                }
            };

            // Process stream with token-level streaming using HarmonyStreamingProcessor
            let iteration_result =
                match super::streaming::HarmonyStreamingProcessor::process_responses_iteration_stream(
                    execution_result,
                    &mut emitter,
                    &tx,
                )
                .await
                {
                    Ok(result) => result,
                    Err(err_msg) => {
                        emit_error(&tx, &err_msg);
                        return;
                    }
                };

            // Handle iteration result (tool calls or completion)
            match iteration_result {
                ResponsesIterationResult::ToolCallsFound {
                    tool_calls,
                    analysis,
                    partial_text,
                } => {
                    debug!(
                        tool_call_count = tool_calls.len(),
                        has_analysis = analysis.is_some(),
                        partial_text_len = partial_text.len(),
                        "Tool calls found in commentary channel"
                    );

                    // Execute MCP tools; tracking only exists when the request
                    // declared MCP tools, so tool calls without it are an error.
                    let tool_results = if let Some(ref mut tracking) = mcp_tracking {
                        match execute_mcp_tools(&ctx.mcp_manager, &tool_calls, tracking).await {
                            Ok(results) => results,
                            Err(err_response) => {
                                let error_msg =
                                    format!("MCP tool execution failed: {:?}", err_response);
                                emit_error(&tx, &error_msg);
                                return;
                            }
                        }
                    } else {
                        let error_msg = "Tool calls found but MCP tracking not initialized";
                        emit_error(&tx, error_msg);
                        return;
                    };

                    // Build next request with appended history
                    current_request = match build_next_request_with_tools(
                        current_request,
                        tool_calls,
                        tool_results,
                        analysis,
                        partial_text,
                    ) {
                        Ok(req) => req,
                        Err(e) => {
                            let error_msg = format!("Failed to build next request: {:?}", e);
                            emit_error(&tx, &error_msg);
                            return;
                        }
                    };

                    // Continue loop
                }
                ResponsesIterationResult::Completed { response, usage } => {
                    debug!(
                        output_items = response.output.len(),
                        input_tokens = usage.prompt_tokens,
                        output_tokens = usage.completion_tokens,
                        "Harmony Responses streaming completed - no more tool calls"
                    );

                    // Emit response.completed with usage
                    let usage_json = serde_json::json!({
                        "prompt_tokens": usage.prompt_tokens,
                        "completion_tokens": usage.completion_tokens,
                        "total_tokens": usage.total_tokens,
                    });
                    let event = emitter.emit_completed(Some(&usage_json));
                    emitter.send_event_best_effort(&event, &tx);

                    // Close channel
                    drop(tx);
                    return;
                }
            }
        }
    });

    // Return SSE stream response
    Response::builder()
        .status(StatusCode::OK)
        .header("Content-Type", "text/event-stream")
        .header("Cache-Control", "no-cache")
        .header("Connection", "keep-alive")
        .body(Body::from_stream(stream))
        .expect("static status and header values always form a valid response")
}
/// Execute MCP tools and collect results
///
/// Executes each tool call sequentially via the MCP manager.
...
...
@@ -397,7 +725,7 @@ async fn execute_mcp_tools(
let
mut
results
=
Vec
::
new
();
for
tool_call
in
tool_calls
{
tracing
::
debug!
(
debug!
(
tool_name
=
%
tool_call
.function.name
,
call_id
=
%
tool_call
.id
,
"Executing MCP tool"
...
...
@@ -425,7 +753,7 @@ async fn execute_mcp_tools(
.await
{
Ok
(
mcp_result
)
=>
{
tracing
::
debug!
(
debug!
(
tool_name
=
%
tool_call
.function.name
,
call_id
=
%
tool_call
.id
,
"Tool execution succeeded"
...
...
@@ -468,7 +796,7 @@ async fn execute_mcp_tools(
});
}
Err
(
e
)
=>
{
tracing
::
warn!
(
warn!
(
tool_name
=
%
tool_call
.function.name
,
call_id
=
%
tool_call
.id
,
error
=
%
e
,
...
...
@@ -656,7 +984,7 @@ struct ToolResult {
/// # Returns
///
/// Vector of ResponseTool entries in MCP format
fn
convert_mcp_tools_to_response_tools
(
mcp_tools
:
&
[
crate
::
mcp
::
Tool
])
->
Vec
<
ResponseTool
>
{
pub
fn
convert_mcp_tools_to_response_tools
(
mcp_tools
:
&
[
crate
::
mcp
::
Tool
])
->
Vec
<
ResponseTool
>
{
use
serde_json
::
Value
;
use
crate
::
protocols
::
responses
::
ResponseToolType
;
...
...
@@ -800,11 +1128,9 @@ async fn load_previous_messages(
.filter_map
(|
item
|
{
serde_json
::
from_value
::
<
ResponseInputOutputItem
>
(
item
.clone
())
.map_err
(|
e
|
{
tracing
::
warn!
(
warn!
(
"Failed to deserialize stored {} item: {}. Item: {}"
,
item_type
,
e
,
item
item_type
,
e
,
item
);
})
.ok
()
...
...
@@ -817,7 +1143,7 @@ async fn load_previous_messages(
history_items
.extend
(
deserialize_items
(
&
stored
.output
,
"output"
));
}
tracing
::
debug!
(
debug!
(
previous_response_id
=
%
prev_id_str
,
history_items_count
=
history_items
.len
(),
"Loaded conversation history from previous response"
...
...
@@ -851,29 +1177,3 @@ async fn load_previous_messages(
Ok
(
modified_request
)
}
// TODO: Implement streaming support
// /// Emit intermediate streaming chunks for analysis and partial text
// ///
// /// Emits SSE chunks for Responses API streaming:
// /// - Reasoning chunks for analysis channel
// /// - Message chunks for partial text from final channel
// ///
// /// # Arguments
// ///
// /// * `tx` - Streaming sender
// /// * `analysis` - Analysis channel content
// /// * `partial_text` - Final channel content
// /// * `iteration` - Current iteration number
// async fn emit_intermediate_chunks(
// tx: &tokio::sync::mpsc::UnboundedSender<Result<String, String>>,
// analysis: &Option<String>,
// partial_text: &str,
// iteration: usize,
// ) -> Result<(), Response> {
// // TODO: Implement streaming emission
// // - Emit reasoning chunks for analysis
// // - Emit message chunks for partial_text
// // - Follow OpenAI Responses streaming format (14 SSE event types)
// Ok(())
// }
sgl-router/src/routers/grpc/harmony/stages/response_processing.rs
View file @
700daa34
...
...
@@ -40,25 +40,26 @@ impl Default for HarmonyResponseProcessingStage {
#[async_trait]
impl
PipelineStage
for
HarmonyResponseProcessingStage
{
async
fn
execute
(
&
self
,
ctx
:
&
mut
RequestContext
)
->
Result
<
Option
<
Response
>
,
Response
>
{
// Get execution result (output tokens from model)
let
execution_result
=
ctx
.state
.response
.execution_result
.take
()
.ok_or_else
(||
utils
::
internal_error_static
(
"No execution result"
))
?
;
let
is_streaming
=
ctx
.is_streaming
();
let
dispatch
=
ctx
.state
.dispatch
.as_ref
()
.cloned
()
.ok_or_else
(||
utils
::
internal_error_static
(
"Dispatch metadata not set"
))
?
;
// Check request type to determine which processor method to call
match
&
ctx
.input.request_type
{
RequestType
::
Chat
(
_
)
=>
{
// Get execution result (output tokens from model)
let
execution_result
=
ctx
.state
.response
.execution_result
.take
()
.ok_or_else
(||
utils
::
internal_error_static
(
"No execution result"
))
?
;
let
dispatch
=
ctx
.state
.dispatch
.as_ref
()
.cloned
()
.ok_or_else
(||
utils
::
internal_error_static
(
"Dispatch metadata not set"
))
?
;
// For streaming, delegate to streaming processor and return SSE response
if
is_streaming
{
return
Ok
(
Some
(
...
...
@@ -83,14 +84,28 @@ impl PipelineStage for HarmonyResponseProcessingStage {
Ok
(
None
)
}
RequestType
::
Responses
(
_
)
=>
{
// For Responses API,
process iteration and store resul
t
//
Streaming not yet supported for Responses API
// For
streaming
Responses API,
leave execution_result in contex
t
//
for external streaming processor (serve_harmony_responses_stream)
if
is_streaming
{
return
Err
(
utils
::
internal_error_static
(
"Streaming not yet supported for Responses API"
,
));
// Don't take execution_result - let the caller handle it
return
Ok
(
None
);
}
// For non-streaming, process normally
let
execution_result
=
ctx
.state
.response
.execution_result
.take
()
.ok_or_else
(||
utils
::
internal_error_static
(
"No execution result"
))
?
;
let
dispatch
=
ctx
.state
.dispatch
.as_ref
()
.cloned
()
.ok_or_else
(||
utils
::
internal_error_static
(
"Dispatch metadata not set"
))
?
;
let
responses_request
=
ctx
.responses_request_arc
();
let
iteration_result
=
self
.processor
...
...
sgl-router/src/routers/grpc/harmony/streaming.rs
View file @
700daa34
...
...
@@ -16,20 +16,25 @@ use proto::{
use
serde_json
::
json
;
use
tokio
::
sync
::
mpsc
;
use
tokio_stream
::{
wrappers
::
UnboundedReceiverStream
,
StreamExt
};
use
tracing
::
error
;
use
tracing
::
{
debug
,
error
}
;
use
super
::{
types
::
HarmonyChannelDelta
,
HarmonyParserAdapter
};
use
super
::{
processor
::
ResponsesIterationResult
,
types
::
HarmonyChannelDelta
,
HarmonyParserAdapter
,
};
use
crate
::{
grpc_client
::{
proto
,
sglang_scheduler
::
AbortOnDropStream
},
protocols
::{
chat
::{
ChatCompletionRequest
,
ChatCompletionStreamResponse
,
ChatMessageDelta
,
ChatStreamChoice
,
},
common
::{
FunctionCallDelta
,
ToolCallDelta
,
Usage
},
common
::{
FunctionCallDelta
,
ToolCall
,
ToolCallDelta
,
Usage
},
responses
::{
ResponseStatus
,
ResponseUsage
,
ResponsesResponse
,
ResponsesUsage
},
},
routers
::
grpc
::{
context
,
responses
::
streaming
::{
OutputItemType
,
ResponseStreamEventEmitter
},
},
routers
::
grpc
::
context
,
};
/// Processor for streaming Harmony responses
///
/// Returns an SSE stream that parses Harmony tokens incrementally and
...
...
@@ -526,6 +531,425 @@ impl HarmonyStreamingProcessor {
Ok
(())
}
/// Common decode stream processing logic for both single and dual stream modes
///
/// This helper function contains the shared logic for processing the decode stream,
/// parsing Harmony tokens, emitting SSE events, and tracking state.
async
fn
process_decode_stream_common
(
mut
decode_stream
:
AbortOnDropStream
,
emitter
:
&
mut
ResponseStreamEventEmitter
,
tx
:
&
mpsc
::
UnboundedSender
<
Result
<
Bytes
,
io
::
Error
>>
,
)
->
Result
<
ResponsesIterationResult
,
String
>
{
// Initialize Harmony parser for this iteration
let
mut
parser
=
HarmonyParserAdapter
::
new
()
.map_err
(|
e
|
format!
(
"Failed to create parser: {}"
,
e
))
?
;
// State tracking for channels
let
mut
has_analysis
=
false
;
let
mut
accumulated_final_text
=
String
::
new
();
let
mut
accumulated_tool_calls
:
Option
<
Vec
<
ToolCall
>>
=
None
;
// Track which items we've started
let
mut
reasoning_output_index
:
Option
<
usize
>
=
None
;
let
mut
message_output_index
:
Option
<
usize
>
=
None
;
let
mut
message_item_id
:
Option
<
String
>
=
None
;
let
mut
has_emitted_content_part_added
=
false
;
// MCP tool call tracking (call_index -> (output_index, item_id))
let
mut
mcp_call_tracking
:
HashMap
<
usize
,
(
usize
,
String
)
>
=
HashMap
::
new
();
// Metadata from Complete message
let
mut
finish_reason
=
String
::
from
(
"stop"
);
let
mut
matched_stop
:
Option
<
serde_json
::
Value
>
=
None
;
// Process stream
let
mut
chunk_count
=
0
;
while
let
Some
(
result
)
=
decode_stream
.next
()
.await
{
chunk_count
+=
1
;
let
response
=
result
.map_err
(|
e
|
format!
(
"Decode stream error: {}"
,
e
))
?
;
match
response
.response
{
Some
(
Chunk
(
chunk
))
=>
{
// Parse chunk via Harmony parser
let
delta_result
=
parser
.parse_chunk
(
&
chunk
.token_ids
)
.map_err
(|
e
|
format!
(
"Parse error: {}"
,
e
))
?
;
// Emit SSE events if there's a delta
if
let
Some
(
delta
)
=
delta_result
{
// Analysis channel → Reasoning item (wrapper events only, emitted once)
if
let
Some
(
_
analysis_text
)
=
&
delta
.analysis_delta
{
if
reasoning_output_index
.is_none
()
{
// Allocate reasoning item and emit wrapper events
let
(
output_index
,
_
item_id
)
=
emitter
.allocate_output_index
(
OutputItemType
::
Reasoning
);
reasoning_output_index
=
Some
(
output_index
);
// Emit reasoning item (added + done in one call)
// Note: reasoning_content will be provided at finalize
emitter
.emit_reasoning_item
(
tx
,
None
)
.map_err
(|
e
|
format!
(
"Failed to emit reasoning item: {}"
,
e
))
?
;
has_analysis
=
true
;
}
}
// Final channel → Message item (WITH text streaming)
if
let
Some
(
final_delta
)
=
&
delta
.final_delta
{
if
!
final_delta
.is_empty
()
{
// Allocate message item if needed
if
message_output_index
.is_none
()
{
let
(
output_index
,
item_id
)
=
emitter
.allocate_output_index
(
OutputItemType
::
Message
);
message_output_index
=
Some
(
output_index
);
message_item_id
=
Some
(
item_id
.clone
());
// Build message item structure
let
item
=
json!
({
"id"
:
item_id
,
"type"
:
"message"
,
"role"
:
"assistant"
,
"content"
:
[]
});
// Emit output_item.added
let
event
=
emitter
.emit_output_item_added
(
output_index
,
&
item
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
}
let
output_index
=
message_output_index
.unwrap
();
let
item_id
=
message_item_id
.as_ref
()
.unwrap
();
let
content_index
=
0
;
// Single content part
// Emit content_part.added before first delta
if
!
has_emitted_content_part_added
{
let
event
=
emitter
.emit_content_part_added
(
output_index
,
item_id
,
content_index
,
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
has_emitted_content_part_added
=
true
;
}
// Emit text delta
let
event
=
emitter
.emit_text_delta
(
final_delta
,
output_index
,
item_id
,
content_index
,
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
accumulated_final_text
.push_str
(
final_delta
);
}
}
// Commentary channel → MCP tool call streaming
if
let
Some
(
tc_delta
)
=
&
delta
.commentary_delta
{
let
call_index
=
tc_delta
.index
;
// Check if this is a new tool call (has id and name)
if
tc_delta
.id
.is_some
()
{
// NEW MCP CALL: Allocate output item and emit in_progress
let
(
output_index
,
item_id
)
=
emitter
.allocate_output_index
(
OutputItemType
::
McpCall
);
// Store tracking info
mcp_call_tracking
.insert
(
call_index
,
(
output_index
,
item_id
.clone
()));
// Emit mcp_call.in_progress
let
event
=
emitter
.emit_mcp_call_in_progress
(
output_index
,
&
item_id
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
// If we have function name, emit initial mcp_call_arguments.delta
if
let
Some
(
func
)
=
&
tc_delta
.function
{
if
func
.name
.is_some
()
{
let
event
=
emitter
.emit_mcp_call_arguments_delta
(
output_index
,
&
item_id
,
""
,
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
}
}
}
else
{
// CONTINUING MCP CALL: Emit arguments delta
if
let
Some
((
output_index
,
item_id
))
=
mcp_call_tracking
.get
(
&
call_index
)
{
if
let
Some
(
args
)
=
tc_delta
.function
.as_ref
()
.and_then
(|
f
|
f
.arguments
.as_ref
())
.filter
(|
a
|
!
a
.is_empty
())
{
let
event
=
emitter
.emit_mcp_call_arguments_delta
(
*
output_index
,
item_id
,
args
,
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
}
}
}
}
}
}
Some
(
Complete
(
complete
))
=>
{
// Store final metadata
finish_reason
=
complete
.finish_reason
.clone
();
matched_stop
=
complete
.matched_stop
.as_ref
()
.map
(|
m
|
match
m
{
MatchedTokenId
(
id
)
=>
{
serde_json
::
json!
(
id
)
}
MatchedStopStr
(
s
)
=>
{
serde_json
::
json!
(
s
)
}
});
// Finalize parser and get complete output
let
final_output
=
parser
.finalize
(
finish_reason
.clone
(),
matched_stop
.clone
())
.map_err
(|
e
|
format!
(
"Finalize error: {}"
,
e
))
?
;
// Store finalized tool calls
accumulated_tool_calls
=
final_output
.commentary
.clone
();
// Complete all MCP tool calls if we have commentary
if
let
Some
(
ref
tool_calls
)
=
accumulated_tool_calls
{
for
(
call_idx
,
tool_call
)
in
tool_calls
.iter
()
.enumerate
()
{
if
let
Some
((
output_index
,
item_id
))
=
mcp_call_tracking
.get
(
&
call_idx
)
{
// Emit mcp_call_arguments.done with final arguments
let
args_str
=
tool_call
.function.arguments
.as_deref
()
.unwrap_or
(
""
);
let
event
=
emitter
.emit_mcp_call_arguments_done
(
*
output_index
,
item_id
,
args_str
,
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
// Emit mcp_call.completed
let
event
=
emitter
.emit_mcp_call_completed
(
*
output_index
,
item_id
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
// Mark output item as completed
emitter
.complete_output_item
(
*
output_index
);
}
}
}
// Close message item if we opened one
if
let
Some
(
output_index
)
=
message_output_index
{
let
item_id
=
message_item_id
.as_ref
()
.unwrap
();
let
content_index
=
0
;
// Emit text_done
let
event
=
emitter
.emit_text_done
(
output_index
,
item_id
,
content_index
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
// Emit content_part.done
let
event
=
emitter
.emit_content_part_done
(
output_index
,
item_id
,
content_index
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
// Emit output_item.done
let
item
=
json!
({
"id"
:
item_id
,
"type"
:
"message"
,
"role"
:
"assistant"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
accumulated_final_text
.clone
()
}]
});
let
event
=
emitter
.emit_output_item_done
(
output_index
,
&
item
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
emitter
.complete_output_item
(
output_index
);
}
}
Some
(
proto
::
generate_response
::
Response
::
Error
(
err
))
=>
{
return
Err
(
format!
(
"Server error: {}"
,
err
.message
));
}
None
=>
{}
}
}
debug!
(
"Stream loop ended. Total chunks received: {}, has_analysis: {}, tool_calls: {}, final_text_len: {}"
,
chunk_count
,
has_analysis
,
accumulated_tool_calls
.as_ref
()
.map
(|
tc
|
tc
.len
())
.unwrap_or
(
0
),
accumulated_final_text
.len
()
);
// Extract tool calls from completed messages or incomplete commentary
if
chunk_count
>
0
&&
accumulated_tool_calls
.is_none
()
{
let
messages
=
parser
.get_messages
();
// Try extracting from completed messages first
let
(
analysis_opt
,
commentary_opt
,
final_text_extracted
)
=
HarmonyParserAdapter
::
parse_messages
(
&
messages
);
accumulated_tool_calls
=
commentary_opt
.clone
();
// If no tool calls found, check for incomplete commentary in parser state
if
accumulated_tool_calls
.is_none
()
{
accumulated_tool_calls
=
parser
.extract_incomplete_commentary
();
}
debug!
(
"Tool call extraction: completed_msgs={}, tool_calls={}, has_analysis={}, final_text_len={}"
,
messages
.len
(),
accumulated_tool_calls
.as_ref
()
.map
(|
tc
|
tc
.len
())
.unwrap_or
(
0
),
analysis_opt
.is_some
(),
final_text_extracted
.len
()
);
// Complete any pending MCP tool calls with data from completed messages
if
let
Some
(
ref
tool_calls
)
=
accumulated_tool_calls
{
for
(
call_idx
,
tool_call
)
in
tool_calls
.iter
()
.enumerate
()
{
if
let
Some
((
output_index
,
item_id
))
=
mcp_call_tracking
.get
(
&
call_idx
)
{
// Emit mcp_call_arguments.done with final arguments
let
args_str
=
tool_call
.function.arguments
.as_deref
()
.unwrap_or
(
""
);
let
event
=
emitter
.emit_mcp_call_arguments_done
(
*
output_index
,
item_id
,
args_str
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
// Emit mcp_call.completed
let
event
=
emitter
.emit_mcp_call_completed
(
*
output_index
,
item_id
);
emitter
.send_event_best_effort
(
&
event
,
tx
);
}
}
}
}
// Mark stream as completed successfully to prevent abort on drop
decode_stream
.mark_completed
();
// Return result based on whether tool calls were found
if
let
Some
(
tool_calls
)
=
accumulated_tool_calls
{
if
!
tool_calls
.is_empty
()
{
let
analysis_content
=
if
has_analysis
{
// Get analysis from finalized parser output by calling finalize again
// This is safe because finalize can be called multiple times
let
output
=
parser
.finalize
(
finish_reason
.clone
(),
matched_stop
.clone
())
?
;
output
.analysis
}
else
{
None
};
return
Ok
(
ResponsesIterationResult
::
ToolCallsFound
{
tool_calls
,
analysis
:
analysis_content
,
partial_text
:
accumulated_final_text
,
});
}
}
// For streaming, we don't build the full ResponsesResponse here
// The caller will build it from the SSE events
// Return a placeholder Completed result (caller ignores these fields in streaming mode)
Ok
(
ResponsesIterationResult
::
Completed
{
response
:
Box
::
new
(
ResponsesResponse
{
id
:
String
::
new
(),
object
:
"response"
.to_string
(),
created_at
:
0
,
status
:
ResponseStatus
::
Completed
,
error
:
None
,
incomplete_details
:
None
,
instructions
:
None
,
max_output_tokens
:
None
,
model
:
String
::
new
(),
output
:
vec!
[],
parallel_tool_calls
:
true
,
previous_response_id
:
None
,
reasoning
:
None
,
store
:
true
,
temperature
:
None
,
text
:
None
,
tool_choice
:
"auto"
.to_string
(),
tools
:
vec!
[],
top_p
:
None
,
truncation
:
None
,
user
:
None
,
metadata
:
HashMap
::
new
(),
usage
:
Some
(
ResponsesUsage
::
Modern
(
ResponseUsage
{
input_tokens
:
0
,
output_tokens
:
0
,
total_tokens
:
0
,
input_tokens_details
:
None
,
output_tokens_details
:
None
,
})),
}),
usage
:
Usage
{
prompt_tokens
:
0
,
completion_tokens
:
0
,
total_tokens
:
0
,
completion_tokens_details
:
None
,
},
})
}
/// Process streaming chunks for Responses API iteration
///
/// Returns ResponsesIterationResult indicating whether tool calls were found
/// (requiring MCP loop continuation) or if the iteration is complete.
pub
async
fn
process_responses_iteration_stream
(
execution_result
:
context
::
ExecutionResult
,
emitter
:
&
mut
ResponseStreamEventEmitter
,
tx
:
&
mpsc
::
UnboundedSender
<
Result
<
Bytes
,
io
::
Error
>>
,
)
->
Result
<
ResponsesIterationResult
,
String
>
{
match
execution_result
{
context
::
ExecutionResult
::
Single
{
stream
}
=>
{
debug!
(
"Processing Responses API single stream mode"
);
Self
::
process_responses_single_stream
(
stream
,
emitter
,
tx
)
.await
}
context
::
ExecutionResult
::
Dual
{
prefill
,
decode
}
=>
{
debug!
(
"Processing Responses API dual stream mode"
);
Self
::
process_responses_dual_stream
(
prefill
,
*
decode
,
emitter
,
tx
)
.await
}
}
}
/// Process streaming chunks from a single stream (Responses API)
async
fn
process_responses_single_stream
(
grpc_stream
:
AbortOnDropStream
,
emitter
:
&
mut
ResponseStreamEventEmitter
,
tx
:
&
mpsc
::
UnboundedSender
<
Result
<
Bytes
,
io
::
Error
>>
,
)
->
Result
<
ResponsesIterationResult
,
String
>
{
// Delegate to common helper
Self
::
process_decode_stream_common
(
grpc_stream
,
emitter
,
tx
)
.await
}
/// Process streaming chunks from dual streams (Responses API)
async
fn
process_responses_dual_stream
(
mut
prefill_stream
:
AbortOnDropStream
,
decode_stream
:
AbortOnDropStream
,
emitter
:
&
mut
ResponseStreamEventEmitter
,
tx
:
&
mpsc
::
UnboundedSender
<
Result
<
Bytes
,
io
::
Error
>>
,
)
->
Result
<
ResponsesIterationResult
,
String
>
{
// Phase 1: Process prefill stream (collect metadata, no output)
while
let
Some
(
result
)
=
prefill_stream
.next
()
.await
{
let
_
response
=
result
.map_err
(|
e
|
format!
(
"Prefill stream error: {}"
,
e
))
?
;
// No-op for prefill in Responses API (just metadata collection)
}
// Phase 2: Process decode stream using common helper
let
result
=
Self
::
process_decode_stream_common
(
decode_stream
,
emitter
,
tx
)
.await
;
// Mark prefill stream as completed AFTER decode completes successfully
// This ensures that if client disconnects during decode, BOTH streams send abort
prefill_stream
.mark_completed
();
result
}
/// Build SSE response from receiver
fn
build_sse_response
(
rx
:
mpsc
::
UnboundedReceiver
<
Result
<
Bytes
,
io
::
Error
>>
)
->
Response
{
let
stream
=
UnboundedReceiverStream
::
new
(
rx
);
...
...
sgl-router/src/routers/grpc/pipeline.rs
View file @
700daa34
...
...
@@ -454,4 +454,53 @@ impl RequestPipeline {
utils
::
internal_error_static
(
"No ResponsesIterationResult produced by pipeline"
)
})
}
/// Execute Harmony Responses pipeline iteration with streaming support
///
/// This version executes the pipeline up to the dispatch stage and returns
/// the raw ExecutionResult (with stream) for token-level streaming processing.
pub
async
fn
execute_harmony_responses_streaming
(
&
self
,
request
:
&
crate
::
protocols
::
responses
::
ResponsesRequest
,
harmony_ctx
:
&
harmony
::
responses
::
HarmonyResponsesContext
,
)
->
Result
<
ExecutionResult
,
Response
>
{
// Create RequestContext for this Responses request
let
mut
ctx
=
RequestContext
::
for_responses
(
Arc
::
new
(
request
.clone
()),
None
,
None
,
harmony_ctx
.components
.clone
(),
);
// Execute pipeline stages up to dispatch (which creates the stream)
for
(
idx
,
stage
)
in
self
.stages
.iter
()
.enumerate
()
{
match
stage
.execute
(
&
mut
ctx
)
.await
{
Ok
(
Some
(
response
))
=>
{
error!
(
"Stage {} ({}) returned unexpected response during streaming Responses"
,
idx
+
1
,
stage
.name
()
);
return
Err
(
response
);
}
Ok
(
None
)
=>
continue
,
Err
(
response
)
=>
{
error!
(
"Stage {} ({}) failed with status {}"
,
idx
+
1
,
stage
.name
(),
response
.status
()
);
return
Err
(
response
);
}
}
}
// Extract execution_result (the raw stream from workers)
ctx
.state
.response
.execution_result
.take
()
.ok_or_else
(||
utils
::
internal_error_static
(
"No ExecutionResult produced by pipeline"
))
}
}
sgl-router/src/routers/grpc/responses/streaming.rs
View file @
700daa34
...
...
@@ -9,7 +9,7 @@ use uuid::Uuid;
use
crate
::
protocols
::
chat
::
ChatCompletionStreamResponse
;
pub
(
super
)
enum
OutputItemType
{
pub
enum
OutputItemType
{
Message
,
McpListTools
,
McpCall
,
...
...
@@ -53,9 +53,9 @@ struct OutputItemState {
/// - response.mcp_call_arguments.done
/// - response.mcp_call.completed
/// - response.mcp_call.failed
pub
(
super
)
struct
ResponseStreamEventEmitter
{
pub
struct
ResponseStreamEventEmitter
{
sequence_number
:
u64
,
response_id
:
String
,
pub
response_id
:
String
,
model
:
String
,
created_at
:
u64
,
message_id
:
String
,
...
...
@@ -74,7 +74,7 @@ pub(super) struct ResponseStreamEventEmitter {
}
impl
ResponseStreamEventEmitter
{
pub
(
super
)
fn
new
(
response_id
:
String
,
model
:
String
,
created_at
:
u64
)
->
Self
{
pub
fn
new
(
response_id
:
String
,
model
:
String
,
created_at
:
u64
)
->
Self
{
let
message_id
=
format!
(
"msg_{}"
,
Uuid
::
new_v4
());
Self
{
...
...
@@ -102,7 +102,7 @@ impl ResponseStreamEventEmitter {
seq
}
pub
(
super
)
fn
emit_created
(
&
mut
self
)
->
serde_json
::
Value
{
pub
fn
emit_created
(
&
mut
self
)
->
serde_json
::
Value
{
self
.has_emitted_created
=
true
;
json!
({
"type"
:
"response.created"
,
...
...
@@ -118,7 +118,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_in_progress
(
&
mut
self
)
->
serde_json
::
Value
{
pub
fn
emit_in_progress
(
&
mut
self
)
->
serde_json
::
Value
{
self
.has_emitted_in_progress
=
true
;
json!
({
"type"
:
"response.in_progress"
,
...
...
@@ -131,7 +131,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_content_part_added
(
pub
fn
emit_content_part_added
(
&
mut
self
,
output_index
:
usize
,
item_id
:
&
str
,
...
...
@@ -151,7 +151,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_text_delta
(
pub
fn
emit_text_delta
(
&
mut
self
,
delta
:
&
str
,
output_index
:
usize
,
...
...
@@ -169,7 +169,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_text_done
(
pub
fn
emit_text_done
(
&
mut
self
,
output_index
:
usize
,
item_id
:
&
str
,
...
...
@@ -185,7 +185,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_content_part_done
(
pub
fn
emit_content_part_done
(
&
mut
self
,
output_index
:
usize
,
item_id
:
&
str
,
...
...
@@ -204,10 +204,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_completed
(
&
mut
self
,
usage
:
Option
<&
serde_json
::
Value
>
,
)
->
serde_json
::
Value
{
pub
fn
emit_completed
(
&
mut
self
,
usage
:
Option
<&
serde_json
::
Value
>
)
->
serde_json
::
Value
{
let
mut
response
=
json!
({
"type"
:
"response.completed"
,
"sequence_number"
:
self
.next_sequence
(),
...
...
@@ -240,10 +237,7 @@ impl ResponseStreamEventEmitter {
// MCP Event Emission Methods
// ========================================================================
pub
(
super
)
fn
emit_mcp_list_tools_in_progress
(
&
mut
self
,
output_index
:
usize
,
)
->
serde_json
::
Value
{
pub
fn
emit_mcp_list_tools_in_progress
(
&
mut
self
,
output_index
:
usize
)
->
serde_json
::
Value
{
json!
({
"type"
:
"response.mcp_list_tools.in_progress"
,
"sequence_number"
:
self
.next_sequence
(),
...
...
@@ -251,7 +245,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_mcp_list_tools_completed
(
pub
fn
emit_mcp_list_tools_completed
(
&
mut
self
,
output_index
:
usize
,
tools
:
&
[
crate
::
mcp
::
Tool
],
...
...
@@ -275,7 +269,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_mcp_call_in_progress
(
pub
fn
emit_mcp_call_in_progress
(
&
mut
self
,
output_index
:
usize
,
item_id
:
&
str
,
...
...
@@ -288,7 +282,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_mcp_call_arguments_delta
(
pub
fn
emit_mcp_call_arguments_delta
(
&
mut
self
,
output_index
:
usize
,
item_id
:
&
str
,
...
...
@@ -309,7 +303,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_mcp_call_arguments_done
(
pub
fn
emit_mcp_call_arguments_done
(
&
mut
self
,
output_index
:
usize
,
item_id
:
&
str
,
...
...
@@ -324,7 +318,7 @@ impl ResponseStreamEventEmitter {
})
}
pub
(
super
)
fn
emit_mcp_call_completed
(
pub
fn
emit_mcp_call_completed
(
&
mut
self
,
output_index
:
usize
,
item_id
:
&
str
,
...
...
@@ -357,7 +351,7 @@ impl ResponseStreamEventEmitter {
// ========================================================================
/// Emit response.output_item.added event
pub
(
super
)
fn
emit_output_item_added
(
pub
fn
emit_output_item_added
(
&
mut
self
,
output_index
:
usize
,
item
:
&
serde_json
::
Value
,
...
...
@@ -371,7 +365,7 @@ impl ResponseStreamEventEmitter {
}
/// Emit response.output_item.done event
pub
(
super
)
fn
emit_output_item_done
(
pub
fn
emit_output_item_done
(
&
mut
self
,
output_index
:
usize
,
item
:
&
serde_json
::
Value
,
...
...
@@ -390,7 +384,7 @@ impl ResponseStreamEventEmitter {
}
/// Allocate next output index and track item
pub
(
super
)
fn
allocate_output_index
(
&
mut
self
,
item_type
:
OutputItemType
)
->
(
usize
,
String
)
{
pub
fn
allocate_output_index
(
&
mut
self
,
item_type
:
OutputItemType
)
->
(
usize
,
String
)
{
let
index
=
self
.next_output_index
;
self
.next_output_index
+=
1
;
...
...
@@ -412,7 +406,7 @@ impl ResponseStreamEventEmitter {
}
/// Mark output item as completed
pub
(
super
)
fn
complete_output_item
(
&
mut
self
,
output_index
:
usize
)
{
pub
fn
complete_output_item
(
&
mut
self
,
output_index
:
usize
)
{
if
let
Some
(
item
)
=
self
.output_items
.iter_mut
()
...
...
@@ -426,7 +420,7 @@ impl ResponseStreamEventEmitter {
///
/// Reasoning items in OpenAI format are simple placeholders emitted between tool iterations.
/// They don't have streaming content - just wrapper events with empty/null content.
pub
(
super
)
fn
emit_reasoning_item
(
pub
fn
emit_reasoning_item
(
&
mut
self
,
tx
:
&
mpsc
::
UnboundedSender
<
Result
<
Bytes
,
std
::
io
::
Error
>>
,
reasoning_content
:
Option
<
String
>
,
...
...
@@ -550,7 +544,7 @@ impl ResponseStreamEventEmitter {
Ok
(())
}
pub
(
super
)
fn
send_event
(
pub
fn
send_event
(
&
self
,
event
:
&
serde_json
::
Value
,
tx
:
&
mpsc
::
UnboundedSender
<
Result
<
Bytes
,
std
::
io
::
Error
>>
,
...
...
@@ -558,13 +552,38 @@ impl ResponseStreamEventEmitter {
let
event_json
=
serde_json
::
to_string
(
event
)
.map_err
(|
e
|
format!
(
"Failed to serialize event: {}"
,
e
))
?
;
if
tx
.send
(
Ok
(
Bytes
::
from
(
format!
(
"data: {}
\n\n
"
,
event_json
))))
.is_err
()
{
// Extract event type from the JSON for SSE event field
let
event_type
=
event
.get
(
"type"
)
.and_then
(|
v
|
v
.as_str
())
.unwrap_or
(
"message"
);
// Format as SSE with event: field
let
sse_message
=
format!
(
"event: {}
\n
data: {}
\n\n
"
,
event_type
,
event_json
);
if
tx
.send
(
Ok
(
Bytes
::
from
(
sse_message
)))
.is_err
()
{
return
Err
(
"Client disconnected"
.to_string
());
}
Ok
(())
}
/// Send event and log any errors (typically client disconnect)
///
/// This is a convenience method for streaming scenarios where client
/// disconnection is expected and should be logged but not fail the operation.
/// Returns true if sent successfully, false if client disconnected.
pub
fn
send_event_best_effort
(
&
self
,
event
:
&
serde_json
::
Value
,
tx
:
&
mpsc
::
UnboundedSender
<
Result
<
Bytes
,
std
::
io
::
Error
>>
,
)
->
bool
{
match
self
.send_event
(
event
,
tx
)
{
Ok
(())
=>
true
,
Err
(
e
)
=>
{
tracing
::
debug!
(
"Failed to send event (likely client disconnect): {}"
,
e
);
false
}
}
}
}
sgl-router/src/routers/grpc/router.rs
View file @
700daa34
...
...
@@ -13,7 +13,10 @@ use tracing::debug;
use
super
::{
context
::
SharedComponents
,
harmony
::{
serve_harmony_responses
,
HarmonyDetector
,
HarmonyResponsesContext
},
harmony
::{
serve_harmony_responses
,
serve_harmony_responses_stream
,
HarmonyDetector
,
HarmonyResponsesContext
,
},
pipeline
::
RequestPipeline
,
responses
,
};
...
...
@@ -192,8 +195,8 @@ impl GrpcRouter {
model_id
:
Option
<&
str
>
,
)
->
Response
{
debug!
(
"Processing Harmony responses request for model: {:?}"
,
model_id
"Processing Harmony responses request for model:
{:?}, streaming:
{:?}"
,
model_id
,
body
.stream
);
// Create HarmonyResponsesContext from existing responses context
...
...
@@ -204,10 +207,15 @@ impl GrpcRouter {
self
.harmony_responses_context.response_storage
.clone
(),
);
// Use serve_harmony_responses for multi-turn MCP tool orchestration
match
serve_harmony_responses
(
&
harmony_ctx
,
body
.clone
())
.await
{
Ok
(
response
)
=>
axum
::
Json
(
response
)
.into_response
(),
Err
(
error_response
)
=>
error_response
,
// Check if streaming is requested
if
body
.stream
.unwrap_or
(
false
)
{
serve_harmony_responses_stream
(
&
harmony_ctx
,
body
.clone
())
.await
}
else
{
// Use non-streaming version for standard JSON responses
match
serve_harmony_responses
(
&
harmony_ctx
,
body
.clone
())
.await
{
Ok
(
response
)
=>
axum
::
Json
(
response
)
.into_response
(),
Err
(
error_response
)
=>
error_response
,
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment