"components/vscode:/vscode.git/clone" did not exist on "24523a1c297f33ded512127c990b0b7bf2251bf2"
Unverified commit 5178a4a4, authored by MatejKosec, committed by GitHub
Browse files

feat: streaming tool call and reasoning dispatch SSE events (#7114)


Signed-off-by: Matej Kosec <mkosec@nvidia.com>
parent d16862ad
......@@ -73,6 +73,8 @@ class FrontendConfig(KvRouterConfigBase):
enable_anthropic_api: bool
strip_anthropic_preamble: bool
debug_perf: bool
enable_streaming_tool_dispatch: bool
enable_streaming_reasoning_dispatch: bool
preprocess_workers: int
def validate(self) -> None:
......@@ -355,6 +357,30 @@ class FrontendArgGroup(ArgGroup):
"from the system prompt. Saves tokens and improves prompt caching."
),
)
add_negatable_bool_argument(
g,
flag_name="--enable-streaming-tool-dispatch",
env_var="DYN_ENABLE_STREAMING_TOOL_DISPATCH",
default=False,
help=(
"[EXPERIMENTAL] Enable streaming tool call dispatch. Emits "
"'event: tool_call_dispatch' SSE events on /v1/chat/completions "
"for each complete tool call before finish_reason arrives. "
"Can be combined with --enable-streaming-reasoning-dispatch."
),
)
add_negatable_bool_argument(
g,
flag_name="--enable-streaming-reasoning-dispatch",
env_var="DYN_ENABLE_STREAMING_REASONING_DISPATCH",
default=False,
help=(
"[EXPERIMENTAL] Enable streaming reasoning dispatch. Emits a "
"single 'event: reasoning_dispatch' SSE event on /v1/chat/completions "
"with the complete reasoning block once thinking ends. "
"Can be combined with --enable-streaming-tool-dispatch."
),
)
add_argument(
g,
flag_name="--dyn-chat-processor",
......
......@@ -264,6 +264,16 @@ async def async_main():
else:
os.environ.pop("DYN_STRIP_ANTHROPIC_PREAMBLE", None)
if config.enable_streaming_tool_dispatch:
os.environ["DYN_ENABLE_STREAMING_TOOL_DISPATCH"] = "1"
else:
os.environ.pop("DYN_ENABLE_STREAMING_TOOL_DISPATCH", None)
if config.enable_streaming_reasoning_dispatch:
os.environ["DYN_ENABLE_STREAMING_REASONING_DISPATCH"] = "1"
else:
os.environ.pop("DYN_ENABLE_STREAMING_REASONING_DISPATCH", None)
if config.chat_processor == "vllm":
assert (
vllm_flags is not None
......
This diff is collapsed.
......@@ -154,6 +154,26 @@ impl State {
pub fn sse_keep_alive(&self) -> Option<Duration> {
None
}
/// Reports whether streaming tool call dispatch is switched on.
///
/// The answer comes from passing [`env_llm::DYN_ENABLE_STREAMING_TOOL_DISPATCH`]
/// to `env_is_truthy`; nothing on `self` is consulted.
///
/// With the feature on, the chat completions streaming path emits an
/// `event: tool_call_dispatch` SSE event for each complete tool call, so clients can
/// begin processing tool calls before `finish_reason="tool_calls"` arrives.
pub fn streaming_tool_dispatch_enabled(&self) -> bool {
    let flag_name = env_llm::DYN_ENABLE_STREAMING_TOOL_DISPATCH;
    env_is_truthy(flag_name)
}
/// Reports whether streaming reasoning dispatch is switched on.
///
/// The answer comes from passing [`env_llm::DYN_ENABLE_STREAMING_REASONING_DISPATCH`]
/// to `env_is_truthy`; nothing on `self` is consulted.
///
/// With the feature on, the chat completions streaming path accumulates reasoning
/// tokens and emits one `event: reasoning_dispatch` SSE event carrying the complete
/// reasoning block once thinking ends (DeepSeek-R1, Qwen3, etc.).
pub fn streaming_reasoning_dispatch_enabled(&self) -> bool {
    let flag_name = env_llm::DYN_ENABLE_STREAMING_REASONING_DISPATCH;
    env_is_truthy(flag_name)
}
}
#[derive(Clone)]
......
......@@ -50,6 +50,9 @@ struct ToolCallState {
accumulated_args: String,
block_index: u32,
started: bool,
/// Set when `content_block_stop` has already been emitted inline
/// (complete tool call detected mid-stream). Prevents duplicate stop in `emit_end_events()`.
stopped: bool,
}
impl AnthropicStreamConverter {
......@@ -261,6 +264,7 @@ impl AnthropicStreamConverter {
accumulated_args: String::new(),
block_index,
started: false,
stopped: false,
});
}
......@@ -313,6 +317,20 @@ impl AnthropicStreamConverter {
},
};
events.push(make_sse_event("content_block_delta", &block_delta));
// Emit content_block_stop immediately if the tool call arrived
// complete in a single chunk (id + name + args all present).
// Dynamo backends emit complete tool calls, so this fires on the
// same chunk — no need to wait for finish_reason.
if tc.id.is_some()
&& func.name.is_some()
&& !self.tool_call_states[tc_index].stopped
{
self.tool_call_states[tc_index].stopped = true;
let block_stop =
AnthropicStreamEvent::ContentBlockStop { index: block_index };
events.push(make_sse_event("content_block_stop", &block_stop));
}
}
}
}
......@@ -350,9 +368,9 @@ impl AnthropicStreamConverter {
events.push(make_sse_event("content_block_stop", &block_stop));
}
// Close tool call blocks
// Close tool call blocks (skip any already stopped inline)
for tc in &self.tool_call_states {
if tc.started {
if tc.started && !tc.stopped {
let block_stop = AnthropicStreamEvent::ContentBlockStop {
index: tc.block_index,
};
......@@ -569,6 +587,7 @@ impl AnthropicStreamConverter {
accumulated_args: String::new(),
block_index,
started: false,
stopped: false,
});
}
if let Some(id) = &tc.id {
......@@ -611,6 +630,20 @@ impl AnthropicStreamConverter {
},
};
events.push(make_tagged_event("content_block_delta", &ev));
// Emit content_block_stop immediately if the tool call arrived
// complete in a single chunk (id + name + args all present).
// Dynamo backends emit complete tool calls, so this fires on the
// same chunk — no need to wait for finish_reason.
if tc.id.is_some()
&& func.name.is_some()
&& !self.tool_call_states[tc_index].stopped
{
self.tool_call_states[tc_index].stopped = true;
let ev =
AnthropicStreamEvent::ContentBlockStop { index: block_index };
events.push(make_tagged_event("content_block_stop", &ev));
}
}
}
}
......@@ -647,8 +680,9 @@ impl AnthropicStreamConverter {
events.push(make_tagged_event("content_block_stop", &ev));
}
// Skip already-stopped tool call blocks
for tc in &self.tool_call_states {
if tc.started {
if tc.started && !tc.stopped {
let ev = AnthropicStreamEvent::ContentBlockStop {
index: tc.block_index,
};
......@@ -788,9 +822,10 @@ mod tests {
vec![
"content_block_stop",
"content_block_start",
"content_block_delta"
"content_block_delta",
"content_block_stop",
],
"text block must be closed before tool block starts"
"text block must be closed before tool block starts; complete tool call stopped inline"
);
// Verify indices: stop=0 (text), start=1 (tool)
......@@ -814,17 +849,13 @@ mod tests {
other => panic!("expected ContentBlockStart, got {other:?}"),
}
// End events should NOT duplicate the text block stop
// End events should NOT duplicate either stop (both already emitted inline)
let end_events = conv.emit_end_events_tagged();
assert_eq!(
event_types(&end_events),
vec!["content_block_stop", "message_delta", "message_stop"],
"only tool block stop in end events (text already closed)"
vec!["message_delta", "message_stop"],
"no block stops in end events (both text and tool already closed inline)"
);
match &end_events[0].data {
AnthropicStreamEvent::ContentBlockStop { index } => assert_eq!(*index, 1),
other => panic!("expected tool stop at index 1, got {other:?}"),
}
}
/// Tool-only response (no preceding text): no spurious stop events.
......@@ -840,13 +871,19 @@ mod tests {
));
assert_eq!(
event_types(&tool_events),
vec!["content_block_start", "content_block_delta"]
vec![
"content_block_start",
"content_block_delta",
"content_block_stop"
],
"complete tool call emits stop inline"
);
let end_events = conv.emit_end_events_tagged();
assert_eq!(
event_types(&end_events),
vec!["content_block_stop", "message_delta", "message_stop"]
vec!["message_delta", "message_stop"],
"no block stop in end events (already stopped inline)"
);
}
......@@ -937,7 +974,9 @@ mod tests {
AnthropicStreamEvent::ContentBlockStart { index: 1, .. }
));
// 3. Tool call → text block closes, tool block opens at index 2
// 3. Tool call → text block closes, tool block opens at index 2.
// Because the tool call arrives complete (id + name + args in one
// chunk), inline dispatch also emits content_block_stop immediately.
let ev = conv.process_chunk_tagged(&tool_call_chunk(
0,
Some("call-1"),
......@@ -949,7 +988,8 @@ mod tests {
vec![
"content_block_stop",
"content_block_start",
"content_block_delta"
"content_block_delta",
"content_block_stop"
]
);
assert!(matches!(
......@@ -979,4 +1019,50 @@ mod tests {
]
);
}
/// Two complete tool calls in a row: each is closed by its own inline
/// `content_block_stop`, so nothing is left to close when the stream ends.
#[test]
fn test_multiple_tool_calls_each_stopped_inline() {
    let mut conv = AnthropicStreamConverter::new("test-model".into());

    // (tool call index, call id, tool name, JSON arguments, assertion message)
    let cases = [
        (
            0,
            "call-1",
            "Read",
            "{\"path\":\"/tmp/a.txt\"}",
            "first tool call closed inline",
        ),
        (
            1,
            "call-2",
            "Write",
            "{\"path\":\"/tmp/b.txt\"}",
            "second tool call closed inline",
        ),
    ];

    for &(tc_index, call_id, tool_name, raw_args, why) in cases.iter() {
        let chunk = tool_call_chunk(tc_index, Some(call_id), Some(tool_name), Some(raw_args));
        let emitted = conv.process_chunk_tagged(&chunk);
        assert_eq!(
            event_types(&emitted),
            vec![
                "content_block_start",
                "content_block_delta",
                "content_block_stop"
            ],
            "{}",
            why
        );
    }

    // Both blocks were already closed inline, so only the message-level
    // events remain at end of stream.
    let closing = conv.emit_end_events_tagged();
    assert_eq!(
        event_types(&closing),
        vec!["message_delta", "message_stop"],
        "no block stops in end events"
    );
}
}
......@@ -56,6 +56,9 @@ struct FunctionCallState {
accumulated_args: String,
output_index: u32,
started: bool,
/// Set when done/item_done events have already been emitted inline
/// (complete tool call detected mid-stream). Prevents duplicate in `emit_end_events()`.
done: bool,
}
impl ResponseStreamConverter {
......@@ -284,6 +287,7 @@ impl ResponseStreamConverter {
accumulated_args: String::new(),
output_index,
started: false,
done: false,
});
}
......@@ -323,19 +327,67 @@ impl ResponseStreamConverter {
self.function_call_items[tc_index]
.accumulated_args
.push_str(args);
let item_id = self.function_call_items[tc_index].item_id.clone();
let output_index = self.function_call_items[tc_index].output_index;
let is_complete = tc.id.is_some()
&& func.name.is_some()
&& !self.function_call_items[tc_index].done;
// Clone item_id once; reused by both args_delta and (if complete) done events.
let item_id = self.function_call_items[tc_index].item_id.clone();
let seq = self.next_seq();
let args_delta =
ResponseStreamEvent::ResponseFunctionCallArgumentsDelta(
ResponseFunctionCallArgumentsDeltaEvent {
sequence_number: seq,
item_id,
item_id: item_id.clone(),
output_index,
delta: args.clone(),
},
);
events.push(make_sse_event(&args_delta));
// Emit done + output_item.done immediately if the tool call
// arrived complete in a single chunk (id + name + args all present).
// Dynamo backends emit complete tool calls, so this fires on the
// same chunk — no need to wait for finish_reason.
if is_complete {
self.function_call_items[tc_index].done = true;
// Reuse item_id from above; capture remaining values before self.next_seq()
let fc_item_id = item_id;
let fc_call_id = self.function_call_items[tc_index].call_id.clone();
let fc_name = self.function_call_items[tc_index].name.clone();
let fc_args =
self.function_call_items[tc_index].accumulated_args.clone();
let fc_output_index =
self.function_call_items[tc_index].output_index;
let args_done =
ResponseStreamEvent::ResponseFunctionCallArgumentsDone(
ResponseFunctionCallArgumentsDoneEvent {
sequence_number: self.next_seq(),
item_id: fc_item_id.clone(),
output_index: fc_output_index,
arguments: fc_args.clone(),
name: Some(fc_name.clone()),
},
);
events.push(make_sse_event(&args_done));
let item_done = ResponseStreamEvent::ResponseOutputItemDone(
ResponseOutputItemDoneEvent {
sequence_number: self.next_seq(),
output_index: fc_output_index,
item: OutputItem::FunctionCall(FunctionToolCall {
id: Some(fc_item_id),
call_id: fc_call_id,
name: fc_name,
arguments: fc_args,
status: Some(OutputStatus::Completed),
}),
},
);
events.push(make_sse_event(&item_done));
}
}
}
}
......@@ -393,11 +445,11 @@ impl ResponseStreamConverter {
events.push(make_sse_event(&item_done));
}
// Close any function call items - collect data first to avoid borrow conflicts
// Close any function call items not already done inline
let fc_data: Vec<_> = self
.function_call_items
.iter()
.filter(|fc| fc.started)
.filter(|fc| fc.started && !fc.done)
.map(|fc| {
(
fc.item_id.clone(),
......@@ -598,3 +650,262 @@ fn get_event_type(event: &ResponseStreamEvent) -> &'static str {
ResponseStreamEvent::ResponseError(_) => "error",
}
}
#[cfg(test)]
mod tests {
use super::*;
use dynamo_async_openai::types::{
ChatChoiceStream, ChatCompletionMessageContent, ChatCompletionMessageToolCallChunk,
ChatCompletionStreamResponseDelta, ChatCompletionToolType, FunctionCallStream,
};
/// Builds a `ResponseParams` fixture with every field set to `None`.
fn default_params() -> ResponseParams {
    ResponseParams {
        // Model and sampling
        model: None,
        temperature: None,
        top_p: None,
        max_output_tokens: None,
        truncation: None,
        // Tooling
        tools: None,
        tool_choice: None,
        // Prompting and output shape
        instructions: None,
        reasoning: None,
        text: None,
        include: None,
        // Service options
        store: None,
        service_tier: None,
    }
}
/// Builds a single-choice streaming chunk whose delta carries exactly one tool call.
///
/// `tc_index` becomes the tool call's `index`; `id`, `name` and `args` are copied
/// into the tool call as owned strings when present. All remaining fields are
/// `None` or fixed placeholder values.
fn tool_call_chunk(
    tc_index: u32,
    id: Option<&str>,
    name: Option<&str>,
    args: Option<&str>,
) -> NvCreateChatCompletionStreamResponse {
    // One lint allowance covers every struct literal assembled below.
    #[allow(deprecated)]
    {
        let function = FunctionCallStream {
            name: name.map(str::to_owned),
            arguments: args.map(str::to_owned),
        };
        let call = ChatCompletionMessageToolCallChunk {
            index: tc_index,
            id: id.map(str::to_owned),
            r#type: Some(ChatCompletionToolType::Function),
            function: Some(function),
        };
        let delta = ChatCompletionStreamResponseDelta {
            content: None,
            function_call: None,
            tool_calls: Some(vec![call]),
            role: None,
            refusal: None,
            reasoning_content: None,
        };
        let choice = ChatChoiceStream {
            index: 0,
            delta,
            finish_reason: None,
            stop_reason: None,
            logprobs: None,
        };
        NvCreateChatCompletionStreamResponse {
            id: "chat-1".into(),
            choices: vec![choice],
            created: 0,
            model: "test".into(),
            service_tier: None,
            system_fingerprint: None,
            object: "chat.completion.chunk".into(),
            usage: None,
            nvext: None,
        }
    }
}
/// Builds a single-choice streaming chunk whose delta carries only text content.
///
/// `text` is wrapped in `ChatCompletionMessageContent::Text`; all remaining fields
/// are `None` or fixed placeholder values.
fn text_chunk(text: &str) -> NvCreateChatCompletionStreamResponse {
    // One lint allowance covers every struct literal assembled below.
    #[allow(deprecated)]
    {
        let delta = ChatCompletionStreamResponseDelta {
            content: Some(ChatCompletionMessageContent::Text(text.into())),
            function_call: None,
            tool_calls: None,
            role: None,
            refusal: None,
            reasoning_content: None,
        };
        let choice = ChatChoiceStream {
            index: 0,
            delta,
            finish_reason: None,
            stop_reason: None,
            logprobs: None,
        };
        NvCreateChatCompletionStreamResponse {
            id: "chat-1".into(),
            choices: vec![choice],
            created: 0,
            model: "test".into(),
            service_tier: None,
            system_fingerprint: None,
            object: "chat.completion.chunk".into(),
            usage: None,
            nvext: None,
        }
    }
}
/// Pulls the SSE event type out of a `Result<Event, _>` by scanning its `Debug` text.
///
/// Returns the text between the first `event: ` marker and the next escaped
/// newline (the two characters `\` and `n`, as they appear in the `Debug`
/// rendering). Returns `"unknown"` when either marker is missing.
///
/// Panics if `event` is an `Err`.
fn event_type(event: &Result<Event, anyhow::Error>) -> String {
    let rendered = format!("{:?}", event.as_ref().unwrap());
    rendered
        .split_once("event: ")
        .and_then(|(_, tail)| tail.split_once("\\n"))
        .map(|(kind, _)| kind.to_string())
        .unwrap_or_else(|| "unknown".to_string())
}
fn event_types(events: &[Result<Event, anyhow::Error>]) -> Vec<String> {
events.iter().map(event_type).collect()
}
/// A tool call that arrives whole in one chunk yields `output_item.added`, the
/// arguments delta, `function_call_arguments.done` and `output_item.done` from
/// `process_chunk` itself; `emit_end_events` must not repeat the done events.
#[test]
fn test_complete_tool_call_emits_done_inline() {
    let mut conv = ResponseStreamConverter::new("test-model".into(), default_params());
    // Start events are not under test here; drain and discard them.
    let _ = conv.emit_start_events();

    let chunk = tool_call_chunk(
        0,
        Some("call-1"),
        Some("get_weather"),
        Some("{\"city\":\"SF\"}"),
    );
    let events = conv.process_chunk(&chunk);
    let types = event_types(&events);
    let emitted = |wanted: &str| types.iter().any(|t| t.as_str() == wanted);

    assert!(
        emitted("response.output_item.added"),
        "should emit output_item.added: {types:?}"
    );
    assert!(
        emitted("response.function_call_arguments.delta"),
        "should emit args delta: {types:?}"
    );
    assert!(
        emitted("response.function_call_arguments.done"),
        "should emit args done inline: {types:?}"
    );
    assert!(
        emitted("response.output_item.done"),
        "should emit output_item.done inline: {types:?}"
    );

    // Closing the stream must not replay what was already sent inline.
    let end_types = event_types(&conv.emit_end_events());
    let in_end = |wanted: &str| end_types.iter().any(|t| t.as_str() == wanted);

    assert!(
        !in_end("response.function_call_arguments.done"),
        "done should not be duplicated in end events: {end_types:?}"
    );
    assert!(
        !in_end("response.output_item.done"),
        "output_item.done for the tool should not appear in end events"
    );
}
/// Two complete tool calls in separate chunks: each chunk produces its own
/// `function_call_arguments.done`, and none is left for the end events.
#[test]
fn test_multiple_tool_calls_each_emit_done_inline() {
    const ARGS_DONE: &str = "response.function_call_arguments.done";

    let mut conv = ResponseStreamConverter::new("test-model".into(), default_params());
    let _ = conv.emit_start_events();

    let first_chunk = tool_call_chunk(
        0,
        Some("call-1"),
        Some("get_weather"),
        Some("{\"city\":\"SF\"}"),
    );
    let types1 = event_types(&conv.process_chunk(&first_chunk));
    assert!(
        types1.iter().any(|t| t.as_str() == ARGS_DONE),
        "first tool call done inline: {types1:?}"
    );

    let second_chunk = tool_call_chunk(
        1,
        Some("call-2"),
        Some("get_time"),
        Some("{\"tz\":\"PST\"}"),
    );
    let types2 = event_types(&conv.process_chunk(&second_chunk));
    assert!(
        types2.iter().any(|t| t.as_str() == ARGS_DONE),
        "second tool call done inline: {types2:?}"
    );

    // Nothing function-call related may be replayed when the stream closes.
    let end_types = event_types(&conv.emit_end_events());
    let mut fc_done_count = 0;
    for kind in &end_types {
        if kind.as_str() == ARGS_DONE {
            fc_done_count += 1;
        }
    }
    assert_eq!(
        fc_done_count, 0,
        "no function_call_arguments.done in end events: {end_types:?}"
    );
}
/// A text-only stream produces no `function_call_arguments.done` while streaming,
/// and its end events include both `output_text.done` and `completed`.
#[test]
fn test_text_only_response_no_tool_events() {
    let mut conv = ResponseStreamConverter::new("test-model".into(), default_params());
    let _ = conv.emit_start_events();

    let chunk = text_chunk("Hello world");
    let types = event_types(&conv.process_chunk(&chunk));
    let saw_tool_done = types
        .iter()
        .any(|t| t.as_str() == "response.function_call_arguments.done");
    assert!(!saw_tool_done, "no tool events in text-only: {types:?}");

    let end_types = event_types(&conv.emit_end_events());
    let in_end = |wanted: &str| end_types.iter().any(|t| t.as_str() == wanted);
    assert!(
        in_end("response.output_text.done"),
        "text done in end events: {end_types:?}"
    );
    assert!(
        in_end("response.completed"),
        "completed in end events: {end_types:?}"
    );
}
/// Text first, then a complete tool call: the text chunk opens an output item,
/// and the tool call chunk still gets its done events inline.
#[test]
fn test_text_then_tool_call() {
    let mut conv = ResponseStreamConverter::new("test-model".into(), default_params());
    let _ = conv.emit_start_events();

    let text_types = event_types(&conv.process_chunk(&text_chunk("Let me check that.")));
    let text_started = text_types
        .iter()
        .any(|t| t.as_str() == "response.output_item.added");
    assert!(text_started, "text message started: {text_types:?}");

    let chunk = tool_call_chunk(
        0,
        Some("call-1"),
        Some("search"),
        Some("{\"q\":\"rust\"}"),
    );
    let tool_types = event_types(&conv.process_chunk(&chunk));
    let emitted = |wanted: &str| tool_types.iter().any(|t| t.as_str() == wanted);
    assert!(
        emitted("response.function_call_arguments.done"),
        "tool call done inline after text: {tool_types:?}"
    );
    assert!(
        emitted("response.output_item.done"),
        "output_item.done inline after text: {tool_types:?}"
    );
}
}
......@@ -285,6 +285,13 @@ pub mod llm {
/// varies per session and per release, wasting tokens and breaking prompt caching.
pub const DYN_STRIP_ANTHROPIC_PREAMBLE: &str = "DYN_STRIP_ANTHROPIC_PREAMBLE";
/// Name of the environment variable that enables streaming tool call dispatch
/// (`event: tool_call_dispatch` SSE events).
///
/// The constant's value is the variable name itself, not the setting.
pub const DYN_ENABLE_STREAMING_TOOL_DISPATCH: &str = "DYN_ENABLE_STREAMING_TOOL_DISPATCH";
/// Name of the environment variable that enables streaming reasoning dispatch
/// (`event: reasoning_dispatch` SSE events).
///
/// The constant's value is the variable name itself, not the setting.
pub const DYN_ENABLE_STREAMING_REASONING_DISPATCH: &str =
"DYN_ENABLE_STREAMING_REASONING_DISPATCH";
/// Metrics configuration
pub mod metrics {
/// Custom metrics prefix (overrides default "dynamo_frontend")
......@@ -464,6 +471,8 @@ mod tests {
llm::DYN_LORA_PATH,
llm::DYN_ENABLE_ANTHROPIC_API,
llm::DYN_STRIP_ANTHROPIC_PREAMBLE,
llm::DYN_ENABLE_STREAMING_TOOL_DISPATCH,
llm::DYN_ENABLE_STREAMING_REASONING_DISPATCH,
llm::metrics::DYN_METRICS_PREFIX,
// Model
model::model_express::MODEL_EXPRESS_URL,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment