sglang · Commits · 70f6309c

Commit 70f6309c (unverified) · authored Oct 21, 2025 by Chang Su · committed by GitHub Oct 21, 2025

[router][grpc] Support `v1/responses` API (#11926)
Parent: 70416001

17 changed files with 3611 additions and 29 deletions (+3611 −29)
- python/sglang/srt/entrypoints/openai/serving_responses.py (+3 −1)
- sgl-router/src/data_connector/conversations.rs (+4 −1)
- sgl-router/src/mcp/client_manager.rs (+31 −3)
- sgl-router/src/protocols/responses.rs (+13 −2)
- sgl-router/src/routers/grpc/mod.rs (+1 −0)
- sgl-router/src/routers/grpc/pipeline.rs (+92 −3)
- sgl-router/src/routers/grpc/processing.rs (+1 −4)
- sgl-router/src/routers/grpc/responses/conversions.rs (+365 −0)
- sgl-router/src/routers/grpc/responses/handlers.rs (+1290 −0)
- sgl-router/src/routers/grpc/responses/mod.rs (+20 −0)
- sgl-router/src/routers/grpc/responses/streaming.rs (+574 −0)
- sgl-router/src/routers/grpc/responses/tool_loop.rs (+1114 −0)
- sgl-router/src/routers/grpc/responses/types.rs (+18 −0)
- sgl-router/src/routers/grpc/router.rs (+77 −10)
- sgl-router/src/routers/openai/conversations.rs (+5 −2)
- sgl-router/src/routers/openai/mcp.rs (+1 −1)
- sgl-router/src/routers/openai/mod.rs (+2 −2)
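For orientation, here is what a minimal call to the new endpoint could look like. This is an illustrative sketch, not part of the commit: the router address is an assumption, and the request fields (`model`, `input`, `instructions`, `temperature`) are taken from the `ResponsesRequest` schema touched below.

```rust
// Illustrative client sketch (not from this commit). Assumes a router
// listening on localhost:30000 and the `reqwest` + `tokio` crates.
use serde_json::json;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let body = json!({
        "model": "default",
        "input": "Hello, world!",
        "instructions": "You are a helpful assistant.",
        "temperature": 0.7
    });
    let resp = reqwest::Client::new()
        .post("http://localhost:30000/v1/responses") // assumed address
        .json(&body)
        .send()
        .await?;
    println!("{}", resp.text().await?);
    Ok(())
}
```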
python/sglang/srt/entrypoints/openai/serving_responses.py (+3 −1)

@@ -778,7 +778,9 @@ class OpenAIServingResponses(OpenAIServingChat):
```python
        # Update the status to "cancelled"
        response.status = "cancelled"

        # Abort the request
        # The response_id is the same as the rid used when submitting the request
        self.tokenizer_manager.abort_request(rid=response_id)
        if task := self.background_tasks.get(response_id):
            task.cancel()

        try:
```
sgl-router/src/data_connector/conversations.rs (+4 −1)

@@ -52,6 +52,9 @@ pub type ConversationMetadata = JsonMap<String, Value>;
```rust
/// Input payload for creating a conversation
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct NewConversation {
    /// Optional conversation ID (if None, a random ID will be generated)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub id: Option<ConversationId>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub metadata: Option<ConversationMetadata>,
}
```

@@ -68,7 +71,7 @@ pub struct Conversation {
```diff
 impl Conversation {
     pub fn new(new_conversation: NewConversation) -> Self {
         Self {
-            id: ConversationId::new(),
+            id: new_conversation.id.unwrap_or_default(),
             created_at: Utc::now(),
             metadata: new_conversation.metadata,
         }
```
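The new `id` field gives callers two creation paths. A minimal sketch (placeholder values, not from the diff), using the types defined above:

```rust
// Explicit ID, e.g. when re-creating a conversation from a known chain:
let explicit = NewConversation {
    id: Some(some_conversation_id), // placeholder value
    metadata: None,
};
// No ID: storage generates a random one (the OpenAI behavior for
// POST /v1/conversations, as used in openai/conversations.rs below):
let random = NewConversation {
    id: None,
    metadata: Some(some_metadata), // placeholder value
};
```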
sgl-router/src/mcp/client_manager.rs (+31 −3)

@@ -180,21 +180,49 @@ impl McpClientManager {
```diff
         let backoff = ExponentialBackoffBuilder::new()
             .with_initial_interval(Duration::from_secs(1))
             .with_max_interval(Duration::from_secs(30))
-            .with_max_elapsed_time(Some(Duration::from_secs(120)))
+            .with_max_elapsed_time(Some(Duration::from_secs(30)))
             .build();

         backoff::future::retry(backoff, || async {
             match Self::connect_server_impl(config).await {
                 Ok(client) => Ok(client),
                 Err(e) => {
-                    tracing::warn!("Failed to connect to '{}', retrying: {}", config.name, e);
-                    Err(backoff::Error::transient(e))
+                    if Self::is_permanent_error(&e) {
+                        tracing::error!(
+                            "Permanent error connecting to '{}': {} - not retrying",
+                            config.name,
+                            e
+                        );
+                        Err(backoff::Error::permanent(e))
+                    } else {
+                        tracing::warn!("Failed to connect to '{}', retrying: {}", config.name, e);
+                        Err(backoff::Error::transient(e))
+                    }
                 }
             }
         })
         .await
     }

+    /// Determine if an error is permanent (should not retry) or transient (should retry)
+    fn is_permanent_error(error: &McpError) -> bool {
+        match error {
+            McpError::Config(_) => true,
+            McpError::Auth(_) => true,
+            McpError::ServerNotFound(_) => true,
+            McpError::Transport(_) => true,
+            McpError::ConnectionFailed(msg) => {
+                msg.contains("initialize")
+                    || msg.contains("connection closed")
+                    || msg.contains("connection refused")
+                    || msg.contains("invalid URL")
+                    || msg.contains("not found")
+            }
+            // Tool-related errors shouldn't occur during connection
+            _ => false,
+        }
+    }
+
     /// Internal implementation of server connection
     async fn connect_server_impl(
         config: &McpServerConfig,
```
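The effect of the transient/permanent split in isolation, as a hedged sketch of the `backoff` crate semantics used above; `try_connect` and `is_fatal` are stand-ins for `connect_server_impl` and `is_permanent_error`:

```rust
// Transient errors are retried with exponential backoff until the (now 30s)
// max elapsed time; a permanent error makes retry() return immediately.
let result = backoff::future::retry(backoff, || async {
    match try_connect().await {
        Ok(client) => Ok(client),
        Err(e) if is_fatal(&e) => Err(backoff::Error::permanent(e)), // give up now
        Err(e) => Err(backoff::Error::transient(e)),                 // retry later
    }
})
.await;
```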
sgl-router/src/protocols/responses.rs (+13 −2)

@@ -411,6 +411,14 @@ fn default_repetition_penalty() -> f32 {
```rust
    1.0
}

fn default_temperature() -> Option<f32> {
    Some(1.0)
}

fn default_top_p() -> Option<f32> {
    Some(1.0)
}

// ============================================================================
// Request/Response Types
// ============================================================================
```

@@ -477,7 +485,10 @@ pub struct ResponsesRequest {
```diff
     pub stream: Option<bool>,

     /// Temperature for sampling
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(
+        default = "default_temperature",
+        skip_serializing_if = "Option::is_none"
+    )]
     pub temperature: Option<f32>,

     /// Tool choice behavior
```

@@ -493,7 +504,7 @@ pub struct ResponsesRequest {
```diff
     pub top_logprobs: Option<u32>,

     /// Top-p sampling parameter
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default = "default_top_p", skip_serializing_if = "Option::is_none")]
     pub top_p: Option<f32>,

     /// Truncation behavior
```
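A quick illustrative check of what the `default = "..."` attributes change, assuming `ResponsesRequest` derives `Deserialize` and `input` accepts a plain JSON string (as the `ResponseInput::Text` variant suggests):

```rust
// A body that omits temperature/top_p now deserializes to the OpenAI defaults
// (Some(1.0)) instead of None; explicitly supplied values still win.
let req: ResponsesRequest = serde_json::from_str(r#"{"input": "hi"}"#).unwrap();
assert_eq!(req.temperature, Some(1.0));
assert_eq!(req.top_p, Some(1.0));
```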
sgl-router/src/routers/grpc/mod.rs (+1 −0)

@@ -6,6 +6,7 @@ pub mod context;
```diff
 pub mod pd_router;
 pub mod pipeline;
 pub mod processing;
+pub mod responses;
 pub mod router;
 pub mod streaming;
 pub mod utils;
```
sgl-router/src/routers/grpc/pipeline.rs (+92 −3)

@@ -4,6 +4,8 @@
```diff
 //! that transform a RequestContext through its lifecycle.

 use std::{
+    borrow::Cow,
+    collections::HashMap,
     sync::Arc,
     time::{Instant, SystemTime, UNIX_EPOCH},
 };
```

@@ -12,15 +14,20 @@ use async_trait::async_trait;
```diff
 use axum::response::{IntoResponse, Response};
 use proto::DisaggregatedParams;
 use rand::Rng;
+use tokio::sync::RwLock;
 use tracing::{debug, error, warn};
 use uuid::Uuid;

-use super::{context::*, processing, streaming, utils};
+use super::{context::*, processing, responses::BackgroundTaskInfo, streaming, utils};
 use crate::{
     core::{ConnectionMode, Worker, WorkerRegistry, WorkerType},
     grpc_client::proto,
     policies::PolicyRegistry,
-    protocols::{chat::ChatCompletionRequest, common::InputIds, generate::GenerateRequest},
+    protocols::{
+        chat::{ChatCompletionRequest, ChatCompletionResponse},
+        common::InputIds,
+        generate::GenerateRequest,
+    },
     reasoning_parser::ParserFactory as ReasoningParserFactory,
     tokenizer::traits::Tokenizer,
     tool_parser::ParserFactory as ToolParserFactory,
```

@@ -131,7 +138,7 @@ impl PreparationStage {
```diff
             token_ids,
             processed_messages: Some(processed_messages),
             tool_constraints: tool_call_constraint,
-            filtered_request: if matches!(body_ref, std::borrow::Cow::Owned(_)) {
+            filtered_request: if matches!(body_ref, Cow::Owned(_)) {
                 Some(body_ref.into_owned())
             } else {
                 None
```

@@ -1090,4 +1097,86 @@ impl RequestPipeline {
```rust
            None => utils::internal_error_static("No response produced"),
        }
    }

    /// Execute chat pipeline for responses endpoint (Result-based for easier composition)
    ///
    /// This is used by the responses module and returns Result instead of Response.
    /// It also supports background mode cancellation via background_tasks.
    pub async fn execute_chat_for_responses(
        &self,
        request: Arc<ChatCompletionRequest>,
        headers: Option<http::HeaderMap>,
        model_id: Option<String>,
        components: Arc<SharedComponents>,
        response_id: Option<String>,
        background_tasks: Option<Arc<RwLock<HashMap<String, BackgroundTaskInfo>>>>,
    ) -> Result<ChatCompletionResponse, String> {
        let mut ctx = RequestContext::for_chat(request, headers, model_id, components);

        // Execute each stage in sequence
        for (idx, stage) in self.stages.iter().enumerate() {
            match stage.execute(&mut ctx).await {
                Ok(Some(_response)) => {
                    // Streaming not supported for responses sync mode
                    return Err("Streaming is not supported in this context".to_string());
                }
                Ok(None) => {
                    let stage_name = stage.name();

                    // After ClientAcquisitionStage, store client for background task cancellation
                    if stage_name == "ClientAcquisition" {
                        if let (Some(ref clients), Some(ref resp_id), Some(ref tasks)) =
                            (&ctx.state.clients, &response_id, &background_tasks)
                        {
                            let client_to_store = match clients {
                                ClientSelection::Single { client } => client.clone(),
                                ClientSelection::Dual { decode, .. } => decode.clone(),
                            };
                            if let Some(task_info) =
                                tasks.write().await.get_mut(resp_id.as_str())
                            {
                                *task_info.client.write().await = Some(client_to_store);
                                debug!("Stored client for response_id: {}", resp_id);
                            }
                        }
                    }

                    // After DispatchMetadataStage, store grpc_request_id for background task cancellation
                    if stage_name == "DispatchMetadata" {
                        if let (Some(ref dispatch), Some(ref resp_id), Some(ref tasks)) =
                            (&ctx.state.dispatch, &response_id, &background_tasks)
                        {
                            let grpc_request_id = dispatch.request_id.clone();
                            if let Some(task_info) =
                                tasks.write().await.get_mut(resp_id.as_str())
                            {
                                task_info.grpc_request_id = grpc_request_id.clone();
                                debug!("Stored grpc_request_id for response_id: {}", resp_id);
                            }
                        }
                    }

                    // Continue to next stage
                    continue;
                }
                Err(response) => {
                    // Error occurred
                    error!(
                        "Stage {} ({}) failed with status {}",
                        idx + 1,
                        stage.name(),
                        response.status()
                    );
                    return Err(format!("Pipeline stage {} failed", stage.name()));
                }
            }
        }

        // Extract final response
        match ctx.state.response.final_response {
            Some(FinalResponse::Chat(response)) => Ok(response),
            Some(FinalResponse::Generate(_)) => {
                Err("Internal error: wrong response type".to_string())
            }
            None => Err("No response produced".to_string()),
        }
    }
}
```
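A hedged usage sketch of the new method, assuming `pipeline`, `components`, `response_id`, and `background_tasks` come from the surrounding responses handler (they are not defined in this hunk):

```rust
// Drive the existing chat stages synchronously and get a typed result back,
// rather than an axum Response; failures surface as plain strings.
let chat_resp = pipeline
    .execute_chat_for_responses(
        Arc::new(chat_req),          // produced by responses_to_chat()
        None,                        // headers
        Some("default".to_string()), // model_id (assumed value)
        components.clone(),
        Some(response_id.clone()),   // enables cancellation bookkeeping
        Some(background_tasks.clone()),
    )
    .await?;
```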
sgl-router/src/routers/grpc/processing.rs (+1 −4)

@@ -408,10 +408,7 @@ impl ResponseProcessor {
```diff
                         tool_type: "function".to_string(),
                         function: FunctionCallResponse {
                             name: tc.function.name,
-                            arguments: Some(
-                                serde_json::to_string(&tc.function.arguments)
-                                    .unwrap_or_else(|_| "{}".to_string()),
-                            ),
+                            arguments: Some(tc.function.arguments),
                         },
                     }
                 })
```
sgl-router/src/routers/grpc/responses/conversions.rs (new file, mode 100644, +365)

```rust
//! Conversion utilities for translating between /v1/responses and /v1/chat/completions formats
//!
//! This module implements the conversion approach where:
//! 1. ResponsesRequest → ChatCompletionRequest (for backend processing)
//! 2. ChatCompletionResponse → ResponsesResponse (for client response)
//!
//! This allows the gRPC router to reuse the existing chat pipeline infrastructure
//! without requiring Python backend changes.

use crate::protocols::{
    chat::{ChatCompletionRequest, ChatCompletionResponse, ChatMessage, UserMessageContent},
    common::{FunctionCallResponse, StreamOptions, ToolCall, UsageInfo},
    responses::{
        ResponseContentPart, ResponseInput, ResponseInputOutputItem, ResponseOutputItem,
        ResponseStatus, ResponsesRequest, ResponsesResponse, ResponsesUsage,
    },
};

/// Convert a ResponsesRequest to ChatCompletionRequest for processing through the chat pipeline
///
/// # Conversion Logic
/// - `input` (text/items) → `messages` (chat messages)
/// - `instructions` → system message (prepended)
/// - `max_output_tokens` → `max_completion_tokens`
/// - Tool-related fields are passed through
/// - Response-specific fields (previous_response_id, conversation) are handled by router
pub fn responses_to_chat(req: &ResponsesRequest) -> Result<ChatCompletionRequest, String> {
    let mut messages = Vec::new();

    // 1. Add system message if instructions provided
    if let Some(instructions) = &req.instructions {
        messages.push(ChatMessage::System {
            content: instructions.clone(),
            name: None,
        });
    }

    // 2. Convert input to chat messages
    match &req.input {
        ResponseInput::Text(text) => {
            // Simple text input → user message
            messages.push(ChatMessage::User {
                content: UserMessageContent::Text(text.clone()),
                name: None,
            });
        }
        ResponseInput::Items(items) => {
            // Structured items → convert each to appropriate chat message
            for item in items {
                match item {
                    ResponseInputOutputItem::Message { role, content, .. } => {
                        // Extract text from content parts
                        let text = extract_text_from_content(content);
                        match role.as_str() {
                            "user" => {
                                messages.push(ChatMessage::User {
                                    content: UserMessageContent::Text(text),
                                    name: None,
                                });
                            }
                            "assistant" => {
                                messages.push(ChatMessage::Assistant {
                                    content: Some(text),
                                    name: None,
                                    tool_calls: None,
                                    reasoning_content: None,
                                });
                            }
                            "system" => {
                                messages.push(ChatMessage::System {
                                    content: text,
                                    name: None,
                                });
                            }
                            _ => {
                                // Unknown role, treat as user message
                                messages.push(ChatMessage::User {
                                    content: UserMessageContent::Text(text),
                                    name: None,
                                });
                            }
                        }
                    }
                    ResponseInputOutputItem::FunctionToolCall {
                        id,
                        name,
                        arguments,
                        output,
                        ..
                    } => {
                        // Tool call from history - add as assistant message with tool call
                        // followed by tool response if output exists

                        // Add assistant message with tool_calls (the LLM's decision)
                        messages.push(ChatMessage::Assistant {
                            content: None,
                            name: None,
                            tool_calls: Some(vec![ToolCall {
                                id: id.clone(),
                                tool_type: "function".to_string(),
                                function: FunctionCallResponse {
                                    name: name.clone(),
                                    arguments: Some(arguments.clone()),
                                },
                            }]),
                            reasoning_content: None,
                        });

                        // Add tool result message if output exists
                        if let Some(output_text) = output {
                            messages.push(ChatMessage::Tool {
                                content: output_text.clone(),
                                tool_call_id: id.clone(),
                            });
                        }
                    }
                    ResponseInputOutputItem::Reasoning { content, .. } => {
                        // Reasoning content - add as assistant message with reasoning_content
                        let reasoning_text = content
                            .iter()
                            .map(|c| match c {
                                crate::protocols::responses::ResponseReasoningContent::ReasoningText {
                                    text,
                                } => text.as_str(),
                            })
                            .collect::<Vec<_>>()
                            .join("\n");
                        messages.push(ChatMessage::Assistant {
                            content: None,
                            name: None,
                            tool_calls: None,
                            reasoning_content: Some(reasoning_text),
                        });
                    }
                }
            }
        }
    }

    // Ensure we have at least one message
    if messages.is_empty() {
        return Err("Request must contain at least one message".to_string());
    }

    // 3. Build ChatCompletionRequest
    let is_streaming = req.stream.unwrap_or(false);
    Ok(ChatCompletionRequest {
        messages,
        model: req.model.clone().unwrap_or_else(|| "default".to_string()),
        temperature: req.temperature,
        max_completion_tokens: req.max_output_tokens,
        stream: is_streaming,
        stream_options: if is_streaming {
            Some(StreamOptions {
                include_usage: Some(true),
            })
        } else {
            None
        },
        parallel_tool_calls: req.parallel_tool_calls,
        top_logprobs: req.top_logprobs,
        top_p: req.top_p,
        skip_special_tokens: true, // Always skip special tokens // TODO: except for gpt-oss
        // Note: tools and tool_choice will be handled separately for MCP transformation
        tools: None,       // Will be set by caller if needed
        tool_choice: None, // Will be set by caller if needed
        ..Default::default()
    })
}

/// Extract text content from ResponseContentPart array
fn extract_text_from_content(content: &[ResponseContentPart]) -> String {
    content
        .iter()
        .filter_map(|part| match part {
            ResponseContentPart::InputText { text } => Some(text.as_str()),
            ResponseContentPart::OutputText { text, .. } => Some(text.as_str()),
            _ => None,
        })
        .collect::<Vec<_>>()
        .join("")
}

/// Convert a ChatCompletionResponse to ResponsesResponse
///
/// # Conversion Logic
/// - `id` → `id` (pass through)
/// - `model` → `model` (pass through)
/// - `choices[0].message` → `output` array (convert to ResponseOutputItem::Message)
/// - `choices[0].finish_reason` → determines `status` (stop/length → Completed)
/// - `created` timestamp → `created_at`
pub fn chat_to_responses(
    chat_resp: &ChatCompletionResponse,
    original_req: &ResponsesRequest,
) -> Result<ResponsesResponse, String> {
    // Extract the first choice (responses API doesn't support n>1)
    let choice = chat_resp
        .choices
        .first()
        .ok_or_else(|| "Chat response contains no choices".to_string())?;

    // Convert assistant message to output items
    let mut output: Vec<ResponseOutputItem> = Vec::new();

    // Convert message content to output item
    if let Some(content) = &choice.message.content {
        if !content.is_empty() {
            output.push(ResponseOutputItem::Message {
                id: format!("msg_{}", chat_resp.id),
                role: "assistant".to_string(),
                content: vec![ResponseContentPart::OutputText {
                    text: content.clone(),
                    annotations: vec![],
                    logprobs: choice.logprobs.clone(),
                }],
                status: "completed".to_string(),
            });
        }
    }

    // Convert reasoning content if present (O1-style models)
    if let Some(reasoning) = &choice.message.reasoning_content {
        if !reasoning.is_empty() {
            output.push(ResponseOutputItem::Reasoning {
                id: format!("reasoning_{}", chat_resp.id),
                summary: vec![],
                content: vec![
                    crate::protocols::responses::ResponseReasoningContent::ReasoningText {
                        text: reasoning.clone(),
                    },
                ],
                status: Some("completed".to_string()),
            });
        }
    }

    // Convert tool calls if present
    if let Some(tool_calls) = &choice.message.tool_calls {
        for tool_call in tool_calls {
            output.push(ResponseOutputItem::FunctionToolCall {
                id: tool_call.id.clone(),
                name: tool_call.function.name.clone(),
                arguments: tool_call.function.arguments.clone().unwrap_or_default(),
                output: None, // Tool hasn't been executed yet
                status: "in_progress".to_string(),
            });
        }
    }

    // Determine response status based on finish_reason
    let status = match choice.finish_reason.as_deref() {
        Some("stop") | Some("length") => ResponseStatus::Completed,
        Some("tool_calls") => ResponseStatus::InProgress, // Waiting for tool execution
        Some("failed") | Some("error") => ResponseStatus::Failed,
        _ => ResponseStatus::Completed, // Default to completed
    };

    // Convert usage from Usage to UsageInfo, then wrap in ResponsesUsage
    let usage = chat_resp.usage.as_ref().map(|u| {
        let usage_info = UsageInfo {
            prompt_tokens: u.prompt_tokens,
            completion_tokens: u.completion_tokens,
            total_tokens: u.total_tokens,
            reasoning_tokens: u
                .completion_tokens_details
                .as_ref()
                .and_then(|d| d.reasoning_tokens),
            prompt_tokens_details: None, // Chat response doesn't have this
        };
        ResponsesUsage::Classic(usage_info)
    });

    // Generate response
    Ok(ResponsesResponse {
        id: chat_resp.id.clone(),
        object: "response".to_string(),
        created_at: chat_resp.created as i64,
        status,
        error: None,
        incomplete_details: None,
        instructions: original_req.instructions.clone(),
        max_output_tokens: original_req.max_output_tokens,
        model: chat_resp.model.clone(),
        output,
        parallel_tool_calls: original_req.parallel_tool_calls.unwrap_or(true),
        previous_response_id: original_req.previous_response_id.clone(),
        reasoning: None, // TODO: Map reasoning effort if needed
        store: original_req.store.unwrap_or(true),
        temperature: original_req.temperature,
        text: None,
        tool_choice: "auto".to_string(), // TODO: Map from original request
        tools: original_req.tools.clone().unwrap_or_default(),
        top_p: original_req.top_p,
        truncation: None,
        usage,
        user: None, // No user field in chat response
        metadata: original_req.metadata.clone().unwrap_or_default(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_text_input_conversion() {
        let req = ResponsesRequest {
            input: ResponseInput::Text("Hello, world!".to_string()),
            instructions: Some("You are a helpful assistant.".to_string()),
            model: Some("gpt-4".to_string()),
            temperature: Some(0.7),
            ..Default::default()
        };

        let chat_req = responses_to_chat(&req).unwrap();
        assert_eq!(chat_req.messages.len(), 2); // system + user
        assert_eq!(chat_req.model, "gpt-4");
        assert_eq!(chat_req.temperature, Some(0.7));
    }

    #[test]
    fn test_items_input_conversion() {
        let req = ResponsesRequest {
            input: ResponseInput::Items(vec![
                ResponseInputOutputItem::Message {
                    id: "msg_1".to_string(),
                    role: "user".to_string(),
                    content: vec![ResponseContentPart::InputText {
                        text: "Hello!".to_string(),
                    }],
                    status: None,
                },
                ResponseInputOutputItem::Message {
                    id: "msg_2".to_string(),
                    role: "assistant".to_string(),
                    content: vec![ResponseContentPart::OutputText {
                        text: "Hi there!".to_string(),
                        annotations: vec![],
                        logprobs: None,
                    }],
                    status: None,
                },
            ]),
            ..Default::default()
        };

        let chat_req = responses_to_chat(&req).unwrap();
        assert_eq!(chat_req.messages.len(), 2); // user + assistant
    }

    #[test]
    fn test_empty_input_error() {
        let req = ResponsesRequest {
            input: ResponseInput::Text("".to_string()),
            ..Default::default()
        };

        // Empty text should still create a user message, so this should succeed
        let result = responses_to_chat(&req);
        assert!(result.is_ok());
    }
}
```
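Putting the two converters together, a round-trip sketch (the pipeline call in the middle is elided; `chat_resp` is assumed to be the `ChatCompletionResponse` it returns):

```rust
let chat_req = responses_to_chat(&req)?;
// ... execute chat_req through the chat pipeline to obtain chat_resp ...
let responses_resp = chat_to_responses(&chat_resp, &req)?;
assert_eq!(responses_resp.object, "response");
```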
sgl-router/src/routers/grpc/responses/handlers.rs (new file, mode 100644, +1290)

Diff collapsed in this view (1290 lines). Exports route_responses, get_response_impl, and cancel_response_impl (see mod.rs below).
sgl-router/src/routers/grpc/responses/mod.rs (new file, mode 100644, +20)

```rust
//! gRPC Router `/v1/responses` endpoint implementation
//!
//! This module handles all responses-specific logic including:
//! - Request validation
//! - Conversation history and response chain loading
//! - Background mode execution
//! - Streaming support
//! - MCP tool loop wrapper
//! - Response persistence

// Module declarations
mod conversions;
mod handlers;
pub mod streaming;
pub mod tool_loop;
pub mod types;

// Public exports
pub use handlers::{cancel_response_impl, get_response_impl, route_responses};
pub use types::BackgroundTaskInfo;
```
sgl-router/src/routers/grpc/responses/streaming.rs (new file, mode 100644, +574)

```rust
//! Streaming infrastructure for /v1/responses endpoint

use std::collections::HashMap;

use bytes::Bytes;
use serde_json::json;
use tokio::sync::mpsc;
use uuid::Uuid;

use crate::protocols::chat::ChatCompletionStreamResponse;

pub(super) enum OutputItemType {
    Message,
    McpListTools,
    McpCall,
    Reasoning,
}

/// Status of an output item
#[derive(Debug, Clone, PartialEq)]
enum ItemStatus {
    InProgress,
    Completed,
}

/// State tracking for a single output item
#[derive(Debug, Clone)]
struct OutputItemState {
    output_index: usize,
    status: ItemStatus,
}

// ============================================================================
// Streaming Event Emitter
// ============================================================================

/// OpenAI-compatible event emitter for /v1/responses streaming
///
/// Manages state and sequence numbers to emit proper event types:
/// - response.created
/// - response.in_progress
/// - response.output_item.added
/// - response.content_part.added
/// - response.output_text.delta (multiple)
/// - response.output_text.done
/// - response.content_part.done
/// - response.output_item.done
/// - response.completed
/// - response.mcp_list_tools.in_progress
/// - response.mcp_list_tools.completed
/// - response.mcp_call.in_progress
/// - response.mcp_call_arguments.delta
/// - response.mcp_call_arguments.done
/// - response.mcp_call.completed
/// - response.mcp_call.failed
pub(super) struct ResponseStreamEventEmitter {
    sequence_number: u64,
    response_id: String,
    model: String,
    created_at: u64,
    message_id: String,
    accumulated_text: String,
    has_emitted_created: bool,
    has_emitted_in_progress: bool,
    has_emitted_output_item_added: bool,
    has_emitted_content_part_added: bool,
    // MCP call tracking
    mcp_call_accumulated_args: HashMap<String, String>,
    // Output item tracking (NEW)
    output_items: Vec<OutputItemState>,
    next_output_index: usize,
    current_message_output_index: Option<usize>, // Tracks output_index of current message
    current_item_id: Option<String>,             // Tracks item_id of current item
}

impl ResponseStreamEventEmitter {
    pub(super) fn new(response_id: String, model: String, created_at: u64) -> Self {
        let message_id = format!("msg_{}", Uuid::new_v4());
        Self {
            sequence_number: 0,
            response_id,
            model,
            created_at,
            message_id,
            accumulated_text: String::new(),
            has_emitted_created: false,
            has_emitted_in_progress: false,
            has_emitted_output_item_added: false,
            has_emitted_content_part_added: false,
            mcp_call_accumulated_args: HashMap::new(),
            output_items: Vec::new(),
            next_output_index: 0,
            current_message_output_index: None,
            current_item_id: None,
        }
    }

    fn next_sequence(&mut self) -> u64 {
        let seq = self.sequence_number;
        self.sequence_number += 1;
        seq
    }

    pub(super) fn emit_created(&mut self) -> serde_json::Value {
        self.has_emitted_created = true;
        json!({
            "type": "response.created",
            "sequence_number": self.next_sequence(),
            "response": {
                "id": self.response_id,
                "object": "response",
                "created_at": self.created_at,
                "status": "in_progress",
                "model": self.model,
                "output": []
            }
        })
    }

    pub(super) fn emit_in_progress(&mut self) -> serde_json::Value {
        self.has_emitted_in_progress = true;
        json!({
            "type": "response.in_progress",
            "sequence_number": self.next_sequence(),
            "response": {
                "id": self.response_id,
                "object": "response",
                "status": "in_progress"
            }
        })
    }

    pub(super) fn emit_content_part_added(
        &mut self,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        self.has_emitted_content_part_added = true;
        json!({
            "type": "response.content_part.added",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "part": {
                "type": "text",
                "text": ""
            }
        })
    }

    pub(super) fn emit_text_delta(
        &mut self,
        delta: &str,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        self.accumulated_text.push_str(delta);
        json!({
            "type": "response.output_text.delta",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "delta": delta
        })
    }

    pub(super) fn emit_text_done(
        &mut self,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        json!({
            "type": "response.output_text.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "text": self.accumulated_text.clone()
        })
    }

    pub(super) fn emit_content_part_done(
        &mut self,
        output_index: usize,
        item_id: &str,
        content_index: usize,
    ) -> serde_json::Value {
        json!({
            "type": "response.content_part.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "content_index": content_index,
            "part": {
                "type": "text",
                "text": self.accumulated_text.clone()
            }
        })
    }

    pub(super) fn emit_completed(
        &mut self,
        usage: Option<&serde_json::Value>,
    ) -> serde_json::Value {
        let mut response = json!({
            "type": "response.completed",
            "sequence_number": self.next_sequence(),
            "response": {
                "id": self.response_id,
                "object": "response",
                "created_at": self.created_at,
                "status": "completed",
                "model": self.model,
                "output": [{
                    "id": self.message_id.clone(),
                    "type": "message",
                    "role": "assistant",
                    "content": [{
                        "type": "text",
                        "text": self.accumulated_text.clone()
                    }]
                }]
            }
        });
        if let Some(usage_val) = usage {
            response["response"]["usage"] = usage_val.clone();
        }
        response
    }

    // ========================================================================
    // MCP Event Emission Methods
    // ========================================================================

    pub(super) fn emit_mcp_list_tools_in_progress(
        &mut self,
        output_index: usize,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_list_tools.in_progress",
            "sequence_number": self.next_sequence(),
            "output_index": output_index
        })
    }

    pub(super) fn emit_mcp_list_tools_completed(
        &mut self,
        output_index: usize,
        tools: &[crate::mcp::ToolInfo],
    ) -> serde_json::Value {
        let tool_items: Vec<_> = tools
            .iter()
            .map(|t| {
                json!({
                    "name": t.name,
                    "description": t.description,
                    "input_schema": t.parameters.clone().unwrap_or_else(|| json!({
                        "type": "object",
                        "properties": {},
                        "required": []
                    }))
                })
            })
            .collect();
        json!({
            "type": "response.mcp_list_tools.completed",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "tools": tool_items
        })
    }

    pub(super) fn emit_mcp_call_in_progress(
        &mut self,
        output_index: usize,
        item_id: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call.in_progress",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id
        })
    }

    pub(super) fn emit_mcp_call_arguments_delta(
        &mut self,
        output_index: usize,
        item_id: &str,
        delta: &str,
    ) -> serde_json::Value {
        // Accumulate arguments for this call
        self.mcp_call_accumulated_args
            .entry(item_id.to_string())
            .or_default()
            .push_str(delta);
        json!({
            "type": "response.mcp_call_arguments.delta",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "delta": delta
        })
    }

    pub(super) fn emit_mcp_call_arguments_done(
        &mut self,
        output_index: usize,
        item_id: &str,
        arguments: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call_arguments.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "arguments": arguments
        })
    }

    pub(super) fn emit_mcp_call_completed(
        &mut self,
        output_index: usize,
        item_id: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call.completed",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id
        })
    }

    pub(super) fn emit_mcp_call_failed(
        &mut self,
        output_index: usize,
        item_id: &str,
        error: &str,
    ) -> serde_json::Value {
        json!({
            "type": "response.mcp_call.failed",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item_id": item_id,
            "error": error
        })
    }

    // ========================================================================
    // Output Item Wrapper Events
    // ========================================================================

    /// Emit response.output_item.added event
    pub(super) fn emit_output_item_added(
        &mut self,
        output_index: usize,
        item: &serde_json::Value,
    ) -> serde_json::Value {
        json!({
            "type": "response.output_item.added",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item": item
        })
    }

    /// Emit response.output_item.done event
    pub(super) fn emit_output_item_done(
        &mut self,
        output_index: usize,
        item: &serde_json::Value,
    ) -> serde_json::Value {
        json!({
            "type": "response.output_item.done",
            "sequence_number": self.next_sequence(),
            "output_index": output_index,
            "item": item
        })
    }

    /// Generate unique ID for item type
    fn generate_item_id(prefix: &str) -> String {
        format!("{}_{}", prefix, Uuid::new_v4().to_string().replace("-", ""))
    }

    /// Allocate next output index and track item
    pub(super) fn allocate_output_index(&mut self, item_type: OutputItemType) -> (usize, String) {
        let index = self.next_output_index;
        self.next_output_index += 1;

        let id_prefix = match &item_type {
            OutputItemType::McpListTools => "mcpl",
            OutputItemType::McpCall => "mcp",
            OutputItemType::Message => "msg",
            OutputItemType::Reasoning => "rs",
        };
        let id = Self::generate_item_id(id_prefix);

        self.output_items.push(OutputItemState {
            output_index: index,
            status: ItemStatus::InProgress,
        });

        (index, id)
    }

    /// Mark output item as completed
    pub(super) fn complete_output_item(&mut self, output_index: usize) {
        if let Some(item) = self
            .output_items
            .iter_mut()
            .find(|i| i.output_index == output_index)
        {
            item.status = ItemStatus::Completed;
        }
    }

    /// Emit reasoning item wrapper events (added + done)
    ///
    /// Reasoning items in OpenAI format are simple placeholders emitted between tool iterations.
    /// They don't have streaming content - just wrapper events with empty/null content.
    pub(super) fn emit_reasoning_item(
        &mut self,
        tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
        reasoning_content: Option<String>,
    ) -> Result<(), String> {
        // Allocate output index and generate ID
        let (output_index, item_id) = self.allocate_output_index(OutputItemType::Reasoning);

        // Build reasoning item structure
        let item = json!({
            "id": item_id,
            "type": "reasoning",
            "summary": [],
            "content": reasoning_content,
            "encrypted_content": null,
            "status": null
        });

        // Emit output_item.added
        let added_event = self.emit_output_item_added(output_index, &item);
        self.send_event(&added_event, tx)?;

        // Immediately emit output_item.done (no streaming for reasoning)
        let done_event = self.emit_output_item_done(output_index, &item);
        self.send_event(&done_event, tx)?;

        // Mark as completed
        self.complete_output_item(output_index);

        Ok(())
    }

    /// Process a chunk and emit appropriate events
    pub(super) fn process_chunk(
        &mut self,
        chunk: &ChatCompletionStreamResponse,
        tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
    ) -> Result<(), String> {
        // Process content if present
        if let Some(choice) = chunk.choices.first() {
            if let Some(content) = &choice.delta.content {
                if !content.is_empty() {
                    // Allocate output_index and item_id for this message item (once per message)
                    if self.current_item_id.is_none() {
                        let (output_index, item_id) =
                            self.allocate_output_index(OutputItemType::Message);

                        // Build message item structure
                        let item = json!({
                            "id": item_id,
                            "type": "message",
                            "role": "assistant",
                            "content": []
                        });

                        // Emit output_item.added
                        let event = self.emit_output_item_added(output_index, &item);
                        self.send_event(&event, tx)?;
                        self.has_emitted_output_item_added = true;

                        // Store for subsequent events
                        self.current_item_id = Some(item_id);
                        self.current_message_output_index = Some(output_index);
                    }

                    let output_index = self.current_message_output_index.unwrap();
                    let item_id = self.current_item_id.clone().unwrap(); // Clone to avoid borrow checker issues
                    let content_index = 0; // Single content part for now

                    // Emit content_part.added before first delta
                    if !self.has_emitted_content_part_added {
                        let event =
                            self.emit_content_part_added(output_index, &item_id, content_index);
                        self.send_event(&event, tx)?;
                        self.has_emitted_content_part_added = true;
                    }

                    // Emit text delta
                    let event =
                        self.emit_text_delta(content, output_index, &item_id, content_index);
                    self.send_event(&event, tx)?;
                }
            }

            // Check for finish_reason to emit completion events
            if let Some(reason) = &choice.finish_reason {
                if reason == "stop" || reason == "length" {
                    let output_index = self.current_message_output_index.unwrap();
                    let item_id = self.current_item_id.clone().unwrap(); // Clone to avoid borrow checker issues
                    let content_index = 0;

                    // Emit closing events
                    if self.has_emitted_content_part_added {
                        let event = self.emit_text_done(output_index, &item_id, content_index);
                        self.send_event(&event, tx)?;

                        let event =
                            self.emit_content_part_done(output_index, &item_id, content_index);
                        self.send_event(&event, tx)?;
                    }

                    if self.has_emitted_output_item_added {
                        // Build complete message item for output_item.done
                        let item = json!({
                            "id": item_id,
                            "type": "message",
                            "role": "assistant",
                            "content": [{
                                "type": "text",
                                "text": self.accumulated_text.clone()
                            }]
                        });
                        let event = self.emit_output_item_done(output_index, &item);
                        self.send_event(&event, tx)?;
                    }

                    // Mark item as completed
                    self.complete_output_item(output_index);
                }
            }
        }
        Ok(())
    }

    pub(super) fn send_event(
        &self,
        event: &serde_json::Value,
        tx: &mpsc::UnboundedSender<Result<Bytes, std::io::Error>>,
    ) -> Result<(), String> {
        let event_json = serde_json::to_string(event)
            .map_err(|e| format!("Failed to serialize event: {}", e))?;
        if tx
            .send(Ok(Bytes::from(format!("data: {}\n\n", event_json))))
            .is_err()
        {
            return Err("Client disconnected".to_string());
        }
        Ok(())
    }
}
```
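For reference, a sketch of the emitter's happy-path event order for a plain text response (callable only inside the responses module, since the emitter is `pub(super)`; all values shown are placeholders):

```rust
let mut emitter = ResponseStreamEventEmitter::new(
    "resp_123".to_string(), // placeholder response id
    "default".to_string(),  // placeholder model
    1_700_000_000,          // placeholder created_at
);
let _ = emitter.emit_created();     // response.created     (seq 0)
let _ = emitter.emit_in_progress(); // response.in_progress (seq 1)
// Per streamed chunk, process_chunk() then emits:
//   response.output_item.added → response.content_part.added →
//   response.output_text.delta* → response.output_text.done →
//   response.content_part.done → response.output_item.done
let _ = emitter.emit_completed(None); // response.completed
```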
sgl-router/src/routers/grpc/responses/tool_loop.rs (new file, mode 100644, +1114)

Diff collapsed in this view (1114 lines).
sgl-router/src/routers/grpc/responses/types.rs (new file, mode 100644, +18)

```rust
//! Type definitions for /v1/responses endpoint

use std::sync::Arc;

use tokio::{sync::RwLock, task::JoinHandle};

/// Information stored for background tasks to enable end-to-end cancellation
///
/// This struct enables cancelling both the Rust task AND the Python scheduler processing.
/// The client field is lazily initialized during pipeline execution.
pub struct BackgroundTaskInfo {
    /// Tokio task handle for aborting the Rust task
    pub handle: JoinHandle<()>,
    /// gRPC request_id sent to Python scheduler (chatcmpl-* prefix)
    pub grpc_request_id: String,
    /// gRPC client for sending abort requests to Python (set after client acquisition)
    pub client: Arc<RwLock<Option<crate::grpc_client::SglangSchedulerClient>>>,
}
```
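A hedged sketch of how a cancel handler can use this struct for end-to-end cancellation; `abort_request` is an assumed method on the scheduler client, named after the Python-side call earlier in this commit:

```rust
async fn cancel(info: &BackgroundTaskInfo) {
    // Stop the Rust-side background task.
    info.handle.abort();
    // Best-effort abort on the Python scheduler, if the client was already
    // stored by the pipeline (it is set lazily after client acquisition).
    if let Some(client) = info.client.read().await.as_ref() {
        let _ = client.abort_request(&info.grpc_request_id).await; // assumed API
    }
}
```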
sgl-router/src/routers/grpc/router.rs (+77 −10)

```diff
 // gRPC Router Implementation

-use std::sync::Arc;
+use std::{collections::HashMap, sync::Arc};

 use async_trait::async_trait;
 use axum::{
```

@@ -9,12 +9,20 @@ use axum::{
```diff
     http::{HeaderMap, StatusCode},
     response::{IntoResponse, Response},
 };
+use tokio::sync::RwLock;
 use tracing::debug;

-use super::{context::SharedComponents, pipeline::RequestPipeline};
+use super::{
+    context::SharedComponents,
+    pipeline::RequestPipeline,
+    responses::{self, BackgroundTaskInfo},
+};
 use crate::{
     config::types::RetryConfig,
     core::WorkerRegistry,
+    data_connector::{
+        SharedConversationItemStorage, SharedConversationStorage, SharedResponseStorage,
+    },
     policies::PolicyRegistry,
     protocols::{
         chat::ChatCompletionRequest,
```

@@ -48,6 +56,14 @@ pub struct GrpcRouter {
```diff
     configured_tool_parser: Option<String>,
     pipeline: RequestPipeline,
     shared_components: Arc<SharedComponents>,
+    // Storage backends for /v1/responses support
+    response_storage: SharedResponseStorage,
+    conversation_storage: SharedConversationStorage,
+    conversation_item_storage: SharedConversationItemStorage,
+    // Optional MCP manager for tool execution (enabled via SGLANG_MCP_CONFIG env var)
+    mcp_manager: Option<Arc<crate::mcp::McpClientManager>>,
+    // Background task handles for cancellation support (includes gRPC client for Python abort)
+    background_tasks: Arc<RwLock<HashMap<String, BackgroundTaskInfo>>>,
 }

 impl GrpcRouter {
```

@@ -73,6 +89,31 @@ impl GrpcRouter {
```diff
         let worker_registry = ctx.worker_registry.clone();
         let policy_registry = ctx.policy_registry.clone();

+        // Extract storage backends from context
+        let response_storage = ctx.response_storage.clone();
+        let conversation_storage = ctx.conversation_storage.clone();
+        let conversation_item_storage = ctx.conversation_item_storage.clone();
+
+        // Optional MCP manager activation via env var path (config-driven gate)
+        let mcp_manager = match std::env::var("SGLANG_MCP_CONFIG").ok() {
+            Some(path) if !path.trim().is_empty() => {
+                match crate::mcp::McpConfig::from_file(&path).await {
+                    Ok(cfg) => match crate::mcp::McpClientManager::new(cfg).await {
+                        Ok(mgr) => Some(Arc::new(mgr)),
+                        Err(err) => {
+                            tracing::warn!("Failed to initialize MCP manager: {}", err);
+                            None
+                        }
+                    },
+                    Err(err) => {
+                        tracing::warn!("Failed to load MCP config from '{}': {}", path, err);
+                        None
+                    }
+                }
+            }
+            _ => None,
+        };
+
         // Create shared components for pipeline
         let shared_components = Arc::new(SharedComponents {
             tokenizer: tokenizer.clone(),
```

@@ -104,6 +145,11 @@ impl GrpcRouter {
```diff
             configured_tool_parser: ctx.configured_tool_parser.clone(),
             pipeline,
             shared_components,
+            response_storage,
+            conversation_storage,
+            conversation_item_storage,
+            mcp_manager,
+            background_tasks: Arc::new(RwLock::new(HashMap::new())),
         })
     }
```

@@ -217,24 +263,45 @@ impl RouterTrait for GrpcRouter {
```diff
     async fn route_responses(
         &self,
-        _headers: Option<&HeaderMap>,
-        _body: &ResponsesRequest,
-        _model_id: Option<&str>,
+        headers: Option<&HeaderMap>,
+        body: &ResponsesRequest,
+        model_id: Option<&str>,
     ) -> Response {
-        (StatusCode::NOT_IMPLEMENTED).into_response()
+        // Use responses module for ALL requests (streaming and non-streaming)
+        // Responses module handles:
+        // - Request validation (previous_response_id XOR conversation)
+        // - Loading response chain / conversation history from storage
+        // - Conversion: ResponsesRequest → ChatCompletionRequest
+        // - Execution through chat pipeline stages
+        // - Conversion: ChatCompletionResponse → ResponsesResponse
+        // - Response persistence
+        // - MCP tool loop wrapper (future)
+        responses::route_responses(
+            &self.pipeline,
+            Arc::new(body.clone()),
+            headers.cloned(),
+            model_id.map(|s| s.to_string()),
+            self.shared_components.clone(),
+            self.response_storage.clone(),
+            self.conversation_storage.clone(),
+            self.conversation_item_storage.clone(),
+            self.background_tasks.clone(),
+        )
+        .await
     }

     async fn get_response(
         &self,
         _headers: Option<&HeaderMap>,
-        _response_id: &str,
+        response_id: &str,
         _params: &ResponsesGetParams,
     ) -> Response {
-        (StatusCode::NOT_IMPLEMENTED).into_response()
+        responses::get_response_impl(&self.response_storage, response_id).await
     }

-    async fn cancel_response(&self, _headers: Option<&HeaderMap>, _response_id: &str) -> Response {
-        (StatusCode::NOT_IMPLEMENTED).into_response()
+    async fn cancel_response(&self, _headers: Option<&HeaderMap>, response_id: &str) -> Response {
+        responses::cancel_response_impl(&self.response_storage, &self.background_tasks, response_id)
+            .await
     }

     async fn route_classify(
```
sgl-router/src/routers/openai/conversations.rs (+5 −2)

@@ -62,7 +62,10 @@ pub(super) async fn create_conversation(
```diff
         None => None,
     };

-    let new_conv = NewConversation { metadata };
+    let new_conv = NewConversation {
+        id: None, // Generate random ID (OpenAI behavior for POST /v1/conversations)
+        metadata,
+    };

     match conversation_storage.create_conversation(new_conv).await {
         Ok(conversation) => {
```

@@ -952,7 +955,7 @@ fn item_to_json(item: &crate::data_connector::conversation_items::ConversationItem)
```diff
 // ============================================================================

 /// Persist conversation items (delegates to persist_items_with_storages)
-pub(super) async fn persist_conversation_items(
+pub async fn persist_conversation_items(
     conversation_storage: Arc<dyn ConversationStorage>,
     item_storage: Arc<dyn ConversationItemStorage>,
     response_storage: Arc<dyn ResponseStorage>,
```
sgl-router/src/routers/openai/mcp.rs (+1 −1)

@@ -129,7 +129,7 @@ impl FunctionCallInProgress {
```diff
 // ============================================================================

 /// Build a request-scoped MCP manager from request tools, if present.
-pub(super) async fn mcp_manager_from_request_tools(
+pub async fn mcp_manager_from_request_tools(
     tools: &[ResponseTool],
 ) -> Option<Arc<McpClientManager>> {
     let tool = tools
```
sgl-router/src/routers/openai/mod.rs (+2 −2)

@@ -7,8 +7,8 @@
```diff
 //! - Multi-turn tool execution loops
 //! - SSE (Server-Sent Events) streaming

-mod conversations;
-mod mcp;
+pub mod conversations;
+pub mod mcp;
 mod responses;
 mod router;
 mod streaming;
```