"deploy/operator/api/v1beta1/groupversion_info.go" did not exist on "cf433e6825d83f41905da47d69ca5ee30d4eb1ba"
Unverified Commit 9498f016 authored by ishandhanani's avatar ishandhanani Committed by GitHub
Browse files

feat(sglang): add ephemeral KV session routing (#7665)


Signed-off-by: Ishan Dhanani <ishandhanani@gmail.com>
parent 6bfc6d1f
......@@ -1384,88 +1384,6 @@ mod tests {
assert_eq!(tools[1].id, "t2");
}
#[test]
fn test_cache_control_passthrough() {
    use dynamo_protocols::types::anthropic::{CacheControl, CacheControlType};

    // Top-level ephemeral cache_control without an explicit TTL.
    let ephemeral = CacheControl {
        control_type: CacheControlType::Ephemeral,
        ttl: None,
    };
    // Single plain-text user turn.
    let user_turn = AnthropicMessage {
        role: AnthropicRole::User,
        content: AnthropicMessageContent::Text {
            content: "Hello".into(),
        },
    };
    let request = AnthropicCreateMessageRequest {
        model: "test-model".into(),
        max_tokens: 100,
        messages: vec![user_turn],
        system: None,
        temperature: None,
        top_p: None,
        top_k: None,
        stop_sequences: None,
        stream: false,
        metadata: None,
        tools: None,
        tool_choice: None,
        cache_control: Some(ephemeral),
        thinking: None,
        service_tier: None,
        container: None,
        output_config: None,
    };

    // The conversion must succeed and must not attach an nvext payload.
    let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_cache_control_1h_ttl_passthrough() {
    // Request body carrying a top-level cache_control with an explicit "1h" TTL.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"cache_control": {"type": "ephemeral", "ttl": "1h"}
}"#;

    // The field must survive deserialization...
    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();
    assert!(parsed.cache_control.is_some());

    // ...while the converted chat request carries no nvext.
    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_no_cache_control_passthrough() {
    // Single plain-text user turn; the request sets no cache_control at all.
    let user_turn = AnthropicMessage {
        role: AnthropicRole::User,
        content: AnthropicMessageContent::Text {
            content: "Hello".into(),
        },
    };
    let request = AnthropicCreateMessageRequest {
        model: "test-model".into(),
        max_tokens: 100,
        messages: vec![user_turn],
        system: None,
        temperature: None,
        top_p: None,
        top_k: None,
        stop_sequences: None,
        stream: false,
        metadata: None,
        tools: None,
        tool_choice: None,
        cache_control: None,
        thinking: None,
        service_tier: None,
        container: None,
        output_config: None,
    };

    // The conversion must succeed and must not attach an nvext payload.
    let converted: NvCreateChatCompletionRequest = request.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_per_block_cache_control_deserialization() {
let json = r#"{
......@@ -1499,67 +1417,6 @@ mod tests {
}
}
#[test]
fn test_per_block_cache_control_last_wins() {
    // Two content blocks, each with its own cache_control; the second adds a "1h" TTL.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "system context", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "recent context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
]
}"#;

    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    // Per-block cache_control must not produce an nvext on the converted request.
    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_top_level_cache_control_overrides_per_block() {
    // A per-block cache_control ("1h" TTL) alongside a top-level one (no TTL).
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
],
"cache_control": {"type": "ephemeral"}
}"#;

    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    // Neither level of cache_control may surface as nvext after conversion.
    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_system_block_array_with_cache_control() {
    // System prompt given as an array of blocks; only the first carries cache_control.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"system": [
{"type": "text", "text": "You are a helpful assistant.", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "Be concise."}
]
}"#;

    let parsed = serde_json::from_str::<AnthropicCreateMessageRequest>(payload).unwrap();

    // The block texts are joined with a newline into a single system prompt.
    let merged_system = parsed.system.as_ref().unwrap();
    assert_eq!(
        merged_system.text,
        "You are a helpful assistant.\nBe concise."
    );
    // Among the blocks that carry cache_control, the last one wins; here only
    // the first block has one, so the merged prompt must still report it.
    assert!(merged_system.cache_control.is_some());

    let converted: NvCreateChatCompletionRequest = parsed.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_system_string_no_cache_control() {
let json = r#"{
......
......@@ -66,6 +66,11 @@ pub struct RoutingHints {
/// When set, only workers in this set are considered during scoring.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub allowed_worker_ids: Option<HashSet<WorkerId>>,
/// Session control for subagent KV isolation and sticky routing.
/// Contains session_id (for affinity) and optional action (open/close).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub session_control: Option<crate::protocols::openai::nvext::SessionControl>,
}
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
......
......@@ -202,6 +202,14 @@ pub struct NvExt {
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub request_timestamp_ms: Option<f64>,
/// Session control for subagent KV isolation and sticky routing.
/// When present, the router uses `session_id` for worker affinity.
/// When `action` is set to `open` or `close`, the router also fires
/// session lifecycle RPCs to the worker.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub session_control: Option<SessionControl>,
}
/// Hints from the agent/caller about request characteristics.
......@@ -237,6 +245,36 @@ pub struct AgentHints {
pub latency_sensitivity: Option<f64>,
}
/// Serde default for `SessionControl::timeout`: 300 seconds of inactivity.
fn default_session_timeout() -> u64 {
    const DEFAULT_SESSION_TIMEOUT_SECS: u64 = 300;
    DEFAULT_SESSION_TIMEOUT_SECS
}
/// Session control for subagent KV isolation and sticky routing.
///
/// Always requires `session_id`. The `action` field is optional:
/// - `action: "open"` on the first turn creates a streaming session on the worker
/// - `action: "close"` on the last turn frees session KV after generation
/// - No `action` on intermediate turns -- just provides `session_id` for sticky routing
///
/// Wire-format notes:
/// - `session_id` has no serde default, so a payload that omits it fails to deserialize.
/// - `action` is left out of the serialized output when it is `None`.
/// - `timeout` is always serialized; when absent from the input it is filled in
///   by `default_session_timeout()` (300).
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct SessionControl {
/// Unique session identifier. Present on every turn for sticky routing.
pub session_id: String,
/// Lifecycle action: `"open"` or `"close"`. Omit on intermediate turns.
/// Deserializes to `None` when the key is missing.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub action: Option<SessionAction>,
/// Inactivity timeout in seconds (default 300, only used with `action: "open"`).
/// The default is applied at deserialization time regardless of `action`.
#[serde(default = "default_session_timeout")]
pub timeout: u64,
}
/// Session lifecycle actions.
///
/// Variants are (de)serialized in snake_case, so the wire values are
/// `"open"` and `"close"`.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SessionAction {
/// Sent on the first turn of a session (wire value `"open"`).
Open,
/// Sent on the last turn of a session (wire value `"close"`).
Close,
}
impl Default for NvExt {
fn default() -> Self {
NvExt::builder().build().unwrap()
......@@ -285,6 +323,7 @@ mod tests {
assert_eq!(nv_ext.decode_worker_id, None);
assert_eq!(nv_ext.agent_hints, None);
assert_eq!(nv_ext.request_timestamp_ms, None);
assert_eq!(nv_ext.session_control, None);
}
// Test valid builder configurations
......@@ -324,6 +363,47 @@ mod tests {
assert!(nv_ext.validate().is_ok());
}
#[test]
fn test_session_control_serde() {
    // "open" with an explicit timeout: every field comes from the payload.
    let open_payload = r#"{"session_id": "sub-1", "action": "open", "timeout": 60}"#;
    let opened = serde_json::from_str::<SessionControl>(open_payload).unwrap();
    assert_eq!(opened.action, Some(SessionAction::Open));
    assert_eq!(opened.session_id, "sub-1");
    assert_eq!(opened.timeout, 60);

    // "close" without a timeout: the serde default (300) is filled in.
    let close_payload = r#"{"session_id": "sub-1", "action": "close"}"#;
    let closed = serde_json::from_str::<SessionControl>(close_payload).unwrap();
    assert_eq!(closed.action, Some(SessionAction::Close));
    assert_eq!(closed.timeout, 300);

    // Intermediate turn: only session_id is present, so there is no action.
    let continue_payload = r#"{"session_id": "sub-1"}"#;
    let continued = serde_json::from_str::<SessionControl>(continue_payload).unwrap();
    assert_eq!(continued.action, None);
    assert_eq!(continued.session_id, "sub-1");

    // session_control nested inside an NvExt payload.
    let nvext_payload =
        r#"{"session_control": {"session_id": "sub-2", "action": "open", "timeout": 300}}"#;
    let ext = serde_json::from_str::<NvExt>(nvext_payload).unwrap();
    assert!(ext.session_control.is_some());
    let nested = ext.session_control.unwrap();
    assert_eq!(nested.action, Some(SessionAction::Open));
    assert_eq!(nested.session_id, "sub-2");

    // Serialize then deserialize must reproduce the original value.
    let original = SessionControl {
        session_id: String::from("test-session"),
        action: Some(SessionAction::Close),
        timeout: 90,
    };
    let encoded = serde_json::to_string(&original).unwrap();
    let decoded = serde_json::from_str::<SessionControl>(&encoded).unwrap();
    assert_eq!(decoded, original);
}
#[test]
fn test_apply_header_routing_overrides() {
use axum::http::HeaderMap;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment