Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
9498f016
Unverified
Commit
9498f016
authored
Apr 13, 2026
by
ishandhanani
Committed by
GitHub
Apr 13, 2026
Browse files
feat(sglang): add ephemeral KV session routing (#7665)
Signed-off-by:
Ishan Dhanani
<
ishandhanani@gmail.com
>
parent
6bfc6d1f
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
85 additions
and
143 deletions
+85
-143
lib/llm/src/protocols/anthropic/types.rs
lib/llm/src/protocols/anthropic/types.rs
+0
-143
lib/llm/src/protocols/common/preprocessor.rs
lib/llm/src/protocols/common/preprocessor.rs
+5
-0
lib/llm/src/protocols/openai/nvext.rs
lib/llm/src/protocols/openai/nvext.rs
+80
-0
No files found.
lib/llm/src/protocols/anthropic/types.rs
View file @
9498f016
...
@@ -1384,88 +1384,6 @@ mod tests {
...
@@ -1384,88 +1384,6 @@ mod tests {
assert_eq!
(
tools
[
1
]
.id
,
"t2"
);
assert_eq!
(
tools
[
1
]
.id
,
"t2"
);
}
}
#[test]
fn test_cache_control_passthrough() {
    use dynamo_protocols::types::anthropic::{CacheControl, CacheControlType};

    // Top-level ephemeral cache_control without a TTL.
    let ephemeral = CacheControl {
        control_type: CacheControlType::Ephemeral,
        ttl: None,
    };
    let user_turn = AnthropicMessage {
        role: AnthropicRole::User,
        content: AnthropicMessageContent::Text {
            content: "Hello".into(),
        },
    };
    let anthropic_req = AnthropicCreateMessageRequest {
        model: "test-model".into(),
        max_tokens: 100,
        messages: vec![user_turn],
        system: None,
        temperature: None,
        top_p: None,
        top_k: None,
        stop_sequences: None,
        stream: false,
        metadata: None,
        tools: None,
        tool_choice: None,
        cache_control: Some(ephemeral),
        thinking: None,
        service_tier: None,
        container: None,
        output_config: None,
    };

    // The converted chat request is expected to carry no nvext.
    let converted: NvCreateChatCompletionRequest = anthropic_req.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_cache_control_1h_ttl_passthrough() {
    // Request body with a top-level ephemeral cache_control and a "1h" TTL.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"cache_control": {"type": "ephemeral", "ttl": "1h"}
}"#;

    let anthropic_req: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();
    // The top-level cache_control must survive deserialization.
    assert!(anthropic_req.cache_control.is_some());

    // The converted chat request is expected to carry no nvext.
    let converted: NvCreateChatCompletionRequest = anthropic_req.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_no_cache_control_passthrough() {
    let user_turn = AnthropicMessage {
        role: AnthropicRole::User,
        content: AnthropicMessageContent::Text {
            content: "Hello".into(),
        },
    };
    // Same minimal request as the cache_control tests, but with cache_control unset.
    let anthropic_req = AnthropicCreateMessageRequest {
        model: "test-model".into(),
        max_tokens: 100,
        messages: vec![user_turn],
        system: None,
        temperature: None,
        top_p: None,
        top_k: None,
        stop_sequences: None,
        stream: false,
        metadata: None,
        tools: None,
        tool_choice: None,
        cache_control: None,
        thinking: None,
        service_tier: None,
        container: None,
        output_config: None,
    };

    // The converted chat request is expected to carry no nvext.
    let converted: NvCreateChatCompletionRequest = anthropic_req.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
#[test]
fn
test_per_block_cache_control_deserialization
()
{
fn
test_per_block_cache_control_deserialization
()
{
let
json
=
r#"{
let
json
=
r#"{
...
@@ -1499,67 +1417,6 @@ mod tests {
...
@@ -1499,67 +1417,6 @@ mod tests {
}
}
}
}
#[test]
fn test_per_block_cache_control_last_wins() {
    // Two text blocks in one user message, each with its own cache_control;
    // only the second one specifies a TTL.
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "system context", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "recent context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
]
}"#;

    let anthropic_req: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();

    // The converted chat request is expected to carry no nvext.
    let converted: NvCreateChatCompletionRequest = anthropic_req.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_top_level_cache_control_overrides_per_block() {
    // A per-block cache_control (with TTL) alongside a top-level cache_control (without TTL).
    let payload = r#"{
"model": "test",
"max_tokens": 100,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "context", "cache_control": {"type": "ephemeral", "ttl": "1h"}}
]
}
],
"cache_control": {"type": "ephemeral"}
}"#;

    let anthropic_req: AnthropicCreateMessageRequest = serde_json::from_str(payload).unwrap();

    // The converted chat request is expected to carry no nvext.
    let converted: NvCreateChatCompletionRequest = anthropic_req.try_into().unwrap();
    assert!(converted.nvext.is_none());
}
#[test]
fn test_system_block_array_with_cache_control() {
    // `system` given as an array of text blocks; only the first block carries cache_control.
    let json = r#"{
"model": "test",
"max_tokens": 100,
"messages": [{"role": "user", "content": "Hello"}],
"system": [
{"type": "text", "text": "You are a helpful assistant.", "cache_control": {"type": "ephemeral"}},
{"type": "text", "text": "Be concise."}
]
}"#;

    let req: AnthropicCreateMessageRequest = serde_json::from_str(json).unwrap();

    // The block texts are expected to be merged with a single `\n` separator.
    // The literal must stay on one line: a physical line break inside a Rust
    // string literal becomes part of the string and would change the expected text.
    let system = req.system.as_ref().unwrap();
    assert_eq!(system.text, "You are a helpful assistant.\nBe concise.");

    // The last block that carries cache_control wins. Only the first block has
    // one here, so the merged system prompt must still expose a cache_control.
    assert!(system.cache_control.is_some());

    // The converted chat request is expected to carry no nvext.
    let chat_req: NvCreateChatCompletionRequest = req.try_into().unwrap();
    assert!(chat_req.nvext.is_none());
}
#[test]
#[test]
fn
test_system_string_no_cache_control
()
{
fn
test_system_string_no_cache_control
()
{
let
json
=
r#"{
let
json
=
r#"{
...
...
lib/llm/src/protocols/common/preprocessor.rs
View file @
9498f016
...
@@ -66,6 +66,11 @@ pub struct RoutingHints {
...
@@ -66,6 +66,11 @@ pub struct RoutingHints {
/// When set, only workers in this set are considered during scoring.
/// When set, only workers in this set are considered during scoring.
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
allowed_worker_ids
:
Option
<
HashSet
<
WorkerId
>>
,
pub
allowed_worker_ids
:
Option
<
HashSet
<
WorkerId
>>
,
/// Session control for subagent KV isolation and sticky routing.
/// Contains session_id (for affinity) and optional action (open/close).
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
session_control
:
Option
<
crate
::
protocols
::
openai
::
nvext
::
SessionControl
>
,
}
}
#[derive(Serialize,
Deserialize,
Debug,
Clone,
Default)]
#[derive(Serialize,
Deserialize,
Debug,
Clone,
Default)]
...
...
lib/llm/src/protocols/openai/nvext.rs
View file @
9498f016
...
@@ -202,6 +202,14 @@ pub struct NvExt {
...
@@ -202,6 +202,14 @@ pub struct NvExt {
#[builder(default,
setter(strip_option))]
#[builder(default,
setter(strip_option))]
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
request_timestamp_ms
:
Option
<
f64
>
,
pub
request_timestamp_ms
:
Option
<
f64
>
,
/// Session control for subagent KV isolation and sticky routing.
/// When present, the router uses `session_id` for worker affinity.
/// When `action` is set to `open` or `close`, the router also fires
/// session lifecycle RPCs to the worker.
#[builder(default,
setter(strip_option))]
#[serde(default,
skip_serializing_if
=
"Option::is_none"
)]
pub
session_control
:
Option
<
SessionControl
>
,
}
}
/// Hints from the agent/caller about request characteristics.
/// Hints from the agent/caller about request characteristics.
...
@@ -237,6 +245,36 @@ pub struct AgentHints {
...
@@ -237,6 +245,36 @@ pub struct AgentHints {
pub
latency_sensitivity
:
Option
<
f64
>
,
pub
latency_sensitivity
:
Option
<
f64
>
,
}
}
/// Serde fallback for the session `timeout` field when the payload omits it.
fn default_session_timeout() -> u64 {
    const DEFAULT_TIMEOUT: u64 = 300;
    DEFAULT_TIMEOUT
}
/// Per-request session descriptor for subagent KV isolation and sticky routing.
///
/// `session_id` is mandatory on every turn. `action` marks the lifecycle
/// boundaries of the session and is optional:
/// - first turn: `action: "open"` creates a streaming session on the worker
/// - last turn: `action: "close"` frees the session KV after generation
/// - intermediate turns: no `action`; `session_id` alone provides sticky routing
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct SessionControl {
    /// Unique identifier of the session, sent on every turn for sticky routing.
    pub session_id: String,

    /// Lifecycle action (`"open"` or `"close"`). Left out on intermediate turns,
    /// and not serialized when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub action: Option<SessionAction>,

    /// Inactivity timeout in seconds. Falls back to 300 when omitted from the
    /// payload; only used together with `action: "open"`.
    #[serde(default = "default_session_timeout")]
    pub timeout: u64,
}
/// Lifecycle transitions a request may ask for on its session.
///
/// Variants use snake_case on the wire, i.e. `"open"` and `"close"`.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SessionAction {
    /// Start of a session (wire value `"open"`).
    Open,
    /// End of a session (wire value `"close"`).
    Close,
}
impl
Default
for
NvExt
{
impl
Default
for
NvExt
{
fn
default
()
->
Self
{
fn
default
()
->
Self
{
NvExt
::
builder
()
.build
()
.unwrap
()
NvExt
::
builder
()
.build
()
.unwrap
()
...
@@ -285,6 +323,7 @@ mod tests {
...
@@ -285,6 +323,7 @@ mod tests {
assert_eq!
(
nv_ext
.decode_worker_id
,
None
);
assert_eq!
(
nv_ext
.decode_worker_id
,
None
);
assert_eq!
(
nv_ext
.agent_hints
,
None
);
assert_eq!
(
nv_ext
.agent_hints
,
None
);
assert_eq!
(
nv_ext
.request_timestamp_ms
,
None
);
assert_eq!
(
nv_ext
.request_timestamp_ms
,
None
);
assert_eq!
(
nv_ext
.session_control
,
None
);
}
}
// Test valid builder configurations
// Test valid builder configurations
...
@@ -324,6 +363,47 @@ mod tests {
...
@@ -324,6 +363,47 @@ mod tests {
assert
!
(
nv_ext
.validate
()
.is_ok
());
assert
!
(
nv_ext
.validate
()
.is_ok
());
}
}
#[test]
fn test_session_control_serde() {
    // First turn: explicit "open" with a caller-supplied timeout.
    let opened: SessionControl =
        serde_json::from_str(r#"{"session_id": "sub-1", "action": "open", "timeout": 60}"#)
            .unwrap();
    assert_eq!(opened.action, Some(SessionAction::Open));
    assert_eq!(opened.session_id, "sub-1");
    assert_eq!(opened.timeout, 60);

    // Last turn: "close" without a timeout falls back to the default of 300.
    let closed: SessionControl =
        serde_json::from_str(r#"{"session_id": "sub-1", "action": "close"}"#).unwrap();
    assert_eq!(closed.action, Some(SessionAction::Close));
    assert_eq!(closed.timeout, 300);

    // Intermediate turn: only session_id, used for sticky routing.
    let sticky: SessionControl = serde_json::from_str(r#"{"session_id": "sub-1"}"#).unwrap();
    assert_eq!(sticky.action, None);
    assert_eq!(sticky.session_id, "sub-1");

    // SessionControl nested inside an NvExt payload.
    let ext: NvExt = serde_json::from_str(
        r#"{"session_control": {"session_id": "sub-2", "action": "open", "timeout": 300}}"#,
    )
    .unwrap();
    assert!(ext.session_control.is_some());
    let nested = ext.session_control.unwrap();
    assert_eq!(nested.action, Some(SessionAction::Open));
    assert_eq!(nested.session_id, "sub-2");

    // Serialize then deserialize must reproduce the original value.
    let before = SessionControl {
        session_id: "test-session".to_string(),
        action: Some(SessionAction::Close),
        timeout: 90,
    };
    let encoded = serde_json::to_string(&before).unwrap();
    let after: SessionControl = serde_json::from_str(&encoded).unwrap();
    assert_eq!(after, before);
}
#[test]
#[test]
fn
test_apply_header_routing_overrides
()
{
fn
test_apply_header_routing_overrides
()
{
use
axum
::
http
::
HeaderMap
;
use
axum
::
http
::
HeaderMap
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment