Unverified commit bc514fbe, authored by Yan Ru Pei and committed by GitHub
Browse files

feat: router priority queue (#6010)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com>
parent 4673e47f
...@@ -51,6 +51,12 @@ pub struct RoutingHints { ...@@ -51,6 +51,12 @@ pub struct RoutingHints {
/// Used for LORA-aware routing and tracking. /// Used for LORA-aware routing and tracking.
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub lora_name: Option<String>, pub lora_name: Option<String>,
/// Priority jump in seconds for queue ordering.
/// A positive value decreases the effective arrival time, moving the request
/// ahead in the scheduler queue.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub priority_jump: Option<f64>,
} }
#[derive(Serialize, Deserialize, Debug, Clone, Default)] #[derive(Serialize, Deserialize, Debug, Clone, Default)]
......
...@@ -180,11 +180,27 @@ pub struct NvExt { ...@@ -180,11 +180,27 @@ pub struct NvExt {
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub enable_local_updates: Option<bool>, pub enable_local_updates: Option<bool>,
/// Expected number of output tokens for this request. /// Agent-provided hints for request handling.
/// Used as a hint for routing decisions to estimate resource requirements.
#[builder(default, setter(strip_option))] #[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_output_tokens: Option<u32>, pub agent_hints: Option<AgentHints>,
}
/// Hints from the agent/caller about request characteristics.
///
/// Every field is optional. A field missing from the input deserializes to
/// `None`, and a field that is `None` is left out of the serialized output.
#[derive(ToSchema, Serialize, Deserialize, Builder, Debug, Clone, Default)]
pub struct AgentHints {
/// Latency sensitivity in seconds for queue ordering.
/// Higher values cause the request to be scheduled sooner when the router queue is enabled.
///
/// NOTE(review): presumably this is what populates `RoutingHints::priority_jump`;
/// confirm against the router code. How negative or non-finite values are
/// treated is not specified here.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub latency_sensitivity: Option<f64>,
/// Expected output sequence length (number of output tokens).
/// Used as a hint for routing decisions to estimate resource requirements
/// and for output block tracking decay.
///
/// `None` means the caller supplied no estimate.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub osl: Option<u32>,
} }
impl Default for NvExt { impl Default for NvExt {
...@@ -234,7 +250,6 @@ mod tests { ...@@ -234,7 +250,6 @@ mod tests {
assert_eq!(nv_ext.prefill_worker_id, None); assert_eq!(nv_ext.prefill_worker_id, None);
assert_eq!(nv_ext.decode_worker_id, None); assert_eq!(nv_ext.decode_worker_id, None);
assert_eq!(nv_ext.enable_local_updates, None); assert_eq!(nv_ext.enable_local_updates, None);
assert_eq!(nv_ext.expected_output_tokens, None);
} }
// Test valid builder configurations // Test valid builder configurations
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment