Unverified commit bc514fbe, authored by Yan Ru Pei and committed by GitHub
Browse files

feat: router priority queue (#6010)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
Co-authored-by: ishandhanani <82981111+ishandhanani@users.noreply.github.com>
parent 4673e47f
......@@ -51,6 +51,12 @@ pub struct RoutingHints {
/// Used for LORA-aware routing and tracking.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub lora_name: Option<String>,
/// Priority jump in seconds for queue ordering.
/// A positive value decreases the effective arrival time, moving the request
/// ahead in the scheduler queue.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub priority_jump: Option<f64>,
}
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
......
......@@ -180,11 +180,27 @@ pub struct NvExt {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub enable_local_updates: Option<bool>,
/// Expected number of output tokens for this request.
/// Used as a hint for routing decisions to estimate resource requirements.
/// Agent-provided hints for request handling.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected_output_tokens: Option<u32>,
pub agent_hints: Option<AgentHints>,
}
/// Hints from the agent/caller about request characteristics.
///
/// Every field is optional. When deserializing, a missing field becomes `None`
/// (`#[serde(default)]`); when serializing, a `None` field is left out of the
/// output entirely (`skip_serializing_if = "Option::is_none"`).
///
/// NOTE(review): the `Builder` derive with `setter(strip_option)` presumably
/// generates setters that take the inner value rather than an `Option` —
/// confirm against the builder crate in use.
#[derive(ToSchema, Serialize, Deserialize, Builder, Debug, Clone, Default)]
pub struct AgentHints {
/// Latency sensitivity in seconds for queue ordering.
/// Higher values cause the request to be scheduled sooner when the router queue is enabled.
///
/// `None` means the caller supplied no latency hint.
///
/// NOTE(review): the type is a plain `f64` and no range validation is visible
/// in this struct; confirm how negative or non-finite values are handled by
/// the consumer of this hint.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub latency_sensitivity: Option<f64>,
/// Expected output sequence length (number of output tokens).
/// Used as a hint for routing decisions to estimate resource requirements
/// and for output block tracking decay.
///
/// `None` means the caller supplied no estimate.
#[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub osl: Option<u32>,
}
impl Default for NvExt {
......@@ -234,7 +250,6 @@ mod tests {
assert_eq!(nv_ext.prefill_worker_id, None);
assert_eq!(nv_ext.decode_worker_id, None);
assert_eq!(nv_ext.enable_local_updates, None);
assert_eq!(nv_ext.expected_output_tokens, None);
}
// Test valid builder configurations
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment