feat: skip router when worker id is pre-determined (#2450)

Co-authored-by: Biswa Panda <biswa.panda@gmail.com>

feat: skip router when worker id is pre-determined (#2450)
Co-authored-by: Biswa Panda <biswa.panda@gmail.com>
6bc6d400 · atchernych · GitHub · 57d4fa05 · 6bc6d400 · 6bc6d400
Unverified Commit 6bc6d400 authored Aug 19, 2025 by atchernych Committed by GitHub Aug 19, 2025
6 changed files
--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
@@ -350,10 +350,16 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
            InstanceSource::Dynamic(_) => {
                // Extract context ID for request tracking
                let context_id = request.context().id().to_string();
-                let (instance_id, overlap_amount) = self
+                let (instance_id, overlap_amount) = if let Some(id) = request.backend_instance_id {
-                    .chooser
+                    // Worker was pre-selected on the request: skip the router, report overlap as 0
-                    .find_best_match(&context_id, &request.token_ids)
+                    (id, 0)
-                    .await?;
+                } else {
+                    // Otherwise, find the best match
+                    self.chooser
+                        .find_best_match(&context_id, &request.token_ids)
+                        .await?
+                };
                let query_instance_id = request.has_annotation("query_instance_id");
                // Extract context information before moving the request
                let stream_context = request.context().clone();

--- a/lib/llm/src/migration.rs
+++ b/lib/llm/src/migration.rs
@@ -188,6 +188,7 @@ mod tests {
            mdc_sum: None,
            annotations: vec![],
            estimated_prefix_hit_num_blocks: None,
+            backend_instance_id: None,
        }
    }

--- a/lib/llm/src/mocker/engine.rs
+++ b/lib/llm/src/mocker/engine.rs
@@ -646,6 +646,7 @@ mod integration_tests {
            mdc_sum: None,
            annotations: vec![format!("dp_rank:{dp_rank}")],
            estimated_prefix_hit_num_blocks: None,
+            backend_instance_id: None,
        };
        let requests = vec![

--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -254,6 +254,10 @@ impl OpenAIPreprocessor {
        builder.annotations(request.annotations().unwrap_or_default());
        builder.mdc_sum(Some(self.mdcsum.clone()));
        builder.estimated_prefix_hit_num_blocks(None);
+        // Extract backend_instance_id from nvext if present
+        if let Some(nvext) = request.nvext() {
+            builder.backend_instance_id(nvext.backend_instance_id);
+        }
        Ok((builder.build()?, annotations))
    }

--- a/lib/llm/src/protocols/common/preprocessor.rs
+++ b/lib/llm/src/protocols/common/preprocessor.rs
@@ -50,6 +50,10 @@ pub struct PreprocessedRequest {
    /// Estimated number of prefix hit tokens (only used in kv aware routing)
    #[builder(default)]
    pub estimated_prefix_hit_num_blocks: Option<u32>,
+    /// Targeted backend instance ID; when set, the KV router skips its best-match selection
+    #[builder(default)]
+    pub backend_instance_id: Option<i64>,
 }
 impl PreprocessedRequest {

--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -62,6 +62,12 @@ pub struct NvExt {
    #[builder(default, setter(strip_option))]
    pub annotations: Option<Vec<String>>,
+    /// Targeted backend instance ID for the request
+    /// If set, the request will be routed to the backend instance with the given ID.
+    /// If not set, the request will be routed to the best matching instance.
+    #[builder(default, setter(strip_option))]
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub backend_instance_id: Option<i64>,
    /// Guided Decoding Options
    /// If specified, the output will be a JSON object. Can be a string, an object, or null.
    #[serde(default, skip_serializing_if = "Option::is_none")]