feat: query instance_id based on routing strategy (#1787)

f3d784f3 · Biswa Panda · GitHub · 13560ab2 · f3d784f3 · f3d784f3
Unverified commit f3d784f3, authored Jul 23, 2025 by Biswa Panda; committed by GitHub on Jul 23, 2025
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 62 additions and 50 deletions

lib/llm/src/kv_router.rs +61 -49

lib/llm/src/preprocessor.rs +1 -1

No files found.
--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
@@ -313,19 +313,31 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
            InstanceSource::Dynamic(_) => {
                // Extract context ID for request tracking
                let context_id = request.context().id().to_string();
                let (instance_id, overlap_amount) = self
                    .chooser
                    .find_best_match(&context_id, &request.token_ids)
                    .await?;
+                let query_instance_id = request.has_annotation("query_instance_id");
+                // Extract context information before moving the request
+                let stream_context = request.context().clone();
                // Update the request with the estimated prefix hit blocks
                let (mut backend_input, context) = request.into_parts();
                let isl = backend_input.token_ids.len();
                backend_input.estimated_prefix_hit_num_blocks = Some(overlap_amount);
                let updated_request = context.map(|_| backend_input);
+                // If the request carries the "query_instance_id" annotation, for example
+                // curl -d '{... ,"nvext": { "annotations": ["query_instance_id"]}}'
+                // it is not routed to a worker; the selected worker's instance id is
+                // returned as a "worker_instance_id" annotation instead.
+                if query_instance_id {
+                    let instance_id_str = instance_id.to_string();
+                    let response =
+                        Annotated::from_annotation("worker_instance_id", &instance_id_str)?;
+                    let stream = stream::iter(vec![response]);
+                    Ok(ResponseStream::new(Box::pin(stream), stream_context))
+                } else {
                    // Get the response stream from the worker
-                let mut response_stream = self.inner.direct(updated_request, instance_id).await?;
+                    let mut response_stream =
+                        self.inner.direct(updated_request, instance_id).await?;
                    // Wrap the stream to track tokens
                    let stream_context = response_stream.context();
@@ -374,9 +386,9 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
                        chooser.free(&request_id).await;
                    });
                    Ok(ResponseStream::new(wrapped_stream, stream_context))
                }
            }
        }
+    }
 }
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -397,9 +397,9 @@ impl OpenAIPreprocessor {
                        // Only set event if not already set to avoid overriding existing events (like errors)
                        if response.event.is_none() {
                            response.event = metrics_annotated.event;
-                        }
                            response.comment = metrics_annotated.comment;
                        }
+                    }
                    tracing::trace!(
                        request_id = inner.context.id(),