Unverified Commit 29813508 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

feat(dynamo-run): KV-aware routing (#1064)

Router:
```
dynamo-run in=http out=dyn://dynamo.endpoint.generate --router-mode kv
```

Worker (* N):
```
dynamo-run in=dyn://dynamo.endpoint.generate out=vllm /data/llms/Qwen/Qwen3-4B
```

You need patched vllm and the C bindings `.so`. Full docs in the updated guide: `docs/guides/dynamo_run.md`.

This gives us a pure-Rust ingress node: OpenAI compliant HTTP server + Pre-processor + KV-aware router.
parent b82e7327
...@@ -18,7 +18,7 @@ use std::sync::Arc; ...@@ -18,7 +18,7 @@ use std::sync::Arc;
use dynamo_llm::{ use dynamo_llm::{
http::service::{ http::service::{
discovery::{model_watcher, ModelWatchState}, discovery::{LLMRouterMode, ModelWatcher},
service_v2::HttpService, service_v2::HttpService,
}, },
model_type::ModelType, model_type::ModelType,
...@@ -83,18 +83,22 @@ async fn app(runtime: Runtime) -> Result<()> { ...@@ -83,18 +83,22 @@ async fn app(runtime: Runtime) -> Result<()> {
for model_type in [ModelType::Chat, ModelType::Completion] { for model_type in [ModelType::Chat, ModelType::Completion] {
let etcd_path = format!("{}/models/{}/", etcd_root, model_type.as_str()); let etcd_path = format!("{}/models/{}/", etcd_root, model_type.as_str());
let state = Arc::new(ModelWatchState { let watch_obj = Arc::new(
prefix: etcd_path.clone(), ModelWatcher::new(
manager: manager.clone(), component.clone(),
drt: distributed.clone(), manager.clone(),
}); &etcd_path,
LLMRouterMode::Random,
)
.await?,
);
if let Some(etcd_client) = distributed.etcd_client() { if let Some(etcd_client) = distributed.etcd_client() {
let models_watcher: PrefixWatcher = let models_watcher: PrefixWatcher =
etcd_client.kv_get_and_watch_prefix(etcd_path).await?; etcd_client.kv_get_and_watch_prefix(etcd_path).await?;
let (_prefix, _watcher, receiver) = models_watcher.dissolve(); let (_prefix, _watcher, receiver) = models_watcher.dissolve();
tokio::spawn(model_watcher(state, receiver)); tokio::spawn(watch_obj.watch(receiver));
} }
} }
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
// 2. Update the backend component to produce a config in a standard location. // 2. Update the backend component to produce a config in a standard location.
// 3. Update the KvRouter to read the config from the backend component. // 3. Update the KvRouter to read the config from the backend component.
use std::sync::Arc;
use clap::Parser; use clap::Parser;
use dynamo_llm::kv_router::{ use dynamo_llm::kv_router::{
...@@ -65,7 +67,7 @@ async fn app(runtime: Runtime) -> Result<()> { ...@@ -65,7 +67,7 @@ async fn app(runtime: Runtime) -> Result<()> {
let selector = Box::new(CustomWorkerSelector::default()); let selector = Box::new(CustomWorkerSelector::default());
let router = KvRouter::new(component.clone(), args.block_size, Some(selector)).await?; let router = KvRouter::new(component.clone(), args.block_size, Some(selector)).await?;
let router = Ingress::for_engine(router)?; let router = Ingress::for_engine(Arc::new(router))?;
component component
.service_builder() .service_builder()
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
* [Automatically download a model from Hugging Face](#use-model-from-hugging-face) * [Automatically download a model from Hugging Face](#use-model-from-hugging-face)
* [Run a model from local file](#run-a-model-from-local-file) * [Run a model from local file](#run-a-model-from-local-file)
* [Distributed system](#distributed-system) * [Distributed system](#distributed-system)
* [KV-aware routing](#kv-aware-routing)
* [Full usage details](#full-usage-details) * [Full usage details](#full-usage-details)
* [Setup](#setup) * [Setup](#setup)
* [mistral.rs](#mistralrs) * [mistral.rs](#mistralrs)
...@@ -23,7 +24,7 @@ It supports the following engines: mistralrs, llamacpp, sglang, vllm and tensorr ...@@ -23,7 +24,7 @@ It supports the following engines: mistralrs, llamacpp, sglang, vllm and tensorr
Usage: Usage:
``` ```
dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin] dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv]
``` ```
Example: `dynamo run Qwen/Qwen3-0.6B` Example: `dynamo run Qwen/Qwen3-0.6B`
...@@ -111,6 +112,43 @@ The `llama3B_pool` name is purely symbolic, pick anything as long as it matches ...@@ -111,6 +112,43 @@ The `llama3B_pool` name is purely symbolic, pick anything as long as it matches
Run `dynamo-run --help` for more options. Run `dynamo-run --help` for more options.
### KV-aware routing
**Setup**
Only patched vllm currently supports KV-aware routing. Key setup steps:
1. `etcd` and `nats` (see earlier) must be running and accessible from all nodes.
1. Create a virtualenv: `uv venv kvtest`, source it's `activate`.
1. EITHER install Dynamo's vllm branch: `uv pip install ai-dynamo-vllm`,
1. OR install upstream vllm 0.8.4 (`uv pip install vllm==0.8.4`) and patch it: `cd kvtest/lib/python3.12/site-packages`, `patch -p1 < $REPO_ROOT/container/deps/vllm/vllm_v0.8.4-dynamo-kv-disagg-patch.patch`.
1. Build the C bindings. `cd $REPO_ROOT/lib/bindings/c`. `cargo build`.
1. Put the library you just built on library path: `export LD_LIBRARY_PATH=$REPO_ROOT/target/debug/`.
If you patched locally (instead of installing `ai-dynamo-vllm`) you will need to edit vllm's `platforms/__init__.py` to undo a patch change:
```
#vllm_version = version("ai_dynamo_vllm")
vllm_version = version("vllm")
```
**Start the workers**
The workers are started normally.
```
dynamo-run in=dyn://dynamo.endpoint.generate out=vllm /data/llms/Qwen/Qwen3-4B
```
**Start the ingress node**
```
dynamo-run in=http out=dyn://dynamo.endpoint.generate --router-mode kv
```
The only difference from the distributed system above is `--router-mode kv`. The patched vllm will announce when a KV block is created or removed. The Dynamo router run will find the worker with the best match for those KV blocks and direct the traffic to that node.
For performance testing compare a typical workload with `--router-mode random|round-robin` to see if it will benefit from KV-aware routing.
## Full usage details ## Full usage details
`dynamo-run` is what `dynamo run` executes. It is also an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime` crates. The following guide demonstrates how you can build from source with all the features. `dynamo-run` is what `dynamo run` executes. It is also an example of what you can build in Rust with the `dynamo-llm` and `dynamo-runtime` crates. The following guide demonstrates how you can build from source with all the features.
...@@ -427,7 +465,7 @@ async def worker(runtime: DistributedRuntime): ...@@ -427,7 +465,7 @@ async def worker(runtime: DistributedRuntime):
# #
component = runtime.namespace("namespace").component("component") component = runtime.namespace("namespace").component("component")
await component.create_service() await component.create_service()
model_path = "Qwen/Qwen2.5-0.5B-Instruct" # or "/data/models/Qwen2.5-0.5B-Instruct" model_path = "Qwen/Qwen3-0.6B" # or "/data/models/Qwen3-0.6B"
model_type = ModelType.Backend model_type = ModelType.Backend
endpoint = component.endpoint("endpoint") endpoint = component.endpoint("endpoint")
# Optional last param to register_llm is model_name. If not present derives it from model_path # Optional last param to register_llm is model_name. If not present derives it from model_path
...@@ -457,7 +495,7 @@ if __name__ == "__main__": ...@@ -457,7 +495,7 @@ if __name__ == "__main__":
The `model_path` can be: The `model_path` can be:
- A HuggingFace repo ID. It will be downloaded and cached locally. - A HuggingFace repo ID, optionally prefixed with `hf://`. It will be downloaded and cached locally.
- The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`. - The path to a checkout of a HuggingFace repo - any folder containing safetensor files as well as `config.json`, `tokenizer.json` and `tokenizer_config.json`.
- The path to a GGUF file, if your engine supports that. - The path to a GGUF file, if your engine supports that.
......
...@@ -17,6 +17,7 @@ use std::collections::HashMap; ...@@ -17,6 +17,7 @@ use std::collections::HashMap;
use std::path::PathBuf; use std::path::PathBuf;
use clap::ValueEnum; use clap::ValueEnum;
use dynamo_llm::http::service::discovery::LLMRouterMode;
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode; use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
/// Required options depend on the in and out choices /// Required options depend on the in and out choices
...@@ -184,7 +185,7 @@ impl Flags { ...@@ -184,7 +185,7 @@ impl Flags {
} }
} }
#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug)] #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]
pub enum RouterMode { pub enum RouterMode {
#[default] #[default]
Random, Random,
...@@ -198,14 +199,21 @@ impl RouterMode { ...@@ -198,14 +199,21 @@ impl RouterMode {
pub fn is_kv_routing(&self) -> bool { pub fn is_kv_routing(&self) -> bool {
*self == RouterMode::KV *self == RouterMode::KV
} }
}
impl From<RouterMode> for RuntimeRouterMode { pub fn as_runtime(&self) -> Option<RuntimeRouterMode> {
fn from(r: RouterMode) -> RuntimeRouterMode { match self {
match r { RouterMode::RoundRobin => Some(RuntimeRouterMode::RoundRobin),
RouterMode::RoundRobin => RuntimeRouterMode::RoundRobin, RouterMode::Random => Some(RuntimeRouterMode::Random),
RouterMode::KV => todo!("KV not implemented yet"), // Runtime router does not have KV, it's a dynamo-llm thing, not dynamo-runtime
_ => RuntimeRouterMode::Random, RouterMode::KV => None,
}
}
pub fn as_llm(&self) -> LLMRouterMode {
match self {
RouterMode::RoundRobin => LLMRouterMode::RoundRobin,
RouterMode::Random => LLMRouterMode::Random,
RouterMode::KV => LLMRouterMode::KV,
} }
} }
} }
...@@ -19,6 +19,7 @@ use dynamo_llm::{ ...@@ -19,6 +19,7 @@ use dynamo_llm::{
backend::{Backend, ExecutionContext}, backend::{Backend, ExecutionContext},
engines::StreamingEngineAdapter, engines::StreamingEngineAdapter,
http::service::discovery::ModelNetworkName, http::service::discovery::ModelNetworkName,
kv_router::{scheduler::DefaultWorkerSelector, KvPushRouter, KvRouter},
model_card::ModelDeploymentCard, model_card::ModelDeploymentCard,
model_type::ModelType, model_type::ModelType,
preprocessor::OpenAIPreprocessor, preprocessor::OpenAIPreprocessor,
...@@ -67,8 +68,7 @@ pub async fn prepare_engine( ...@@ -67,8 +68,7 @@ pub async fn prepare_engine(
let client = endpoint.client().await?; let client = endpoint.client().await?;
let mut cache_dir = None; let mut cache_dir = None;
let engine: OpenAIChatCompletionsStreamingEngine = match &flags.router_mode {
RouterMode::Random | RouterMode::RoundRobin => {
tracing::info!("Waiting for remote model.."); tracing::info!("Waiting for remote model..");
let remote_endpoints = client.wait_for_endpoints().await?; let remote_endpoints = client.wait_for_endpoints().await?;
...@@ -82,7 +82,7 @@ pub async fn prepare_engine( ...@@ -82,7 +82,7 @@ pub async fn prepare_engine(
let network_entry = network_name.load_entry(etcd_client.clone()).await?; let network_entry = network_name.load_entry(etcd_client.clone()).await?;
let mut card = network_entry.load_mdc(endpoint_id, etcd_client).await?; let mut card = network_entry.load_mdc(endpoint_id, etcd_client).await?;
match network_entry.model_type { let engine: OpenAIChatCompletionsStreamingEngine = match network_entry.model_type {
ModelType::Backend => { ModelType::Backend => {
// Download tokenizer.json etc to local disk // Download tokenizer.json etc to local disk
cache_dir = Some( cache_dir = Some(
...@@ -97,20 +97,34 @@ pub async fn prepare_engine( ...@@ -97,20 +97,34 @@ pub async fn prepare_engine(
SingleIn<NvCreateChatCompletionRequest>, SingleIn<NvCreateChatCompletionRequest>,
ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>,
>::new(); >::new();
let preprocessor = let preprocessor = OpenAIPreprocessor::new(card.clone()).await?.into_operator();
OpenAIPreprocessor::new(card.clone()).await?.into_operator();
let backend = Backend::from_mdc(card.clone()).await?.into_operator(); let backend = Backend::from_mdc(card.clone()).await?.into_operator();
let router = let router =
PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client( PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client(
client, client,
flags.router_mode.into(), flags.router_mode.as_runtime(),
) )
.await?; .await?;
let service_backend = match &flags.router_mode {
RouterMode::Random | RouterMode::RoundRobin => {
ServiceBackend::from_engine(Arc::new(router))
}
RouterMode::KV => {
let selector = Box::new(DefaultWorkerSelector {});
let chooser = KvRouter::new(
endpoint.component().clone(),
dynamo_llm::DEFAULT_KV_BLOCK_SIZE,
Some(selector),
)
.await?;
let kv_push_router = KvPushRouter::new(router, Arc::new(chooser));
ServiceBackend::from_engine(Arc::new(kv_push_router))
}
};
frontend frontend
.link(preprocessor.forward_edge())? .link(preprocessor.forward_edge())?
.link(backend.forward_edge())? .link(backend.forward_edge())?
.link(ServiceBackend::from_engine(Arc::new(router)))? .link(service_backend)?
.link(backend.backward_edge())? .link(backend.backward_edge())?
.link(preprocessor.backward_edge())? .link(preprocessor.backward_edge())?
.link(frontend)? .link(frontend)?
...@@ -119,13 +133,13 @@ pub async fn prepare_engine( ...@@ -119,13 +133,13 @@ pub async fn prepare_engine(
PushRouter::< PushRouter::<
NvCreateChatCompletionRequest, NvCreateChatCompletionRequest,
Annotated<NvCreateChatCompletionStreamResponse>, Annotated<NvCreateChatCompletionStreamResponse>,
>::from_client( >::from_client(client, flags.router_mode.as_runtime())
client, flags.router_mode.into()
)
.await?, .await?,
), ),
ModelType::Completion => { ModelType::Completion => {
anyhow::bail!("text and batch input only accept remote Chat models, not Completion"); anyhow::bail!(
"text and batch input only accept remote Chat models, not Completion"
);
/* /*
Arc::new( Arc::new(
PushRouter::< PushRouter::<
...@@ -138,11 +152,7 @@ pub async fn prepare_engine( ...@@ -138,11 +152,7 @@ pub async fn prepare_engine(
) )
*/ */
} }
}
}
RouterMode::KV => todo!(),
}; };
// The service_name isn't used for text chat outside of logs, // The service_name isn't used for text chat outside of logs,
// so use the path. That avoids having to listen on etcd for model registration. // so use the path. That avoids having to listen on etcd for model registration.
let service_name = endpoint.subject(); let service_name = endpoint.subject();
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::{pin::Pin, sync::Arc}; use std::{future::Future, pin::Pin, sync::Arc};
use dynamo_llm::{ use dynamo_llm::{
backend::Backend, backend::Backend,
...@@ -52,7 +52,8 @@ pub async fn run( ...@@ -52,7 +52,8 @@ pub async fn run(
.await? .await?
.endpoint(&endpoint_id.name); .endpoint(&endpoint_id.name);
let (rt_fut, mut card) = match engine_config { let (rt_fut, card): (Pin<Box<dyn Future<Output = _> + Send + 'static>>, _) = match engine_config
{
EngineConfig::StaticFull { engine, mut model } => { EngineConfig::StaticFull { engine, mut model } => {
let engine = Arc::new(StreamingEngineAdapter::new(engine)); let engine = Arc::new(StreamingEngineAdapter::new(engine));
let ingress_chat = Ingress::< let ingress_chat = Ingress::<
...@@ -63,7 +64,7 @@ pub async fn run( ...@@ -63,7 +64,7 @@ pub async fn run(
model.attach(&endpoint, ModelType::Chat).await?; model.attach(&endpoint, ModelType::Chat).await?;
let fut_chat = endpoint.endpoint_builder().handler(ingress_chat).start(); let fut_chat = endpoint.endpoint_builder().handler(ingress_chat).start();
(fut_chat, model.card().clone()) (Box::pin(fut_chat), Some(model.card().clone()))
} }
EngineConfig::StaticCore { EngineConfig::StaticCore {
engine: inner_engine, engine: inner_engine,
...@@ -86,10 +87,13 @@ pub async fn run( ...@@ -86,10 +87,13 @@ pub async fn run(
model.attach(&endpoint, ModelType::Backend).await?; model.attach(&endpoint, ModelType::Backend).await?;
let fut = endpoint.endpoint_builder().handler(ingress).start(); let fut = endpoint.endpoint_builder().handler(ingress).start();
(fut, model.card().clone()) (Box::pin(fut), Some(model.card().clone()))
} }
EngineConfig::Dynamic(_) => { EngineConfig::Dynamic(_) => {
anyhow::bail!("Cannot use endpoint for both in and out"); // We can only get here for in=dyn out=vllm|sglang`, because vllm and sglang are a
// subprocess that we talk to like a remote endpoint.
// That means the vllm/sglang subprocess is doing all the work, we are idle.
(never_ready(), None)
} }
}; };
...@@ -102,12 +106,18 @@ pub async fn run( ...@@ -102,12 +106,18 @@ pub async fn run(
} }
// Cleanup on shutdown // Cleanup on shutdown
if let Some(mut card) = card {
if let Err(err) = card if let Err(err) = card
.delete_from_nats(distributed_runtime.nats_client()) .delete_from_nats(distributed_runtime.nats_client())
.await .await
{ {
tracing::error!(%err, "delete_from_nats error on shutdown"); tracing::error!(%err, "delete_from_nats error on shutdown");
} }
}
Ok(()) Ok(())
} }
fn never_ready() -> Pin<Box<dyn Future<Output = anyhow::Result<()>> + Send + 'static>> {
Box::pin(std::future::pending::<anyhow::Result<()>>())
}
...@@ -17,6 +17,7 @@ use std::sync::Arc; ...@@ -17,6 +17,7 @@ use std::sync::Arc;
use crate::input::common; use crate::input::common;
use crate::{EngineConfig, Flags}; use crate::{EngineConfig, Flags};
use dynamo_llm::http::service::discovery::LLMRouterMode;
use dynamo_llm::http::service::ModelManager; use dynamo_llm::http::service::ModelManager;
use dynamo_llm::{ use dynamo_llm::{
engines::StreamingEngineAdapter, engines::StreamingEngineAdapter,
...@@ -29,6 +30,7 @@ use dynamo_llm::{ ...@@ -29,6 +30,7 @@ use dynamo_llm::{
openai::completions::{CompletionRequest, CompletionResponse}, openai::completions::{CompletionRequest, CompletionResponse},
}, },
}; };
use dynamo_runtime::component::Component;
use dynamo_runtime::transports::etcd; use dynamo_runtime::transports::etcd;
use dynamo_runtime::{DistributedRuntime, Runtime}; use dynamo_runtime::{DistributedRuntime, Runtime};
...@@ -59,10 +61,11 @@ pub async fn run( ...@@ -59,10 +61,11 @@ pub async fn run(
// Listen for models registering themselves in etcd, add them to HTTP service // Listen for models registering themselves in etcd, add them to HTTP service
run_watcher( run_watcher(
distributed_runtime.clone(), component.clone(),
http_service.model_manager().clone(), http_service.model_manager().clone(),
etcd_client.clone(), etcd_client.clone(),
&network_prefix, &network_prefix,
flags.router_mode.as_llm(),
) )
.await?; .await?;
} }
...@@ -114,19 +117,18 @@ pub async fn run( ...@@ -114,19 +117,18 @@ pub async fn run(
/// Spawns a task that watches for new models in etcd at network_prefix, /// Spawns a task that watches for new models in etcd at network_prefix,
/// and registers them with the ModelManager so that the HTTP service can use them. /// and registers them with the ModelManager so that the HTTP service can use them.
async fn run_watcher( async fn run_watcher(
distributed_runtime: DistributedRuntime, component: Component,
model_manager: ModelManager, model_manager: ModelManager,
etcd_client: etcd::Client, etcd_client: etcd::Client,
network_prefix: &str, network_prefix: &str,
router_mode: LLMRouterMode,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let state = Arc::new(discovery::ModelWatchState { let watch_obj = Arc::new(
prefix: network_prefix.to_string(), discovery::ModelWatcher::new(component, model_manager, network_prefix, router_mode).await?,
manager: model_manager, );
drt: distributed_runtime.clone(),
});
tracing::info!("Watching for remote model at {network_prefix}"); tracing::info!("Watching for remote model at {network_prefix}");
let models_watcher = etcd_client.kv_get_and_watch_prefix(network_prefix).await?; let models_watcher = etcd_client.kv_get_and_watch_prefix(network_prefix).await?;
let (_prefix, _watcher, receiver) = models_watcher.dissolve(); let (_prefix, _watcher, receiver) = models_watcher.dissolve();
let _watcher_task = tokio::spawn(discovery::model_watcher(state, receiver)); let _watcher_task = tokio::spawn(watch_obj.watch(receiver));
Ok(()) Ok(())
} }
...@@ -22,6 +22,9 @@ const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2); ...@@ -22,6 +22,9 @@ const CHILD_STOP_TIMEOUT: Duration = Duration::from_secs(2);
#[cfg(feature = "python")] #[cfg(feature = "python")]
const PYTHON_STR_SCHEME: &str = "pystr:"; const PYTHON_STR_SCHEME: &str = "pystr:";
/// Where we will attach the vllm/sglang subprocess. Invisible to users.
pub const INTERNAL_ENDPOINT: &str = "dyn://dynamo.internal.worker";
pub enum EngineConfig { pub enum EngineConfig {
/// An remote networked engine we don't know about yet /// An remote networked engine we don't know about yet
Dynamic(Endpoint), Dynamic(Endpoint),
...@@ -45,6 +48,10 @@ pub async fn run( ...@@ -45,6 +48,10 @@ pub async fn run(
out_opt: Output, out_opt: Output,
flags: Flags, flags: Flags,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
if matches!(&in_opt, Input::Endpoint(_)) && matches!(&out_opt, Output::Endpoint(_)) {
anyhow::bail!("Cannot use endpoint for both in and out");
}
let cancel_token = runtime.primary_token(); let cancel_token = runtime.primary_token();
let maybe_path = flags let maybe_path = flags
.model_path_pos .model_path_pos
...@@ -120,6 +127,14 @@ pub async fn run( ...@@ -120,6 +127,14 @@ pub async fn run(
// TODO Does sglang support GGUF? Can we make it work? // TODO Does sglang support GGUF? Can we make it work?
anyhow::bail!("`--model-path should point at a HuggingFace repo checkout"); anyhow::bail!("`--model-path should point at a HuggingFace repo checkout");
} }
// If `in=dyn` we want the sglang subprocess to listen on that endpoint.
// If not, then the endpoint isn't exposed so we invent an internal one.
let endpoint = match &in_opt {
Input::Endpoint(path) => path.parse()?,
_ => INTERNAL_ENDPOINT.parse()?,
};
let multi_node_conf = dynamo_llm::engines::MultiNodeConfig { let multi_node_conf = dynamo_llm::engines::MultiNodeConfig {
num_nodes: flags.num_nodes, num_nodes: flags.num_nodes,
node_rank: flags.node_rank, node_rank: flags.node_rank,
...@@ -128,6 +143,7 @@ pub async fn run( ...@@ -128,6 +143,7 @@ pub async fn run(
let (py_script, child) = match subprocess::start( let (py_script, child) = match subprocess::start(
subprocess::sglang::PY, subprocess::sglang::PY,
&local_model, &local_model,
&endpoint,
flags.tensor_parallel_size, flags.tensor_parallel_size,
if flags.base_gpu_id == 0 { if flags.base_gpu_id == 0 {
None None
...@@ -154,16 +170,24 @@ pub async fn run( ...@@ -154,16 +170,24 @@ pub async fn run(
extra = Some(Box::pin(async move { extra = Some(Box::pin(async move {
stopper(cancel_token, child, py_script).await; stopper(cancel_token, child, py_script).await;
})); }));
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
EngineConfig::Dynamic(endpoint) EngineConfig::Dynamic(endpoint)
} }
Output::Vllm => { Output::Vllm => {
if flags.base_gpu_id != 0 { if flags.base_gpu_id != 0 {
anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead."); anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
} }
// If `in=dyn` we want the vllm subprocess to listen on that endpoint.
// If not, then the endpoint isn't exposed so we invent an internal one.
let endpoint = match &in_opt {
Input::Endpoint(path) => path.parse()?,
_ => INTERNAL_ENDPOINT.parse()?,
};
let (py_script, child) = match subprocess::start( let (py_script, child) = match subprocess::start(
subprocess::vllm::PY, subprocess::vllm::PY,
&local_model, &local_model,
&endpoint,
flags.tensor_parallel_size, flags.tensor_parallel_size,
None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead None, // base_gpu_id. vllm uses CUDA_VISIBLE_DEVICES instead
None, // multi-node config. vllm uses `ray`, see guide None, // multi-node config. vllm uses `ray`, see guide
...@@ -182,7 +206,6 @@ pub async fn run( ...@@ -182,7 +206,6 @@ pub async fn run(
extra = Some(Box::pin(async move { extra = Some(Box::pin(async move {
stopper(cancel_token, child, py_script).await; stopper(cancel_token, child, py_script).await;
})); }));
let endpoint: Endpoint = subprocess::ENDPOINT.parse()?;
EngineConfig::Dynamic(endpoint) EngineConfig::Dynamic(endpoint)
} }
......
...@@ -30,7 +30,7 @@ Example: ...@@ -30,7 +30,7 @@ Example:
- OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf - OR: ./dynamo-run /data/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
"#; "#;
const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin]"; const USAGE: &str = "USAGE: dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=ENGINE_LIST|dyn://<path> [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv]";
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
// Set log level based on verbosity flag // Set log level based on verbosity flag
......
...@@ -13,18 +13,18 @@ use tokio::io::AsyncBufReadExt; ...@@ -13,18 +13,18 @@ use tokio::io::AsyncBufReadExt;
use dynamo_llm::engines::MultiNodeConfig; use dynamo_llm::engines::MultiNodeConfig;
use dynamo_llm::LocalModel; use dynamo_llm::LocalModel;
use dynamo_runtime::protocols::Endpoint as EndpointId;
pub mod sglang; pub mod sglang;
pub mod vllm; pub mod vllm;
/// Internal endpoint to connect the subprocess over etcd/nats
pub const ENDPOINT: &str = "dyn://dynamo.internal.worker";
pub async fn start( pub async fn start(
// The Python code to run // The Python code to run
py_script: &'static str, py_script: &'static str,
// Model info // Model info
local_model: &LocalModel, local_model: &LocalModel,
// Endpoint to connect the subprocess over etcd/nats
endpoint: &EndpointId,
// How many GPUs to use // How many GPUs to use
tensor_parallel_size: u32, tensor_parallel_size: u32,
// sglang which GPU to start from, on a multi-GPU system // sglang which GPU to start from, on a multi-GPU system
...@@ -43,13 +43,15 @@ pub async fn start( ...@@ -43,13 +43,15 @@ pub async fn start(
let mut args = vec![ let mut args = vec![
script_path.to_string_lossy().to_string(), script_path.to_string_lossy().to_string(),
"--endpoint".to_string(), "--endpoint".to_string(),
ENDPOINT.to_string(), endpoint.as_url(),
"--model-path".to_string(), "--model-path".to_string(),
local_model.path().to_string_lossy().to_string(), local_model.path().to_string_lossy().to_string(),
"--model-name".to_string(), "--model-name".to_string(),
local_model.display_name().to_string(), local_model.display_name().to_string(),
"--tensor-parallel-size".to_string(), "--tensor-parallel-size".to_string(),
tensor_parallel_size.to_string(), tensor_parallel_size.to_string(),
"--kv-block-size".to_string(),
dynamo_llm::DEFAULT_KV_BLOCK_SIZE.to_string(),
]; ];
// sglang only // sglang only
if let Some(base_gpu_id) = base_gpu_id { if let Some(base_gpu_id) = base_gpu_id {
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# # `dynamo-run out=sglang` runs this script
# A very basic example of sglang worker handling pre-processed requests. # Can also be used standalone: `python3 sglang_inc.py` - lots of optional cmd line params
#
# Dynamo does the HTTP handling, prompt templating and tokenization, then forwards the
# request via NATS to this python script, which runs sglang.
#
# Setup a virtualenv with dynamo.llm, dynamo.runtime and sglang[all] installed
# in lib/bindings/python `maturin develop` and `pip install -e .` should do it
# Start nats and etcd:
# - nats-server -js
#
# Window 1: `python server_sglang.py`. Wait for log "Starting endpoint".
# Window 2: `dynamo-run out=dyn://dynamo.backend.generate`
import argparse import argparse
import asyncio import asyncio
...@@ -28,8 +17,9 @@ from sglang.srt.server_args import ServerArgs ...@@ -28,8 +17,9 @@ from sglang.srt.server_args import ServerArgs
from dynamo.llm import ModelType, register_llm from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
# Only used if you run it manually from the command line
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate" DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct" DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
...@@ -44,6 +34,7 @@ class Config: ...@@ -44,6 +34,7 @@ class Config:
model_name: Optional[str] model_name: Optional[str]
base_gpu_id: int base_gpu_id: int
tensor_parallel_size: int tensor_parallel_size: int
kv_block_size: int
nnodes: int nnodes: int
node_rank: int node_rank: int
dist_init_addr: str dist_init_addr: str
...@@ -106,6 +97,7 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -106,6 +97,7 @@ async def init(runtime: DistributedRuntime, config: Config):
"skip_tokenizer_init": True, "skip_tokenizer_init": True,
"tp_size": config.tensor_parallel_size, "tp_size": config.tensor_parallel_size,
"base_gpu_id": config.base_gpu_id, "base_gpu_id": config.base_gpu_id,
"page_size": config.kv_block_size,
} }
if config.dist_init_addr != "": if config.dist_init_addr != "":
arg_map["trust_remote_code"] = True arg_map["trust_remote_code"] = True
...@@ -168,6 +160,9 @@ def cmd_line_args(): ...@@ -168,6 +160,9 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use." "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
) )
parser.add_argument(
"--kv-block-size", type=int, default=16, help="Size of a KV cache block."
)
parser.add_argument( parser.add_argument(
"--nnodes", type=int, default=1, help="The number of machines SGLang will use" "--nnodes", type=int, default=1, help="The number of machines SGLang will use"
) )
...@@ -214,6 +209,7 @@ def cmd_line_args(): ...@@ -214,6 +209,7 @@ def cmd_line_args():
config.endpoint = parsed_endpoint_name config.endpoint = parsed_endpoint_name
config.base_gpu_id = args.base_gpu_id config.base_gpu_id = args.base_gpu_id
config.tensor_parallel_size = args.tensor_parallel_size config.tensor_parallel_size = args.tensor_parallel_size
config.kv_block_size = args.kv_block_size
config.nnodes = args.nnodes config.nnodes = args.nnodes
config.node_rank = args.node_rank config.node_rank = args.node_rank
config.dist_init_addr = args.dist_init_addr config.dist_init_addr = args.dist_init_addr
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# A very basic example of vllm worker handling pre-processed requests. # `dynamo-run out=vllm` runs this script
# # Can also be used standalone: `python3 vllm_inc.py` - lots of optional cmd line params
# Dynamo does the HTTP handling, prompt templating and tokenization, then forwards the
# request via NATS to this python script, which runs vllm. # Setup checklist:
# # - We are in a virtualenv with vllm installed - and patched if using kv routing.
# Setup a virtualenv with dynamo.llm, dynamo.runtime and vllm installed # - `libdynamo_llm_capi.so` is in system lib path or it's containing folder is in LD_LIBRARY_PATH
# in lib/bindings/python `maturin develop` and `pip install -e .` should do it # It builds in target/debug/ by default.
# Start nats and etcd:
# - nats-server -js
#
# Window 1: `python vllm_inc.py`. Wait for log "Starting endpoint".
# Window 2: `dynamo-run out=dyn://dynamo.backend.generate`
import argparse import argparse
import asyncio import asyncio
...@@ -30,11 +25,12 @@ from vllm.entrypoints.openai.api_server import ( ...@@ -30,11 +25,12 @@ from vllm.entrypoints.openai.api_server import (
) )
from vllm.inputs import TokensPrompt from vllm.inputs import TokensPrompt
from dynamo.llm import ModelType, register_llm from dynamo.llm import KvMetricsPublisher, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
# Only used if you run it manually from the command line
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate" DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct" DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
...@@ -48,6 +44,7 @@ class Config: ...@@ -48,6 +44,7 @@ class Config:
model_path: str model_path: str
model_name: Optional[str] model_name: Optional[str]
tensor_parallel_size: int tensor_parallel_size: int
kv_block_size: int
extra_engine_args: str extra_engine_args: str
...@@ -56,9 +53,37 @@ class RequestHandler: ...@@ -56,9 +53,37 @@ class RequestHandler:
Request handler for the generate endpoint Request handler for the generate endpoint
""" """
def __init__(self, engine, default_sampling_params): def __init__(self, component, engine, default_sampling_params):
self.component = component
self.engine_client = engine self.engine_client = engine
self.default_sampling_params = default_sampling_params self.default_sampling_params = default_sampling_params
self.metrics_publisher = KvMetricsPublisher()
def setup_kv_metrics(self):
if not hasattr(self.engine_client, "set_metrics_publisher"):
logging.debug("VLLM version does not support KV metrics")
return
self.engine_client.set_metrics_publisher(self.metrics_publisher)
# Initially send dummy metrics to kick start,
# vLLM will not update stat until forward pass is triggered
self.metrics_publisher.publish(
0, # request_active_slots
1024, # request_total_slots
0, # kv_active_blocks
1024, # kv_total_blocks
0, # num_requests_waiting
0.0, # gpu_cache_usage_perc
0.0, # gpu_prefix_cache_hit_rate
)
task = asyncio.create_task(self.create_metrics_publisher_endpoint())
task.add_done_callback(
lambda _: logging.debug("metrics publisher endpoint created")
)
async def create_metrics_publisher_endpoint(self):
logging.debug("Creating metrics publisher endpoint")
await self.metrics_publisher.create_endpoint(self.component)
async def generate(self, request): async def generate(self, request):
# logging.debug(f"Received request: {request}") # logging.debug(f"Received request: {request}")
...@@ -126,6 +151,8 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -126,6 +151,8 @@ async def init(runtime: DistributedRuntime, config: Config):
"tensor_parallel_size": config.tensor_parallel_size, "tensor_parallel_size": config.tensor_parallel_size,
"skip_tokenizer_init": True, "skip_tokenizer_init": True,
"disable_log_requests": True, "disable_log_requests": True,
"enable_prefix_caching": True,
"block_size": config.kv_block_size,
# KV routing relies on logging KV metrics # KV routing relies on logging KV metrics
"disable_log_stats": False, "disable_log_stats": False,
} }
...@@ -142,6 +169,14 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -142,6 +169,14 @@ async def init(runtime: DistributedRuntime, config: Config):
logging.debug(f"Adding extra engine arguments: {json_map}") logging.debug(f"Adding extra engine arguments: {json_map}")
arg_map = {**arg_map, **json_map} # json_map gets precedence arg_map = {**arg_map, **json_map} # json_map gets precedence
# Patch won't start KVCacheEventManager unless these four are set
os.environ["VLLM_WORKER_ID"] = str(endpoint.lease_id())
os.environ[
"VLLM_KV_CAPI_PATH"
] = "libdynamo_llm_capi.so" # Must be on LD_LIBRARY_PATH
os.environ["VLLM_KV_NAMESPACE"] = config.namespace
os.environ["VLLM_KV_COMPONENT"] = config.component
os.environ["VLLM_NO_USAGE_STATS"] = "1" # Avoid internal HTTP requests os.environ["VLLM_NO_USAGE_STATS"] = "1" # Avoid internal HTTP requests
engine_args = AsyncEngineArgs(**arg_map) engine_args = AsyncEngineArgs(**arg_map)
model_config = engine_args.create_model_config() model_config = engine_args.create_model_config()
...@@ -151,11 +186,12 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -151,11 +186,12 @@ async def init(runtime: DistributedRuntime, config: Config):
engine_context = build_async_engine_client_from_engine_args(engine_args) engine_context = build_async_engine_client_from_engine_args(engine_args)
engine_client = await engine_context.__aenter__() engine_client = await engine_context.__aenter__()
handler = RequestHandler(component, engine_client, default_sampling_params)
handler.setup_kv_metrics()
# the server will gracefully shutdown (i.e., keep opened TCP streams finishes) # the server will gracefully shutdown (i.e., keep opened TCP streams finishes)
# after the lease is revoked # after the lease is revoked
await endpoint.serve_endpoint( await endpoint.serve_endpoint(handler.generate)
RequestHandler(engine_client, default_sampling_params).generate
)
def cmd_line_args(): def cmd_line_args():
...@@ -183,6 +219,9 @@ def cmd_line_args(): ...@@ -183,6 +219,9 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use." "--tensor-parallel-size", type=int, default=1, help="Number of GPUs to use."
) )
parser.add_argument(
"--kv-block-size", type=int, default=16, help="Size of a KV cache block."
)
parser.add_argument( parser.add_argument(
"--extra-engine-args", "--extra-engine-args",
type=str, type=str,
...@@ -213,6 +252,7 @@ def cmd_line_args(): ...@@ -213,6 +252,7 @@ def cmd_line_args():
config.component = parsed_component_name config.component = parsed_component_name
config.endpoint = parsed_endpoint_name config.endpoint = parsed_endpoint_name
config.tensor_parallel_size = args.tensor_parallel_size config.tensor_parallel_size = args.tensor_parallel_size
config.kv_block_size = args.kv_block_size
config.extra_engine_args = args.extra_engine_args config.extra_engine_args = args.extra_engine_args
return config return config
......
...@@ -31,7 +31,7 @@ from dynamo.llm import ModelType, register_llm ...@@ -31,7 +31,7 @@ from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate" DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct" DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
class Config: class Config:
......
...@@ -43,7 +43,7 @@ from dynamo.llm import ModelType, register_llm ...@@ -43,7 +43,7 @@ from dynamo.llm import ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate" DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B-Instruct" DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
class Config: class Config:
......
...@@ -51,7 +51,6 @@ module-name = "dynamo._core" ...@@ -51,7 +51,6 @@ module-name = "dynamo._core"
manifest-path = "Cargo.toml" manifest-path = "Cargo.toml"
python-packages = ["dynamo"] python-packages = ["dynamo"]
python-source = "src" python-source = "src"
features = ["dynamo-llm/block-manager"]
[build-system] [build-system]
requires = ["maturin>=1.0,<2.0", "patchelf"] requires = ["maturin>=1.0,<2.0", "patchelf"]
......
...@@ -37,7 +37,9 @@ impl KvRouter { ...@@ -37,7 +37,9 @@ impl KvRouter {
llm_rs::kv_router::KvRouter::new(component.inner.clone(), kv_block_size, None) llm_rs::kv_router::KvRouter::new(component.inner.clone(), kv_block_size, None)
.await .await
.map_err(to_pyerr)?; .map_err(to_pyerr)?;
Ok(Self { inner }) Ok(Self {
inner: Arc::new(inner),
})
}) })
} }
......
...@@ -20,21 +20,19 @@ use serde::{Deserialize, Serialize}; ...@@ -20,21 +20,19 @@ use serde::{Deserialize, Serialize};
use tokio::sync::mpsc::Receiver; use tokio::sync::mpsc::Receiver;
use dynamo_runtime::{ use dynamo_runtime::{
component::{self, ComponentEndpointInfo}, component::{self, Component, ComponentEndpointInfo},
pipeline::{ pipeline::{
network::egress::push_router::PushRouter, ManyOut, Operator, RouterMode, SegmentSource, network::egress::push_router::PushRouter, ManyOut, Operator,
ServiceBackend, SingleIn, Source, RouterMode as RuntimeRouterMode, SegmentSource, ServiceBackend, SingleIn, Source,
}, },
protocols::{self, annotated::Annotated}, protocols::{self, annotated::Annotated},
slug::Slug, slug::Slug,
traits::DistributedRuntimeProvider as _,
transports::etcd::{self, KeyValue, WatchEvent}, transports::etcd::{self, KeyValue, WatchEvent},
DistributedRuntime, DistributedRuntime,
}; };
use super::ModelManager; use super::ModelManager;
use crate::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
};
use crate::protocols::openai::completions::{CompletionRequest, CompletionResponse}; use crate::protocols::openai::completions::{CompletionRequest, CompletionResponse};
use crate::{ use crate::{
backend::Backend, backend::Backend,
...@@ -46,6 +44,12 @@ use crate::{ ...@@ -46,6 +44,12 @@ use crate::{
key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager}, key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager},
model_card::{self, ModelDeploymentCard}, model_card::{self, ModelDeploymentCard},
}; };
use crate::{
kv_router::{scheduler::DefaultWorkerSelector, KvPushRouter, KvRouter},
protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
},
};
use tracing; use tracing;
/// [ModelEntry] is a struct that contains the information for the HTTP service to discover models /// [ModelEntry] is a struct that contains the information for the HTTP service to discover models
...@@ -161,13 +165,65 @@ impl std::fmt::Display for ModelNetworkName { ...@@ -161,13 +165,65 @@ impl std::fmt::Display for ModelNetworkName {
} }
} }
pub struct ModelWatchState { #[derive(Debug, Clone, Copy, PartialEq)]
pub prefix: String, pub enum LLMRouterMode {
pub manager: ModelManager, Random,
pub drt: DistributedRuntime, RoundRobin,
KV,
}
impl LLMRouterMode {
pub fn is_kv_routing(&self) -> bool {
*self == LLMRouterMode::KV
}
pub fn as_runtime(&self) -> Option<RuntimeRouterMode> {
match self {
LLMRouterMode::RoundRobin => Some(RuntimeRouterMode::RoundRobin),
LLMRouterMode::Random => Some(RuntimeRouterMode::Random),
// Runtime router does not have KV, it's a dynamo-llm thing, not dynamo-runtime
LLMRouterMode::KV => None,
}
}
}
pub struct ModelWatcher {
prefix: String,
manager: ModelManager,
drt: DistributedRuntime,
router_mode: LLMRouterMode,
kv_chooser: Option<Arc<KvRouter>>,
} }
pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver<WatchEvent>) { impl ModelWatcher {
pub async fn new(
component: Component,
model_manager: ModelManager,
network_prefix: &str,
router_mode: LLMRouterMode,
) -> anyhow::Result<ModelWatcher> {
let kv_chooser = if router_mode.is_kv_routing() {
let selector = Box::new(DefaultWorkerSelector {});
let chooser = KvRouter::new(
component.clone(),
crate::DEFAULT_KV_BLOCK_SIZE,
Some(selector),
)
.await?;
Some(Arc::new(chooser))
} else {
None
};
Ok(Self {
prefix: network_prefix.to_string(),
manager: model_manager,
drt: component.drt().clone(),
router_mode,
kv_chooser,
})
}
pub async fn watch(self: Arc<Self>, mut events_rx: Receiver<WatchEvent>) {
tracing::debug!("model watcher started"); tracing::debug!("model watcher started");
while let Some(event) = events_rx.recv().await { while let Some(event) = events_rx.recv().await {
...@@ -180,7 +236,7 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver< ...@@ -180,7 +236,7 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver<
continue; continue;
} }
}; };
if state.manager.has_model_any(&model_entry.name) { if self.manager.has_model_any(&model_entry.name) {
tracing::trace!( tracing::trace!(
service_name = model_entry.name, service_name = model_entry.name,
"New endpoint for existing model" "New endpoint for existing model"
...@@ -188,7 +244,7 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver< ...@@ -188,7 +244,7 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver<
continue; continue;
} }
match handle_put(&model_entry, state.clone()).await { match self.clone().handle_put(&model_entry).await {
Ok(()) => { Ok(()) => {
tracing::info!(model_name = model_entry.name, "added model"); tracing::info!(model_name = model_entry.name, "added model");
} }
...@@ -197,7 +253,7 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver< ...@@ -197,7 +253,7 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver<
} }
} }
} }
WatchEvent::Delete(kv) => match handle_delete(&kv, state.clone()).await { WatchEvent::Delete(kv) => match self.clone().handle_delete(&kv).await {
Ok(model_name) => { Ok(model_name) => {
tracing::info!("removed model {}", model_name); tracing::info!("removed model {}", model_name);
} }
...@@ -207,28 +263,28 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver< ...@@ -207,28 +263,28 @@ pub async fn model_watcher(state: Arc<ModelWatchState>, mut events_rx: Receiver<
}, },
} }
} }
} }
async fn handle_delete(kv: &KeyValue, state: Arc<ModelWatchState>) -> anyhow::Result<&str> { async fn handle_delete(self: Arc<Self>, kv: &KeyValue) -> anyhow::Result<&str> {
let key = kv.key_str()?; let key = kv.key_str()?;
tracing::debug!(key, "removing model"); tracing::debug!(key, "removing model");
let model_name = key.trim_start_matches(&state.prefix); let model_name = key.trim_start_matches(&self.prefix);
// Ignore the errors because model could be either type // Ignore the errors because model could be either type
let _ = state.manager.remove_chat_completions_model(model_name); let _ = self.manager.remove_chat_completions_model(model_name);
let _ = state.manager.remove_completions_model(model_name); let _ = self.manager.remove_completions_model(model_name);
Ok(model_name) Ok(model_name)
} }
// Handles a PUT event from etcd, this usually means adding a new model to the list of served // Handles a PUT event from etcd, this usually means adding a new model to the list of served
// models. // models.
// //
// If this method errors, for the near term, we will delete the offending key. // If this method errors, for the near term, we will delete the offending key.
async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> anyhow::Result<()> { async fn handle_put(self: Arc<ModelWatcher>, model_entry: &ModelEntry) -> anyhow::Result<()> {
let endpoint_id = model_entry.endpoint.clone(); let endpoint_id = model_entry.endpoint.clone();
let client = state let client = self
.drt .drt
.namespace(&endpoint_id.namespace)? .namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)? .component(&endpoint_id.component)?
...@@ -236,7 +292,7 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an ...@@ -236,7 +292,7 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an
.client() .client()
.await?; .await?;
let Some(etcd_client) = state.drt.etcd_client() else { let Some(etcd_client) = self.drt.etcd_client() else {
// Should be impossible because we only get here on an etcd event // Should be impossible because we only get here on an etcd event
anyhow::bail!("Missing etcd_client"); anyhow::bail!("Missing etcd_client");
}; };
...@@ -263,7 +319,7 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an ...@@ -263,7 +319,7 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an
// This cache_dir is a tempfile::TempDir will be deleted on drop. I _think_ // This cache_dir is a tempfile::TempDir will be deleted on drop. I _think_
// OpenAIPreprocessor::new loads the files, so we can delete them after this // OpenAIPreprocessor::new loads the files, so we can delete them after this
// function. Needs checking carefully, possibly we need to store it in state. // function. Needs checking carefully, possibly we need to store it in state.
let _cache_dir = Some(card.move_from_nats(state.drt.nats_client()).await?); let _cache_dir = Some(card.move_from_nats(self.drt.nats_client()).await?);
let frontend = SegmentSource::< let frontend = SegmentSource::<
SingleIn<NvCreateChatCompletionRequest>, SingleIn<NvCreateChatCompletionRequest>,
...@@ -273,19 +329,30 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an ...@@ -273,19 +329,30 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an
let backend = Backend::from_mdc(card.clone()).await?.into_operator(); let backend = Backend::from_mdc(card.clone()).await?.into_operator();
let router = PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client( let router = PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client(
client.clone(), client.clone(),
RouterMode::Random, // TODO how do we configure this? self.router_mode.as_runtime(),
) )
.await?; .await?;
let service_backend = match self.router_mode {
LLMRouterMode::Random | LLMRouterMode::RoundRobin => {
ServiceBackend::from_engine(Arc::new(router))
}
LLMRouterMode::KV => {
let Some(kv_chooser) = self.kv_chooser.clone() else {
anyhow::bail!("KV routing mode with no chooser, should be unreachable");
};
let kv_push_router = KvPushRouter::new(router, kv_chooser);
ServiceBackend::from_engine(Arc::new(kv_push_router))
}
};
let chat_engine = frontend let chat_engine = frontend
.link(preprocessor.forward_edge())? .link(preprocessor.forward_edge())?
.link(backend.forward_edge())? .link(backend.forward_edge())?
.link(ServiceBackend::from_engine(Arc::new(router)))? .link(service_backend)?
.link(backend.backward_edge())? .link(backend.backward_edge())?
.link(preprocessor.backward_edge())? .link(preprocessor.backward_edge())?
.link(frontend)?; .link(frontend)?;
state self.manager
.manager
.add_chat_completions_model(&model_entry.name, chat_engine)?; .add_chat_completions_model(&model_entry.name, chat_engine)?;
let frontend = SegmentSource::< let frontend = SegmentSource::<
...@@ -296,19 +363,30 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an ...@@ -296,19 +363,30 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an
let backend = Backend::from_mdc(card.clone()).await?.into_operator(); let backend = Backend::from_mdc(card.clone()).await?.into_operator();
let router = PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client( let router = PushRouter::<BackendInput, Annotated<LLMEngineOutput>>::from_client(
client, client,
RouterMode::Random, // TODO how do we configure this? self.router_mode.as_runtime(),
) )
.await?; .await?;
let service_backend = match self.router_mode {
LLMRouterMode::Random | LLMRouterMode::RoundRobin => {
ServiceBackend::from_engine(Arc::new(router))
}
LLMRouterMode::KV => {
let Some(kv_chooser) = self.kv_chooser.clone() else {
anyhow::bail!("KV routing mode with no chooser, should be unreachable");
};
let kv_push_router = KvPushRouter::new(router, kv_chooser);
ServiceBackend::from_engine(Arc::new(kv_push_router))
}
};
let completions_engine = frontend let completions_engine = frontend
.link(preprocessor.forward_edge())? .link(preprocessor.forward_edge())?
.link(backend.forward_edge())? .link(backend.forward_edge())?
.link(ServiceBackend::from_engine(Arc::new(router)))? .link(service_backend)?
.link(backend.backward_edge())? .link(backend.backward_edge())?
.link(preprocessor.backward_edge())? .link(preprocessor.backward_edge())?
.link(frontend)?; .link(frontend)?;
state self.manager
.manager
.add_completions_model(&model_entry.name, completions_engine)?; .add_completions_model(&model_entry.name, completions_engine)?;
} }
ModelType::Chat => { ModelType::Chat => {
...@@ -318,8 +396,7 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an ...@@ -318,8 +396,7 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an
>::from_client(client, Default::default()) >::from_client(client, Default::default())
.await?; .await?;
let engine = Arc::new(push_router); let engine = Arc::new(push_router);
state self.manager
.manager
.add_chat_completions_model(&model_entry.name, engine)?; .add_chat_completions_model(&model_entry.name, engine)?;
} }
ModelType::Completion => { ModelType::Completion => {
...@@ -330,11 +407,11 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an ...@@ -330,11 +407,11 @@ async fn handle_put(model_entry: &ModelEntry, state: Arc<ModelWatchState>) -> an
) )
.await?; .await?;
let engine = Arc::new(push_router); let engine = Arc::new(push_router);
state self.manager
.manager
.add_completions_model(&model_entry.name, engine)?; .add_completions_model(&model_entry.name, engine)?;
} }
} }
Ok(()) Ok(())
}
} }
...@@ -13,18 +13,19 @@ ...@@ -13,18 +13,19 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
use std::sync::Arc;
use anyhow::Result; use anyhow::Result;
use dynamo_runtime::{ use dynamo_runtime::{
component::Component, component::{Component, EndpointSource},
pipeline::{ pipeline::{
async_trait, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, ResponseStream, async_trait, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, PushRouter,
SingleIn, ResponseStream, SingleIn,
}, },
prelude::*, prelude::*,
protocols::annotated::Annotated, protocols::annotated::Annotated,
}; };
use futures::stream::{self, StreamExt}; use futures::stream::{self, StreamExt};
use std::sync::Arc;
pub mod indexer; pub mod indexer;
pub mod metrics_aggregator; pub mod metrics_aggregator;
...@@ -42,11 +43,16 @@ use crate::{ ...@@ -42,11 +43,16 @@ use crate::{
scheduler::{KvScheduler, KvSchedulerError, SchedulingRequest}, scheduler::{KvScheduler, KvSchedulerError, SchedulingRequest},
scoring::ProcessedEndpoints, scoring::ProcessedEndpoints,
}, },
preprocessor::BackendInput,
protocols::common::llm_backend::LLMEngineOutput,
tokens::TokenBlockSequence, tokens::TokenBlockSequence,
}; };
use dynamo_runtime::traits::events::EventSubscriber; use dynamo_runtime::traits::events::EventSubscriber;
// TODO: Allow user to change
pub const DEFAULT_KV_BLOCK_SIZE: usize = 16;
// [gluo TODO] shouldn't need to be public // [gluo TODO] shouldn't need to be public
// this should be discovered from the component // this should be discovered from the component
pub const KV_EVENT_SUBJECT: &str = "kv_events"; pub const KV_EVENT_SUBJECT: &str = "kv_events";
...@@ -63,6 +69,8 @@ pub trait WorkerSelector { ...@@ -63,6 +69,8 @@ pub trait WorkerSelector {
) -> Result<WorkerSelectionResult, KvSchedulerError>; ) -> Result<WorkerSelectionResult, KvSchedulerError>;
} }
/// A KvRouter only decides which worker you should use. It doesn't send you there.
/// TODO: Rename this to indicate it only selects a worker, it does not route.
pub struct KvRouter { pub struct KvRouter {
indexer: KvIndexer, indexer: KvIndexer,
scheduler: KvScheduler, scheduler: KvScheduler,
...@@ -74,7 +82,7 @@ impl KvRouter { ...@@ -74,7 +82,7 @@ impl KvRouter {
component: Component, component: Component,
block_size: usize, block_size: usize,
selector: Option<Box<dyn WorkerSelector + Send + Sync>>, selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
) -> Result<Arc<Self>> { ) -> Result<Self> {
let cancellation_token = component let cancellation_token = component
.drt() .drt()
.primary_lease() .primary_lease()
...@@ -100,10 +108,7 @@ impl KvRouter { ...@@ -100,10 +108,7 @@ impl KvRouter {
tokio::spawn(async move { tokio::spawn(async move {
while let Some(event) = kv_events_rx.next().await { while let Some(event) = kv_events_rx.next().await {
let event: RouterEvent = match serde_json::from_slice(&event.payload) { let event: RouterEvent = match serde_json::from_slice(&event.payload) {
Ok(event) => { Ok(event) => event,
tracing::debug!("received kv event: {:?}", event);
event
}
Err(e) => { Err(e) => {
tracing::warn!("Failed to deserialize RouterEvent: {:?}", e); tracing::warn!("Failed to deserialize RouterEvent: {:?}", e);
// Choosing warn and continue to process other events from other workers // Choosing warn and continue to process other events from other workers
...@@ -112,16 +117,16 @@ impl KvRouter { ...@@ -112,16 +117,16 @@ impl KvRouter {
} }
}; };
if let Err(e) = kv_events_tx.send(event).await { if let Err(e) = kv_events_tx.send(event).await {
tracing::trace!("failed to send kv event to indexer; shutting down: {:?}", e); tracing::debug!("failed to send kv event to indexer; shutting down: {:?}", e);
} }
} }
}); });
Ok(Arc::new(Self { Ok(Self {
scheduler, scheduler,
indexer, indexer,
block_size, block_size,
})) })
} }
// [TODO] indexer needs to take 'lora_id' as parameter // [TODO] indexer needs to take 'lora_id' as parameter
...@@ -137,28 +142,33 @@ impl KvRouter { ...@@ -137,28 +142,33 @@ impl KvRouter {
let worker_id = self.scheduler.schedule(overlap_scores, isl_tokens).await?; let worker_id = self.scheduler.schedule(overlap_scores, isl_tokens).await?;
Ok(worker_id) Ok(worker_id)
} }
}
#[async_trait] /// Give these tokens, find the worker with the best match in it's KV cache.
impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error> for KvRouter { async fn find_best_match(&self, tokens: &[u32]) -> anyhow::Result<i64> {
async fn generate( let isl_tokens = tokens.len();
&self,
request: SingleIn<RouterRequest>,
) -> Result<ManyOut<Annotated<RouterResponse>>> {
let (request, ctx) = request.into_parts();
let isl_tokens = request.tokens.len();
let block_size = self.block_size; let block_size = self.block_size;
let (complete_blocks, _partial_block) = let (complete_blocks, _partial_block) =
TokenBlockSequence::split_tokens(&request.tokens, block_size, 1337_u64); TokenBlockSequence::split_tokens(tokens, block_size, 1337_u64);
let local_block_hashes = complete_blocks let local_block_hashes = complete_blocks
.into_iter() .into_iter()
.map(|block| LocalBlockHash(block.block_hash())) .map(|block| LocalBlockHash(block.block_hash()))
.collect(); .collect();
let overlap_scores = self.indexer.find_matches(local_block_hashes).await?; let overlap_scores = self.indexer.find_matches(local_block_hashes).await?;
let worker_id = self.scheduler.schedule(overlap_scores, isl_tokens).await?; let worker_id = self.scheduler.schedule(overlap_scores, isl_tokens).await?;
Ok(worker_id)
}
}
#[async_trait]
impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Error> for KvRouter {
async fn generate(
&self,
request: SingleIn<RouterRequest>,
) -> Result<ManyOut<Annotated<RouterResponse>>> {
let (request, ctx) = request.into_parts();
let worker_id = self.find_best_match(&request.tokens).await?;
let response = RouterResponse { worker_id }; let response = RouterResponse { worker_id };
let response = Annotated::from_data(response); let response = Annotated::from_data(response);
...@@ -166,3 +176,35 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er ...@@ -166,3 +176,35 @@ impl AsyncEngine<SingleIn<RouterRequest>, ManyOut<Annotated<RouterResponse>>, Er
Ok(ResponseStream::new(Box::pin(stream), ctx.context())) Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
} }
} }
pub struct KvPushRouter {
inner: PushRouter<BackendInput, Annotated<LLMEngineOutput>>,
chooser: Arc<KvRouter>,
}
impl KvPushRouter {
pub fn new(
inner: PushRouter<BackendInput, Annotated<LLMEngineOutput>>,
chooser: Arc<KvRouter>,
) -> Self {
KvPushRouter { inner, chooser }
}
}
#[async_trait]
impl AsyncEngine<SingleIn<BackendInput>, ManyOut<Annotated<LLMEngineOutput>>, Error>
for KvPushRouter
{
async fn generate(
&self,
request: SingleIn<BackendInput>,
) -> Result<ManyOut<Annotated<LLMEngineOutput>>, Error> {
match &self.inner.client.endpoints {
EndpointSource::Static => self.inner.r#static(request).await,
EndpointSource::Dynamic(_) => {
let worker_id = self.chooser.find_best_match(&request.token_ids).await?;
self.inner.direct(request, worker_id).await
}
}
}
}
...@@ -58,7 +58,6 @@ use std::{ ...@@ -58,7 +58,6 @@ use std::{
}; };
use tokio::sync::{broadcast, mpsc, oneshot}; use tokio::sync::{broadcast, mpsc, oneshot};
use tokio_util::sync::CancellationToken; use tokio_util::sync::CancellationToken;
use tracing as log;
use xxhash_rust::xxh3; use xxhash_rust::xxh3;
pub const XXH3_SEED: u64 = 1337; pub const XXH3_SEED: u64 = 1337;
...@@ -160,6 +159,7 @@ impl RouterEvent { ...@@ -160,6 +159,7 @@ impl RouterEvent {
} }
/// A block in the Radix Tree. /// A block in the Radix Tree.
#[derive(Debug)]
struct RadixBlock { struct RadixBlock {
/// A map of child blocks, keyed by their local block hash. /// A map of child blocks, keyed by their local block hash.
children: HashMap<LocalBlockHash, SharedRadixBlock>, children: HashMap<LocalBlockHash, SharedRadixBlock>,
...@@ -245,7 +245,6 @@ impl RadixTree { ...@@ -245,7 +245,6 @@ impl RadixTree {
let current_borrow = current.borrow(); let current_borrow = current.borrow();
current_borrow.children.get(&block_hash).cloned() current_borrow.children.get(&block_hash).cloned()
}; };
if let Some(block) = next_block { if let Some(block) = next_block {
scores.update_scores(&block.borrow().workers); scores.update_scores(&block.borrow().workers);
...@@ -284,7 +283,7 @@ impl RadixTree { ...@@ -284,7 +283,7 @@ impl RadixTree {
pub fn apply_event(&mut self, event: RouterEvent) { pub fn apply_event(&mut self, event: RouterEvent) {
let (worker_id, event) = (event.worker_id, event.event); let (worker_id, event) = (event.worker_id, event.event);
let (id, op) = (event.event_id, event.data); let (id, op) = (event.event_id, event.data);
log::debug!(id, "Store operation: {:?}", op); tracing::trace!(id, "Store operation: {:?}", op);
let worker_lookup = self.lookup.entry(worker_id).or_default(); let worker_lookup = self.lookup.entry(worker_id).or_default();
...@@ -301,7 +300,7 @@ impl RadixTree { ...@@ -301,7 +300,7 @@ impl RadixTree {
let mut current = match current { let mut current = match current {
Some(current) => current.clone(), Some(current) => current.clone(),
None => { None => {
log::warn!( tracing::warn!(
worker_id = worker_id.to_string(), worker_id = worker_id.to_string(),
id, id,
parent_hash = ?op.parent_hash, parent_hash = ?op.parent_hash,
...@@ -344,7 +343,7 @@ impl RadixTree { ...@@ -344,7 +343,7 @@ impl RadixTree {
} }
} }
KvCacheEventData::Removed(remove) => { KvCacheEventData::Removed(remove) => {
// log::trace!(id, "KV Remove Operation: {:?}", op); // tracing::trace!(id, "KV Remove Operation: {:?}", op);
// let mut worker_lookup = self.lookup.get(&worker_id).expect("Worker not found"); // let mut worker_lookup = self.lookup.get(&worker_id).expect("Worker not found");
for block in remove.block_hashes { for block in remove.block_hashes {
...@@ -355,7 +354,7 @@ impl RadixTree { ...@@ -355,7 +354,7 @@ impl RadixTree {
let entry = match worker_lookup.get(&block) { let entry = match worker_lookup.get(&block) {
Some(entry) => entry.clone(), Some(entry) => entry.clone(),
None => { None => {
log::warn!( tracing::warn!(
worker_id = worker_id.to_string(), worker_id = worker_id.to_string(),
id, id,
"Failed to find block to remove; skipping remove operation" "Failed to find block to remove; skipping remove operation"
...@@ -562,7 +561,7 @@ impl KvIndexer { ...@@ -562,7 +561,7 @@ impl KvIndexer {
} }
_ = cancel.cancelled() => { _ = cancel.cancelled() => {
log::debug!("KvCacheIndexer progress loop shutting down"); tracing::debug!("KvCacheIndexer progress loop shutting down");
return; return;
} }
...@@ -576,7 +575,7 @@ impl KvIndexer { ...@@ -576,7 +575,7 @@ impl KvIndexer {
.unwrap() .unwrap()
})); }));
log::debug!("KvCacheIndexer task completed"); tracing::debug!("KvCacheIndexer task completed");
}); });
let once = OnceLock::new(); let once = OnceLock::new();
...@@ -624,7 +623,7 @@ impl KvIndexerInterface for KvIndexer { ...@@ -624,7 +623,7 @@ impl KvIndexerInterface for KvIndexer {
}; };
if let Err(e) = self.match_tx.send(req).await { if let Err(e) = self.match_tx.send(req).await {
log::error!( tracing::error!(
"Failed to send match request: {:?}; the indexer maybe offline", "Failed to send match request: {:?}; the indexer maybe offline",
e e
); );
...@@ -640,13 +639,13 @@ impl KvIndexerInterface for KvIndexer { ...@@ -640,13 +639,13 @@ impl KvIndexerInterface for KvIndexer {
&self, &self,
tokens: &[u32], tokens: &[u32],
) -> Result<OverlapScores, KvRouterError> { ) -> Result<OverlapScores, KvRouterError> {
log::debug!( tracing::debug!(
"Finding matches for request tokens: {:?} / len: {}", "Finding matches for request tokens: {:?} / len: {}",
tokens, tokens,
tokens.len() tokens.len()
); );
let sequence = compute_block_hash_for_seq(tokens, self.kv_block_size); let sequence = compute_block_hash_for_seq(tokens, self.kv_block_size);
log::debug!("Computed sequence: {:?}", sequence); tracing::debug!("Computed sequence: {:?}", sequence);
self.find_matches(sequence).await self.find_matches(sequence).await
} }
...@@ -748,12 +747,12 @@ impl KvIndexerSharded { ...@@ -748,12 +747,12 @@ impl KvIndexerSharded {
Ok(req) = shard_broadcast_rx.recv() => { Ok(req) = shard_broadcast_rx.recv() => {
let matches = trie.find_matches(req.sequence, req.early_exit); let matches = trie.find_matches(req.sequence, req.early_exit);
if let Err(e) = req.resp.send(matches).await { if let Err(e) = req.resp.send(matches).await {
log::trace!("Failed to send match response: {:?}", e); tracing::trace!("Failed to send match response: {:?}", e);
} }
} }
_ = cancel.cancelled() => { _ = cancel.cancelled() => {
log::debug!("KvCacheIndexer progress loop shutting down"); tracing::debug!("KvCacheIndexer progress loop shutting down");
return; return;
} }
...@@ -767,7 +766,7 @@ impl KvIndexerSharded { ...@@ -767,7 +766,7 @@ impl KvIndexerSharded {
.unwrap() .unwrap()
})); }));
log::debug!("KvCacheIndexer task completed"); tracing::debug!("KvCacheIndexer task completed");
})); }));
} }
......
...@@ -70,10 +70,8 @@ pub async fn collect_endpoints( ...@@ -70,10 +70,8 @@ pub async fn collect_endpoints(
.into_endpoints() .into_endpoints()
.filter(|e| e.subject.starts_with(subject)) .filter(|e| e.subject.starts_with(subject))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
tracing::debug!("Endpoints: {endpoints:?}");
if endpoints.is_empty() { if endpoints.is_empty() {
tracing::warn!("No endpoints found matching subject {subject}"); tracing::debug!("Metrics endpoint not visible yet");
} }
Ok(endpoints) Ok(endpoints)
...@@ -92,7 +90,6 @@ pub async fn collect_endpoints_task( ...@@ -92,7 +90,6 @@ pub async fn collect_endpoints_task(
loop { loop {
tokio::select! { tokio::select! {
_ = cancel.cancelled() => { _ = cancel.cancelled() => {
tracing::debug!("cancellation token triggered");
break; break;
} }
_ = tokio::time::sleep(backoff_delay) => { _ = tokio::time::sleep(backoff_delay) => {
...@@ -106,8 +103,6 @@ pub async fn collect_endpoints_task( ...@@ -106,8 +103,6 @@ pub async fn collect_endpoints_task(
continue; continue;
} }
}; };
tracing::debug!("unfiltered endpoints: {:?}", unfiltered_endpoints);
let endpoints: Vec<Endpoint> = unfiltered_endpoints let endpoints: Vec<Endpoint> = unfiltered_endpoints
.into_iter() .into_iter()
.filter(|s| s.data.is_some()) .filter(|s| s.data.is_some())
...@@ -125,13 +120,7 @@ pub async fn collect_endpoints_task( ...@@ -125,13 +120,7 @@ pub async fn collect_endpoints_task(
} }
) )
.collect(); .collect();
tracing::debug!("endpoints: {:?}", endpoints); tracing::trace!("Found {} endpoints for service: {service_subject}", endpoints.len());
tracing::trace!(
"found {} endpoints for service: {}",
endpoints.len(),
service_subject
);
let processed = ProcessedEndpoints::new(endpoints); let processed = ProcessedEndpoints::new(endpoints);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment