Unverified commit bba70a41, authored by Thomas Montfort, committed by GitHub
Browse files

feat: standalone KV indexer runtime integration (#7295)

parent 3718da8c
......@@ -462,8 +462,9 @@ impl ModelWatcher {
.kv_chooser_for(
&endpoint,
card.kv_cache_block_size,
Some(self.router_config.kv_router_config),
Some(self.router_config.kv_router_config.clone()),
WORKER_TYPE_DECODE, // This is the decode router
Some(card.display_name.clone()),
)
.await?,
)
......@@ -482,7 +483,7 @@ impl ModelWatcher {
.register_prefill_router(&model_name, &namespace)
.map(|rx| {
// Create prefill-specific config with track_active_blocks disabled
let mut prefill_config = self.router_config.kv_router_config;
let mut prefill_config = self.router_config.kv_router_config.clone();
prefill_config.router_track_active_blocks = false;
PrefillRouter::new(
......
......@@ -38,6 +38,7 @@ pub mod publisher;
pub mod push_router;
pub mod queue;
pub mod recorder;
pub mod remote_indexer;
pub mod scheduler;
pub mod sequence;
pub mod subscriber;
......@@ -58,6 +59,7 @@ use crate::{
RouterResponse, TokensWithHashes, WorkerId, WorkerWithDpRank,
compute_block_hash_for_seq,
},
remote_indexer::RemoteIndexer,
scheduler::{KvScheduler, PotentialLoad},
sequence::{SequenceError, SequenceRequest},
},
......@@ -73,7 +75,7 @@ use std::collections::HashSet;
pub const KV_METRICS_ENDPOINT: &str = "load_metrics";
// for metric publishing (push-based)
pub const KV_EVENT_SUBJECT: &str = "kv-events";
pub use dynamo_kv_router::protocols::KV_EVENT_SUBJECT;
pub const KV_METRICS_SUBJECT: &str = "kv_metrics";
// for inter-router comms
......@@ -84,8 +86,8 @@ pub const ACTIVE_SEQUENCES_SUBJECT: &str = "active_sequences_events";
pub const RADIX_STATE_BUCKET: &str = "radix-bucket";
pub const RADIX_STATE_FILE: &str = "radix-state";
// for standalone indexer query
pub const KV_INDEXER_QUERY_ENDPOINT: &str = "kv_indexer_query";
// for standalone indexer query — re-export from shared crate
pub use dynamo_kv_router::indexer::KV_INDEXER_QUERY_ENDPOINT;
// for worker-local kvindexer query
pub const WORKER_KV_INDEXER_BUFFER_SIZE: usize = 1024; // store 1024 most recent events in worker buffer
......@@ -133,19 +135,40 @@ pub enum Indexer {
/// Does not support TTL/pruning.
Concurrent(Arc<ThreadPoolIndexer<ConcurrentRadixTree>>),
/// Forwards queries to a standalone KV indexer service via the request plane.
/// The standalone indexer manages its own radix tree and event subscription.
Remote(Arc<RemoteIndexer>),
/// Used when we do not wish to use the indexer at all (e.g., when overlap_score_weight is 0).
/// Note: This will cause KV events to accumulate in JetStream as we do not regularly purge them.
None,
}
impl Indexer {
pub fn new(
pub async fn new(
component: &dynamo_runtime::component::Component,
kv_router_config: &KvRouterConfig,
block_size: u32,
) -> Self {
model_name: Option<String>,
) -> Result<Self> {
if kv_router_config.overlap_score_weight == 0.0 {
return Indexer::None;
return Ok(Indexer::None);
}
// Remote indexer: forward queries to a standalone KV indexer service.
if let Some(ref indexer_component_name) = kv_router_config.remote_indexer_component {
let model_name = model_name.ok_or_else(|| {
anyhow::anyhow!(
"model_name is required when remote_indexer_component is configured"
)
})?;
tracing::info!(
remote_indexer_component = %indexer_component_name,
model_name,
"Using remote KV indexer"
);
let remote = RemoteIndexer::new(component, indexer_component_name, model_name).await?;
return Ok(Indexer::Remote(Arc::new(remote)));
}
// Approximate mode (--no-kv-events): always use single-threaded KvIndexer
......@@ -159,33 +182,33 @@ impl Indexer {
max_tree_size: kv_router_config.router_max_tree_size,
prune_target_ratio: kv_router_config.router_prune_target_ratio,
});
return Indexer::KvIndexer(KvIndexer::new_with_frequency(
return Ok(Indexer::KvIndexer(KvIndexer::new_with_frequency(
cancellation_token,
None,
block_size,
kv_indexer_metrics,
prune_config,
));
)));
}
if kv_router_config.router_event_threads > 1 {
return Indexer::Concurrent(Arc::new(ThreadPoolIndexer::new(
return Ok(Indexer::Concurrent(Arc::new(ThreadPoolIndexer::new(
ConcurrentRadixTree::new(),
kv_router_config.router_event_threads as usize,
block_size,
)));
))));
}
let kv_indexer_metrics = indexer::KvIndexerMetrics::from_component(component);
let cancellation_token = component.drt().primary_token();
Indexer::KvIndexer(KvIndexer::new_with_frequency(
Ok(Indexer::KvIndexer(KvIndexer::new_with_frequency(
cancellation_token,
None, // expiration_duration for frequency tracking
block_size,
kv_indexer_metrics,
None,
))
)))
}
pub(crate) async fn find_matches(
......@@ -195,6 +218,10 @@ impl Indexer {
match self {
Indexer::KvIndexer(indexer) => indexer.find_matches(sequence).await,
Indexer::Concurrent(tpi) => tpi.find_matches(sequence).await,
Indexer::Remote(remote) => remote.find_matches(sequence).await.map_err(|e| {
tracing::warn!(error = %e, "Remote indexer query failed");
KvRouterError::IndexerOffline
}),
Indexer::None => Ok(OverlapScores::new()),
}
}
......@@ -203,6 +230,7 @@ impl Indexer {
match self {
Indexer::KvIndexer(indexer) => indexer.dump_events().await,
Indexer::Concurrent(tpi) => tpi.dump_events().await,
Indexer::Remote(_) => Ok(Vec::new()),
Indexer::None => {
panic!(
"Cannot dump events: indexer does not exist (is overlap_score_weight set to 0?)"
......@@ -226,6 +254,7 @@ impl Indexer {
tpi.process_routing_decision_for_request(tokens_with_hashes, worker)
.await
}
Indexer::Remote(_) => Ok(()),
Indexer::None => Ok(()),
}
}
......@@ -238,6 +267,7 @@ impl Indexer {
}
}
Indexer::Concurrent(tpi) => tpi.apply_event(event).await,
Indexer::Remote(_) => {} // standalone indexer gets events directly
Indexer::None => {}
}
}
......@@ -252,6 +282,7 @@ impl Indexer {
Indexer::Concurrent(tpi) => {
KvIndexerInterface::remove_worker(tpi.as_ref(), worker_id).await;
}
Indexer::Remote(_) => {} // standalone indexer manages its own workers
Indexer::None => {}
}
}
......@@ -268,6 +299,7 @@ impl Indexer {
resp_rx.await.unwrap_or_default()
}
Indexer::Concurrent(tpi) => tpi.backend().get_workers(),
Indexer::Remote(_) => Vec::new(),
Indexer::None => Vec::new(),
}
}
......@@ -285,6 +317,7 @@ pub struct KvRouter {
}
impl KvRouter {
#[allow(clippy::too_many_arguments)]
pub async fn new(
endpoint: Endpoint,
client: Client,
......@@ -293,13 +326,14 @@ impl KvRouter {
selector: Option<Box<WorkerSelector>>,
kv_router_config: Option<KvRouterConfig>,
worker_type: &'static str,
model_name: Option<String>,
) -> Result<Self> {
let kv_router_config = kv_router_config.unwrap_or_default();
kv_router_config.validate()?;
let component = endpoint.component();
let cancellation_token = component.drt().primary_token();
let indexer = Indexer::new(component, &kv_router_config, block_size);
let indexer = Indexer::new(component, &kv_router_config, block_size, model_name).await?;
// Wait for at least one worker with a known runtime config before starting scheduler
let _ = workers_with_configs
......@@ -319,8 +353,11 @@ impl KvRouter {
)
.await?;
// Start KV event subscription if needed (use_kv_events=true and overlap_score_weight>0)
if kv_router_config.should_subscribe_to_kv_events() {
// Start KV event subscription if needed — skip when using a remote indexer
// (the standalone indexer handles its own event subscription).
if kv_router_config.remote_indexer_component.is_some() {
tracing::info!("Skipping KV event subscription (using remote indexer)");
} else if kv_router_config.should_subscribe_to_kv_events() {
subscriber::start_subscriber(component.clone(), &kv_router_config, indexer.clone())
.await?;
} else {
......
......@@ -218,6 +218,7 @@ impl PrefillRouter {
kv_cache_block_size,
kv_router_config,
WORKER_TYPE_PREFILL,
Some(self.model_name.clone()),
)
.await?;
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use anyhow::Result;
use futures::StreamExt;
use dynamo_runtime::{
component::Component,
pipeline::{ManyOut, RouterMode, SingleIn, network::egress::push_router::PushRouter},
};
use dynamo_kv_router::{
indexer::{IndexerQueryRequest, IndexerQueryResponse, KV_INDEXER_QUERY_ENDPOINT},
protocols::{LocalBlockHash, OverlapScores},
};
/// Client-side handle to a standalone KV indexer reached over the request plane.
///
/// Rather than keeping a radix tree in-process, this type sends each
/// `find_matches` query to the standalone indexer service and returns the
/// overlap scores it reports. It is the indexer used when
/// `remote_indexer_component` is set in the KV router configuration.
pub struct RemoteIndexer {
    /// Push router used to send query requests to the indexer's
    /// `KV_INDEXER_QUERY_ENDPOINT` endpoint.
    router: PushRouter<IndexerQueryRequest, IndexerQueryResponse>,
    /// Model name attached to every query request.
    model_name: String,
    /// Name of the namespace of the component this indexer was created from;
    /// attached to every query request.
    namespace: String,
}
impl RemoteIndexer {
pub async fn new(
component: &Component,
indexer_component_name: &str,
model_name: String,
) -> Result<Self> {
let namespace = component.namespace().name();
let indexer_ns = component.namespace();
let indexer_component = indexer_ns.component(indexer_component_name)?;
let endpoint = indexer_component.endpoint(KV_INDEXER_QUERY_ENDPOINT);
let client = endpoint.client().await?;
let router =
PushRouter::from_client_no_fault_detection(client, RouterMode::RoundRobin).await?;
Ok(Self {
router,
model_name,
namespace,
})
}
pub async fn find_matches(&self, block_hashes: Vec<LocalBlockHash>) -> Result<OverlapScores> {
let request = IndexerQueryRequest {
model_name: self.model_name.clone(),
namespace: self.namespace.clone(),
block_hashes,
};
let mut stream: ManyOut<IndexerQueryResponse> =
self.router.round_robin(SingleIn::new(request)).await?;
match stream.next().await {
Some(IndexerQueryResponse::Scores(scores)) => Ok(scores.into()),
Some(IndexerQueryResponse::Error(msg)) => {
Err(anyhow::anyhow!("Remote indexer error: {}", msg))
}
None => Err(anyhow::anyhow!("Remote indexer returned empty response")),
}
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment