feat(kv-router): split Dynamo-native remote indexer [DYN-2593] (#7973)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat(kv-router): split Dynamo-native remote indexer [DYN-2593] (#7973)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
49eb397a · Yan Ru Pei · GitHub · d232b450 · 49eb397a · 49eb397a
Unverified Commit 49eb397a authored Apr 07, 2026 by Yan Ru Pei Committed by GitHub Apr 08, 2026
20 changed files
--- a/components/src/dynamo/common/configuration/groups/kv_router_args.py
+++ b/components/src/dynamo/common/configuration/groups/kv_router_args.py
@@ -36,7 +36,8 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
    "router_queue_threshold",
    "router_event_threads",
    "router_queue_policy",
-    "remote_indexer_component",
+    "use_remote_indexer",
+    "serve_indexer",
 )
@@ -61,7 +62,8 @@ class KvRouterConfigBase(ConfigBase):
    router_queue_threshold: Optional[float]
    router_event_threads: int
    router_queue_policy: str
-    remote_indexer_component: Optional[str]
+    use_remote_indexer: bool = False
+    serve_indexer: bool = False
    def kv_router_kwargs(self) -> dict:
        """Return a dict suitable for ``KvRouterConfig(**kwargs)``."""
@@ -286,15 +288,14 @@ class KvRouterArgGroup(ArgGroup):
            arg_type=str,
            choices=["fcfs", "wspt"],
        )
-        add_argument(
+        add_negatable_bool_argument(
            g,
-            flag_name="--remote-indexer-component",
+            flag_name="--use-remote-indexer",
-            env_var="DYN_REMOTE_INDEXER_COMPONENT",
+            env_var="DYN_USE_REMOTE_INDEXER",
-            default=None,
+            default=False,
            help=(
-                "[EXPERIMENTAL] KV Router: Component name of a standalone KV indexer to use for overlap scoring. "
+                "[EXPERIMENTAL] KV Router: Query a remote KV indexer served from the worker "
-                "When set, the router queries the standalone indexer via the request plane instead "
+                "component via the request plane instead of maintaining a local radix tree."
-                "of maintaining a local radix tree (e.g. 'kv-indexer')."
            ),
-            arg_type=str,
+            dest="use_remote_indexer",
        )
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -130,6 +130,13 @@ class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase):
                    "--router-prefill-load-model=aic requires "
                    "--router-track-prefill-tokens"
                )
+        if self.serve_indexer:
+            if self.router_mode != "kv":
+                raise ValueError("--serve-indexer requires --router-mode=kv")
+            if self.use_remote_indexer:
+                raise ValueError(
+                    "--serve-indexer and --use-remote-indexer are mutually exclusive"
+                )
 @register_encoder(FrontendConfig)
@@ -193,6 +200,14 @@ class FrontendArgGroup(ArgGroup):
            help="HTTP port for the engine (u16).",
            arg_type=int,
        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--serve-indexer",
+            env_var="DYN_SERVE_INDEXER",
+            default=False,
+            help="Serve this frontend's local KV indexers over the request plane.",
+            dest="serve_indexer",
+        )
        add_argument(
            g,
            flag_name="--tls-cert-path",

--- a/components/src/dynamo/router/args.py
+++ b/components/src/dynamo/router/args.py
@@ -15,7 +15,7 @@ from dynamo.common.configuration.groups.kv_router_args import (
    KvRouterArgGroup,
    KvRouterConfigBase,
 )
-from dynamo.common.configuration.utils import add_argument
+from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
 from dynamo.llm import AicPerfConfig, KvRouterConfig
@@ -25,6 +25,7 @@ class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase):
    namespace: str
    endpoint: str
    router_block_size: int
+    serve_indexer: bool = False
    def validate(self) -> None:
        """Validate config invariants (aligned with Rust KvRouterConfig where applicable)."""
@@ -40,6 +41,10 @@ class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase):
                "Expected format: namespace.component.endpoint"
            )
        self.namespace = parts[0]
+        if self.serve_indexer and self.use_remote_indexer:
+            raise ValueError(
+                "--serve-indexer and --use-remote-indexer are mutually exclusive"
+            )
        if self.router_prefill_load_model == "aic":
            missing = [
                flag
@@ -89,6 +94,15 @@ class DynamoRouterArgGroup(ArgGroup):
            obsolete_flag="--block-size",
        )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--serve-indexer",
+            env_var="DYN_SERVE_INDEXER",
+            default=False,
+            help="Serve this router's local KV indexer over the request plane.",
+            dest="serve_indexer",
+        )
        # KV router options (shared with dynamo.frontend)
        KvRouterArgGroup().add_arguments(parser)
        AicPerfArgGroup().add_arguments(parser)

--- a/container/templates/wheel_builder.Dockerfile
+++ b/container/templates/wheel_builder.Dockerfile
@@ -439,9 +439,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
    uv build --wheel --out-dir /opt/dynamo/dist && \
    cd /opt/dynamo/lib/bindings/python && \
    if [ "$ENABLE_MEDIA_FFMPEG" = "true" ]; then \
-        maturin build --release --features "media-ffmpeg,kv-indexer,kv-indexer-runtime" --out /opt/dynamo/dist; \
+        maturin build --release --features "media-ffmpeg,kv-indexer" --out /opt/dynamo/dist; \
    else \
-        maturin build --release --features "kv-indexer,kv-indexer-runtime" --out /opt/dynamo/dist; \
+        maturin build --release --features "kv-indexer" --out /opt/dynamo/dist; \
    fi && \
    /tmp/use-sccache.sh show-stats "Dynamo Runtime"

--- a/docs/components/router/router-guide.md
+++ b/docs/components/router/router-guide.md
@@ -42,7 +42,7 @@ When using KV routing, the router needs to know what each worker has cached. The
 |------------|---------------|-------------|
 | **NATS Core (local indexer)** | Default (no extra flags) | Workers maintain a local indexer; router queries workers on startup and receives events via NATS Core |
 | **JetStream (durable)** | `--router-durable-kv-events` | Events persisted in NATS JetStream; supports snapshots and durable consumers. *Deprecated.* |
-| **ZMQ** | `--event-plane zmq` | Workers publish via ZMQ PUB sockets; standalone indexer aggregates events |
+| **ZMQ** | `--event-plane zmq` | Workers publish via ZMQ PUB sockets; the standalone `dynamo.indexer` service aggregates events |
 | **Approximate (no events)** | `--no-router-kv-events` | No events consumed; router predicts cache state from its own routing decisions with TTL-based expiration |
 ### Aggregated vs. Disaggregated Topology
@@ -93,6 +93,8 @@ Backend workers register themselves using the `register_model` API, after which
 | `--router-prefill-load-model <none\|aic>` | `none` | Prompt-side load model. `aic` decays only the oldest active prefill using an AIC-predicted duration |
 | `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` |
 | `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
+| `--serve-indexer` | `false` | Serve this frontend/router's local KV indexer as the Dynamo-native remote indexer, exposed as the `kv_indexer_query` endpoint on the worker component |
+| `--use-remote-indexer` | `false` | Query the worker component's served remote indexer instead of maintaining a local overlap indexer |
 For all available options: `python -m dynamo.frontend --help`
@@ -444,6 +446,63 @@ graph TD
 For improved fault tolerance, you can launch multiple frontend + router replicas. If multiple `dynamo.frontend` processes share the same host or network namespace, give each instance a different HTTP port. In Kubernetes or on separate hosts, replicas can usually reuse the same container port. Alternatively, you can deploy the router separately as the standalone `python -m dynamo.router` service; see the [Standalone Router README](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/router/README.md).
+### Dynamo-Native Remote Indexer
+For Dynamo-native deployments, the remote indexer is served by `dynamo.frontend` or `dynamo.router`, not by `dynamo.indexer`.
+- Use `--serve-indexer` on router/frontend replicas that should expose `kv_indexer_query` from the worker component.
+- Use `--use-remote-indexer` on consumer routers/frontends that should query that served endpoint instead of maintaining a local overlap indexer.
+- `dynamo.indexer` remains the standalone HTTP + ZMQ microservice for non-Dynamo / direct-ZMQ deployments.
+Frontend example:
+```bash
+# Serving frontend (exposes the remote indexer)
+python -m dynamo.frontend --router-mode kv --serve-indexer
+# Consumer frontend
+python -m dynamo.frontend --router-mode kv --use-remote-indexer
+```
+The served service is request-plane only. Each serving router/frontend keeps its normal local KV event ingestion, gap detection, and worker-query recovery path; remote consumers only issue hash-based overlap queries.
+Approximate mode (`--no-router-kv-events`) is singleton-only for remote serving: only one `--serve-indexer` replica may exist for a given worker component. Event-driven mode allows multiple serving replicas behind the same worker component.
+```mermaid
+graph TD
+    subgraph "Workers"
+        W1["Worker 1"]
+        W2["Worker 2"]
+    end
+    subgraph "Event Plane"
+        EP["KV Events"]
+    end
+    subgraph "Serving Routers / Frontends"
+        S1["Router / Frontend A<br/>--serve-indexer"]
+        S2["Router / Frontend B<br/>--serve-indexer"]
+        I1["Local Indexer"]
+        I2["Local Indexer"]
+    end
+    subgraph "Request Plane"
+        RP["backend.kv_indexer_query"]
+    end
+    C["Consumer Router / Frontend<br/>--use-remote-indexer"]
+    W1 --> EP
+    W2 --> EP
+    EP --> S1
+    EP --> S2
+    S1 --> I1
+    S2 --> I2
+    C --> RP
+    RP --> S1
+    RP --> S2
+```
 ### Router State Management
 The KV Router tracks two types of state (see [Router Design](../../design-docs/router-design.md) for details):

--- a/docs/components/router/standalone-indexer.md
+++ b/docs/components/router/standalone-indexer.md
@@ -7,13 +7,16 @@ subtitle: Run the KV cache indexer as an independent HTTP service for querying b
 ## Overview
-The standalone KV indexer (`python -m dynamo.indexer`) is a lightweight service that maintains a radix tree of cached blocks and exposes HTTP endpoints for querying and managing workers. It supports two operational modes:
+The standalone KV indexer (`python -m dynamo.indexer`) is a lightweight service that maintains a radix tree of cached blocks and exposes HTTP endpoints for querying and managing workers.
- **Standalone mode** (default): subscribes to ZMQ KV event streams directly from workers. No Dynamo runtime discovery, registration, or event-plane integration required.
+- It subscribes to ZMQ KV event streams directly from workers.
- **Dynamo runtime mode** (`--dynamo-runtime`): integrates with the Dynamo runtime for automatic worker discovery via MDC, KV event ingestion via the event plane (NATS or ZMQ), and overlap queries over the request plane for remote frontends.
+- It exposes an HTTP API for registration, inspection, and overlap queries.
+- It preserves P2P recovery and gap detection/replay for the standalone ZMQ path.
 This is distinct from the [Standalone Router](../../../components/src/dynamo/router/README.md), which is a full routing service. The standalone indexer provides only the indexing and query layer without routing logic.
+For Dynamo-native remote indexing, use `--serve-indexer` on `dynamo.frontend` or `dynamo.router` and `--use-remote-indexer` on consumers instead. That request-plane service reuses the router's existing event ingestion and recovery machinery; it is not implemented by `dynamo.indexer`.
 The HTTP API follows the [Mooncake KV Indexer RFC](https://github.com/kvcache-ai/Mooncake/issues/1403) conventions.
 `DYN_ROUTER_MIN_INITIAL_WORKERS` is also honored here. When set to a positive integer, the
@@ -30,9 +33,7 @@ The indexer maintains one radix tree per `(model_name, tenant_id)` pair. Workers
 ## Compatibility
-In standalone mode, the indexer works with any engine that publishes KV cache events over ZMQ in the expected msgpack format. This includes bare vLLM and SGLang engines, which emit ZMQ KV events natively — no Dynamo-specific wrapper is required.
+The standalone indexer works with any engine that publishes KV cache events over ZMQ in the expected msgpack format. This includes bare vLLM and SGLang engines, which emit ZMQ KV events natively — no Dynamo-specific wrapper is required.
-In Dynamo runtime mode, the indexer discovers workers automatically via MDC and receives KV events through the event plane. It also registers a query endpoint on the request plane, allowing frontends to query overlap scores remotely without needing direct HTTP access.
 ## Use Cases
@@ -40,7 +41,7 @@ In Dynamo runtime mode, the indexer discovers workers automatically via MDC and
 - **State verification**: Confirm that the indexer's view of KV cache state matches the router's internal state (used in integration tests).
 - **Custom routing**: Build external routing logic that queries the indexer for overlap scores and makes its own worker selection decisions.
 - **Monitoring**: Observe KV cache distribution across workers without running a full router.
- **Remote indexing**: In Dynamo runtime mode, frontends can offload KV cache indexing to a dedicated service and query it over the request plane.
+- **Standalone microservice**: Run an indexer independently of the router/frontend when you want direct HTTP inspection and ZMQ-based ingestion.
 ## P2P Recovery
@@ -91,7 +92,6 @@ The service is exposed through the Python bindings package and launched with `py
 |---------|-------------|
 | `kv-indexer` | Core standalone indexer service path (`python -m dynamo.indexer`: HTTP API, ZMQ listeners, P2P recovery) |
 | `kv-indexer-metrics` | Optional `/metrics` endpoint |
-| `kv-indexer-runtime` | Dynamo runtime integration (`--dynamo-runtime`, discovery, event plane, request plane) |
 ### Standalone build
@@ -109,30 +109,12 @@ cd lib/bindings/python && VIRTUAL_ENV=../../.venv ../../.venv/bin/maturin develo
 This keeps the default `kv-indexer` build lean while still allowing Prometheus metrics when needed.
-### Runtime-enabled build
-```bash
-cd lib/bindings/python && VIRTUAL_ENV=../../.venv ../../.venv/bin/maturin develop --uv --features kv-indexer,kv-indexer-runtime
-```
-This enables the `--dynamo-runtime` CLI flag for MDC discovery, event-plane subscription, and request-plane queries. It also includes the metrics endpoint.
 ## CLI
-### Standalone mode (default)
 ```bash
 python -m dynamo.indexer --port 8090 [--threads 4] [--block-size 16 --model-name my-model --tenant-id default --workers "1=tcp://host:5557,2:1=tcp://host:5558"] [--peers "http://peer1:8090,http://peer2:8091"]
 ```
-### Dynamo runtime mode
-```bash
-python -m dynamo.indexer --dynamo-runtime --namespace default --component-name kv-indexer --worker-component backend --port 8090 [--threads 4]
-```
-In runtime mode, workers are discovered automatically via MDC. The `--workers` flag can still be used to register additional static workers alongside discovered ones.
 | Flag | Default | Description |
 |------|---------|-------------|
 | `--block-size` | (none) | KV cache block size for initial `--workers` (required when `--workers` is set) |
@@ -142,10 +124,6 @@ In runtime mode, workers are discovered automatically via MDC. The `--workers` f
 | `--model-name` | `default` | Model name for initial `--workers` |
 | `--tenant-id` | `default` | Tenant ID for initial `--workers` |
 | `--peers` | (none) | Comma-separated peer indexer URLs for P2P recovery on startup |
-| `--dynamo-runtime` | `false` | Enable Dynamo runtime integration (requires `kv-indexer-runtime`) |
-| `--namespace` | `default` | Dynamo namespace to register the indexer component under |
-| `--component-name` | `kv-indexer` | Component name for this indexer in the Dynamo runtime |
-| `--worker-component` | `backend` | Component name that workers register under for event-plane subscription |
 ### Shared Startup Gate
@@ -165,7 +143,7 @@ curl http://localhost:8090/health
 ### `GET /metrics` — Prometheus metrics
-Returns metrics in Prometheus text exposition format. Available when the Python bindings are built with the `kv-indexer-metrics` or `kv-indexer-runtime` feature.
+Returns metrics in Prometheus text exposition format. Available when the Python bindings are built with the `kv-indexer-metrics` feature.
 ```bash
 curl http://localhost:8090/metrics
@@ -400,38 +378,9 @@ If no `replay_endpoint` is configured, gaps are logged as warnings but not recov
 The sequence counter (`last_seq`) persists across unregister/register cycles, so re-registering a worker after a gap will trigger replay on the first batch received by the new listener.
-## Dynamo Runtime Mode
-When started with `--dynamo-runtime`, the indexer integrates with the Dynamo distributed runtime:
-### Worker Discovery
-The indexer watches MDC (Model Discovery Catalog) for worker additions and removals. When a worker registers with MDC, the indexer automatically creates an indexer for its model and block size. Workers discovered via MDC are tracked separately from those registered via `--workers` or the `/register` HTTP API; a worker cannot be registered through both paths simultaneously.
-### Event Plane Subscription
-Instead of connecting directly to ZMQ PUB sockets on each worker, the indexer subscribes to KV events through the Dynamo event plane. The transport (NATS or ZMQ) is determined by the `DYNAMO_EVENT_TRANSPORT` environment variable. Events are routed to the appropriate indexer based on the worker ID.
-### Request Plane Query Endpoint
-The indexer registers a query endpoint on the Dynamo request plane, allowing frontends to send `IndexerQueryRequest` messages containing a model name, namespace, and block hashes. The indexer looks up the appropriate radix tree and returns overlap scores. This enables frontends to use a remote indexer for KV-aware routing without direct HTTP access.
-### Example
-```bash
-# Start the indexer with runtime integration
-python -m dynamo.indexer --dynamo-runtime \
-  --namespace my-namespace \
-  --component-name kv-indexer \
-  --worker-component backend \
-  --port 8090 --threads 4
-```
-The HTTP API remains fully available in runtime mode. Static workers can be added via `--workers` alongside discovered workers.
 ## Limitations
- **Standalone mode is ZMQ only**: In standalone mode, workers must publish KV events via ZMQ PUB sockets. Build with `kv-indexer-runtime` and use `--dynamo-runtime` to receive events via the event plane (NATS or ZMQ).
+- **ZMQ only**: Workers must publish KV events via ZMQ PUB sockets.
 - **No routing logic**: The indexer only maintains the radix tree and answers queries. It does not track active blocks, manage request lifecycle, or perform worker selection.
 ## Architecture
@@ -471,62 +420,6 @@ graph TD
    style CLIENT fill:#fff3e0,stroke:#333,color:#333
 ```
-### Dynamo Runtime Mode
-```mermaid
-graph TD
-    subgraph Workers
-        W1[Worker 1]
-        W2[Worker 2]
-    end
-    subgraph "Dynamo Runtime"
-        MDC[MDC Discovery]
-        EP[Event Plane<br/>NATS / ZMQ]
-        RP[Request Plane]
-    end
-    subgraph "Standalone Indexer"
-        DISC[Discovery Watcher]
-        SUB[Event Subscriber]
-        REG[Worker Registry]
-        IDX["Indexer Map<br/>(model, tenant) → Radix Tree"]
-        QE[Query Endpoint]
-        HTTP[HTTP API<br/>/query /dump /register /metrics]
-    end
-    FRONTEND[Frontend / Router]
-    CLIENT[External Client]
-    W1 -->|register| MDC
-    W2 -->|register| MDC
-    MDC -->|added/removed| DISC
-    DISC -->|add/remove workers| REG
-    W1 -->|KV events| EP
-    W2 -->|KV events| EP
-    EP -->|RouterEvent| SUB
-    SUB -->|apply events| IDX
-    FRONTEND -->|IndexerQueryRequest| RP
-    RP --> QE
-    QE -->|query| IDX
-    CLIENT -->|POST /query, GET /dump| HTTP
-    HTTP -->|query| IDX
-    style W1 fill:#f3e5f5,stroke:#333,color:#333
-    style W2 fill:#f3e5f5,stroke:#333,color:#333
-    style MDC fill:#e3f2fd,stroke:#333,color:#333
-    style EP fill:#e3f2fd,stroke:#333,color:#333
-    style RP fill:#e3f2fd,stroke:#333,color:#333
-    style IDX fill:#2e8b57,stroke:#333,color:#fff
-    style SUB fill:#2e8b57,stroke:#333,color:#fff
-    style DISC fill:#2e8b57,stroke:#333,color:#fff
-    style REG fill:#2e8b57,stroke:#333,color:#fff
-    style QE fill:#2e8b57,stroke:#333,color:#fff
-    style HTTP fill:#2e8b57,stroke:#333,color:#fff
-    style FRONTEND fill:#fff3e0,stroke:#333,color:#333
-    style CLIENT fill:#fff3e0,stroke:#333,color:#333
-```
 ### P2P Recovery Flow
 ```mermaid

--- a/lib/bindings/python/Cargo.toml
+++ b/lib/bindings/python/Cargo.toml
@@ -25,7 +25,6 @@ crate-type = ["cdylib", "rlib"]
 default = []
 media-ffmpeg = ["dynamo-llm/media-ffmpeg"]
 kv-indexer = ["dep:clap", "dep:tracing-subscriber"]
-kv-indexer-runtime = ["kv-indexer", "dynamo-kv-router/indexer-runtime"]
 kv-indexer-metrics = ["kv-indexer", "dynamo-kv-router/metrics"]
 nvtx = ["dynamo-runtime/nvtx"]

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -126,7 +126,7 @@ impl AicPerfConfig {
 #[pymethods]
 impl KvRouterConfig {
    #[new]
-    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_prefill_load_model="none", router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_queue_policy="fcfs", remote_indexer_component=None))]
+    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_prefill_load_model="none", router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_queue_policy="fcfs", use_remote_indexer=false, serve_indexer=false))]
    #[allow(clippy::too_many_arguments)]
    fn new(
        overlap_score_weight: f64,
@@ -147,7 +147,8 @@ impl KvRouterConfig {
        router_queue_threshold: Option<f64>,
        router_event_threads: u32,
        router_queue_policy: &str,
-        remote_indexer_component: Option<String>,
+        use_remote_indexer: bool,
+        serve_indexer: bool,
    ) -> Self {
        KvRouterConfig {
            inner: RsKvRouterConfig {
@@ -176,7 +177,8 @@ impl KvRouterConfig {
                router_queue_policy: router_queue_policy.parse().unwrap_or_else(|_| {
                    panic!("invalid router_queue_policy: {router_queue_policy:?}")
                }),
-                remote_indexer_component,
+                use_remote_indexer,
+                serve_indexer,
            },
        }
    }

--- a/lib/bindings/python/rust/llm/kv.rs
+++ b/lib/bindings/python/rust/llm/kv.rs
@@ -16,8 +16,6 @@ use clap::Parser;
 use dynamo_kv_router::config::{KvRouterConfig, RouterConfigOverride};
 use dynamo_kv_router::protocols::compute_block_hash_for_seq;
 use dynamo_kv_router::protocols::*;
-#[cfg(feature = "kv-indexer-runtime")]
-use dynamo_kv_router::standalone_indexer::RuntimeConfig;
 #[cfg(feature = "kv-indexer")]
 use dynamo_kv_router::standalone_indexer::{self, IndexerConfig};
 use rs::pipeline::{AsyncEngine, SingleIn};
@@ -71,26 +69,6 @@ struct KvIndexerCli {
    /// Comma-separated peer URLs for P2P recovery (e.g. "http://host1:8090,http://host2:8091")
    #[arg(long)]
    peers: Option<String>,
-    /// Enable Dynamo runtime integration (discovery, event plane, request plane).
-    #[cfg(feature = "kv-indexer-runtime")]
-    #[arg(long)]
-    dynamo_runtime: bool,
-    /// Dynamo namespace to register the indexer component under.
-    #[cfg(feature = "kv-indexer-runtime")]
-    #[arg(long, default_value = "default")]
-    namespace: String,
-    /// Component name for this indexer in the Dynamo runtime.
-    #[cfg(feature = "kv-indexer-runtime")]
-    #[arg(long, default_value = "kv-indexer")]
-    component_name: String,
-    /// Component name that workers register under.
-    #[cfg(feature = "kv-indexer-runtime")]
-    #[arg(long, default_value = "backend")]
-    worker_component: String,
 }
 pub fn run_kv_indexer_cli<I, T>(args: I) -> anyhow::Result<()>
@@ -105,31 +83,6 @@ where
                .chain(args.into_iter().map(Into::into)),
        )?;
-        #[cfg(feature = "kv-indexer-runtime")]
-        if cli.dynamo_runtime {
-            dynamo_runtime::logging::init();
-            let worker = dynamo_runtime::Worker::from_settings()?;
-            return worker.execute(move |runtime| {
-                standalone_indexer::run_with_runtime(
-                    runtime,
-                    IndexerConfig {
-                        block_size: cli.block_size,
-                        port: cli.port,
-                        threads: cli.threads,
-                        workers: cli.workers,
-                        model_name: cli.model_name,
-                        tenant_id: cli.tenant_id,
-                        peers: cli.peers,
-                    },
-                    RuntimeConfig {
-                        namespace: cli.namespace,
-                        component_name: cli.component_name,
-                        worker_component: cli.worker_component,
-                    },
-                )
-            });
-        }
        init_standalone_logging();
        let rt = tokio::runtime::Runtime::new()?;
@@ -732,11 +685,11 @@ async fn create_kv_router_from_endpoint(
        llm_rs::discovery::WORKER_TYPE_DECODE
    };
-    // Query discovery once so we can derive both model_name (for remote indexer)
+    // Query discovery once so we can derive both model_name (for remote/served indexer)
    // and Eagle routing semantics from the model card.
    let needs_model_name = kv_router_config
        .as_ref()
-        .map(|cfg| cfg.remote_indexer_component.is_some())
+        .map(|cfg| cfg.use_remote_indexer || cfg.serve_indexer)
        .unwrap_or(false);
    let (model_name, enable_eagle) = {
        let discovery = endpoint.inner.component().drt().discovery();

--- a/lib/bindings/python/src/dynamo/prometheus_names.py
+++ b/lib/bindings/python/src/dynamo/prometheus_names.py
@@ -287,6 +287,10 @@ class router:
    # Total number of requests processed by the router
    REQUESTS_TOTAL = "router_requests_total"
+    # Total number of remote indexer overlap queries that failed
+    REMOTE_INDEXER_QUERY_FAILURES_TOTAL = "router_remote_indexer_query_failures_total"
+    # Total number of remote indexer routing-decision writes that failed
+    REMOTE_INDEXER_WRITE_FAILURES_TOTAL = "router_remote_indexer_write_failures_total"
    # Time to first token observed at the router (seconds)
    TIME_TO_FIRST_TOKEN_SECONDS = "router_time_to_first_token_seconds"
    # Average inter-token latency observed at the router (seconds)

--- a/lib/bindings/python/tests/replay/replay_utils.py
+++ b/lib/bindings/python/tests/replay/replay_utils.py
@@ -86,7 +86,7 @@ def _router_config_payload():
        "router_prune_target_ratio": 0.8,
        "router_enable_cache_control": False,
        "skip_initial_worker_wait": False,
-        "remote_indexer_component": None,
+        "use_remote_indexer": False,
    }

--- a/lib/kv-router/Cargo.toml
+++ b/lib/kv-router/Cargo.toml
@@ -18,7 +18,6 @@ metrics = ["dep:prometheus"]
 runtime-protocols = ["dep:dynamo-runtime"]
 bench = []
 standalone-indexer = ["dep:axum", "dep:serde_json", "dep:reqwest", "dep:zmq"]
-indexer-runtime = ["metrics", "runtime-protocols", "standalone-indexer"]
 [dependencies]
 # repo

--- a/lib/kv-router/src/indexer/kv_indexer.rs
+++ b/lib/kv-router/src/indexer/kv_indexer.rs
@@ -510,7 +510,7 @@ impl KvIndexerInterface for KvIndexer {
        let local_hashes = tokens_with_hashes.get_or_compute_block_hashes().to_vec();
        let sequence_hashes = tokens_with_hashes.get_or_compute_seq_hashes().to_vec();
-        self.process_routing_decision_internal(worker, local_hashes, sequence_hashes)
+        self.process_routing_decision_with_hashes(worker, local_hashes, sequence_hashes)
            .await
    }
    async fn flush(&self) -> usize {
@@ -526,8 +526,8 @@ impl KvIndexerInterface for KvIndexer {
 }
 impl KvIndexer {
-    /// Internal method to process a routing decision with pre-computed hashes.
+    /// Process a routing decision with pre-computed hashes.
-    async fn process_routing_decision_internal(
+    pub async fn process_routing_decision_with_hashes(
        &self,
        worker: WorkerWithDpRank,
        local_hashes: Vec<LocalBlockHash>,

--- a/lib/kv-router/src/indexer/sharded.rs
+++ b/lib/kv-router/src/indexer/sharded.rs
@@ -353,6 +353,21 @@ impl KvIndexerSharded {
    ) -> Self {
        Self::new_with_frequency(token, num_shards, None, kv_block_size, metrics, None)
    }
+    /// Return the shard that owns `worker_id`, assigning one on first sight.
+    ///
+    /// A worker that is not yet present in `worker_assignments` is placed on
+    /// the shard with the fewest assigned workers, and that shard's entry in
+    /// `worker_counts` is incremented.
+    ///
+    /// Panics if the `worker_counts` mutex is poisoned or if there are no shards.
+    fn shard_for_worker(&self, worker_id: WorkerId) -> usize {
+        *self.worker_assignments.entry(worker_id).or_insert_with(|| {
+            // Hold a single guard across both the selection and the increment so
+            // two first-seen workers cannot both pick the same least-loaded shard
+            // in the window between an unlock and a re-lock.
+            let mut worker_counts = self.worker_counts.lock().unwrap();
+            let selected_shard = worker_counts
+                .iter()
+                .enumerate()
+                .min_by_key(|&(_, value)| value)
+                .unwrap()
+                .0;
+            worker_counts[selected_shard] += 1;
+            selected_shard
+        })
+    }
 }
 #[async_trait]
@@ -439,26 +454,8 @@ impl KvIndexerInterface for KvIndexerSharded {
    }
    async fn apply_event(&self, event: RouterEvent) {
-        let shard = self
+        let shard = self.shard_for_worker(event.worker_id);
-            .worker_assignments
+        self.event_tx[shard].send(event).await.unwrap();
-            .entry(event.worker_id)
-            .or_insert_with(|| {
-                // Get the shard with the smallest amount of workers.
-                let worker_counts = self.worker_counts.lock().unwrap();
-                let selected_shard = worker_counts
-                    .iter()
-                    .enumerate()
-                    .min_by_key(|&(_, value)| value)
-                    .unwrap()
-                    .0;
-                drop(worker_counts);
-                // Increment the count for this shard
-                self.worker_counts.lock().unwrap()[selected_shard] += 1;
-                selected_shard
-            });
-        self.event_tx[*shard].send(event).await.unwrap();
    }
    async fn remove_worker(&self, worker: WorkerId) {
@@ -525,7 +522,7 @@ impl KvIndexerInterface for KvIndexerSharded {
        let local_hashes = tokens_with_hashes.get_or_compute_block_hashes().to_vec();
        let sequence_hashes = tokens_with_hashes.get_or_compute_seq_hashes().to_vec();
-        self.process_routing_decision_internal(worker, local_hashes, sequence_hashes)
+        self.process_routing_decision_with_hashes(worker, local_hashes, sequence_hashes)
            .await
    }
@@ -550,19 +547,14 @@ impl KvIndexerInterface for KvIndexerSharded {
 }
 impl KvIndexerSharded {
-    /// Internal method to process a routing decision with pre-computed hashes.
+    /// Process a routing decision with pre-computed hashes.
-    async fn process_routing_decision_internal(
+    pub async fn process_routing_decision_with_hashes(
        &self,
        worker: WorkerWithDpRank,
        local_hashes: Vec<LocalBlockHash>,
        sequence_hashes: Vec<SequenceHash>,
    ) -> Result<(), KvRouterError> {
-        // Route to the appropriate shard based on worker assignment
+        let shard_idx = self.shard_for_worker(worker.worker_id);
-        let shard_idx = self
-            .worker_assignments
-            .get(&worker.worker_id)
-            .map(|shard_idx| *shard_idx)
-            .unwrap_or_default();
        self.routing_tx[shard_idx]
            .send(RoutingDecisionRequest {

--- a/lib/kv-router/src/indexer/tests.rs
+++ b/lib/kv-router/src/indexer/tests.rs
@@ -13,6 +13,7 @@ use super::concurrent_radix_tree::ConcurrentRadixTree;
 use super::concurrent_radix_tree_compressed::ConcurrentRadixTreeCompressed;
 use super::positional::PositionalIndexer;
 use super::*;
+use crate::indexer::pruning::PruneConfig;
 use crate::protocols::*;
 use crate::test_utils::{remove_event, router_event, stored_blocks_with_sequence_hashes};
@@ -1889,6 +1890,37 @@ fn make_tree_indexer_with_frequency(
    }
 }
+#[tokio::test]
+async fn test_sharded_routing_decision_assigns_first_seen_worker() { // records a routing decision for a worker with no prior events, checks its score, then removes it
+    let token = CancellationToken::new();
+    let metrics = Arc::new(KvIndexerMetrics::new_unregistered());
+    let index = KvIndexerSharded::new_with_frequency(
+        token,
+        4, // NOTE(review): presumably the shard count -- confirm against `new_with_frequency`
+        Some(Duration::from_secs(60)),
+        32,
+        metrics,
+        Some(PruneConfig::default()),
+    );
+    let worker = WorkerWithDpRank::new(42, 0); // no event is applied for this worker before the routing decision below
+    let local_hashes = vec![LocalBlockHash(11), LocalBlockHash(22)];
+    let sequence_hashes = compute_seq_hash_for_block(&local_hashes);
+    index
+        .process_routing_decision_with_hashes(worker, local_hashes.clone(), sequence_hashes)
+        .await
+        .unwrap(); // the first routing decision for a not-yet-seen worker must return Ok
+    flush_and_settle(&index).await;
+    assert_score(&index, &[11, 22], worker, 2).await; // presumably asserts the worker's score for hashes [11, 22] is 2 -- see `assert_score`
+    index.remove_worker(worker.worker_id).await;
+    flush_and_settle(&index).await;
+    let scores = query_scores(&index, &[11, 22]).await;
+    assert!(!scores.scores.contains_key(&worker)); // after removal the worker must no longer appear in the scores
+}
 mod tree_specific_tests {
    use super::*;
    use rstest_reuse::apply;

--- a/lib/kv-router/src/indexer/types.rs
+++ b/lib/kv-router/src/indexer/types.rs
@@ -110,14 +110,14 @@ impl dynamo_runtime::protocols::maybe_error::MaybeError for WorkerKvQueryRespons
 /// Endpoint name for the standalone KV indexer query service.
 pub const KV_INDEXER_QUERY_ENDPOINT: &str = "kv_indexer_query";
+/// Endpoint name for recording approximate-mode routing decisions on a remote indexer.
+pub const KV_INDEXER_RECORD_ROUTING_DECISION_ENDPOINT: &str = "kv_indexer_record_routing_decision";
-/// Request to query the standalone KV indexer for overlap scores.
+/// Request to query a served KV indexer for overlap scores.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct IndexerQueryRequest {
    /// Model name to query the indexer for.
    pub model_name: String,
-    /// Dynamo namespace (used as tenant_id for indexer lookup).
-    pub namespace: String,
    /// Block hashes to find matches for in the radix tree.
    pub block_hashes: Vec<LocalBlockHash>,
 }
@@ -153,7 +153,7 @@ impl From<WireOverlapScores> for OverlapScores {
    }
 }
-/// Response from the standalone KV indexer.
+/// Response from a served KV indexer query.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub enum IndexerQueryResponse {
    /// Overlap scores per worker.
@@ -191,6 +191,57 @@ impl dynamo_runtime::protocols::maybe_error::MaybeError for IndexerQueryResponse
    }
 }
+/// Request to record a routing decision on a served approximate-mode indexer; `worker`, `local_hashes` and `sequence_hashes` have the same names and types as the parameters of `KvIndexerSharded::process_routing_decision_with_hashes`.
+#[derive(Serialize, Deserialize, Debug, Clone)] // serde derives make the request (de)serializable
+pub struct IndexerRecordRoutingDecisionRequest {
+    /// Model name to update.
+    pub model_name: String,
+    /// Selected worker (as a `WorkerWithDpRank`) for this routing decision.
+    pub worker: WorkerWithDpRank,
+    /// Locally-computed block hashes for the routed request.
+    pub local_hashes: Vec<LocalBlockHash>,
+    /// Locally-computed rolling sequence hashes for the routed request.
+    pub sequence_hashes: Vec<SequenceHash>,
+}
+/// Response from a served approximate-mode routing-decision endpoint.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub enum IndexerRecordRoutingDecisionResponse {
+    Recorded, // carries no payload; `MaybeError::err` returns `None` for this variant
+    Error(String), // holds an error message; `MaybeError::err` turns it into an error value
+}
+impl MaybeError for IndexerRecordRoutingDecisionResponse {
+    /// Build the `Error` variant from the display text of `err`.
+    fn from_err(err: impl std::error::Error + 'static) -> Self {
+        Self::Error(err.to_string())
+    }
+    /// Return the stored message wrapped in a boxed `std::io::Error`,
+    /// or `None` when the response is not the `Error` variant.
+    fn err(&self) -> Option<Box<dyn std::error::Error + Send + Sync>> {
+        if let Self::Error(message) = self {
+            Some(Box::new(std::io::Error::other(message.clone())))
+        } else {
+            None
+        }
+    }
+}
+#[cfg(feature = "runtime-protocols")]
+impl dynamo_runtime::protocols::maybe_error::MaybeError for IndexerRecordRoutingDecisionResponse {
+    /// Build the `Error` variant from the display text of `err`.
+    fn from_err(err: impl std::error::Error + 'static) -> Self {
+        Self::Error(err.to_string())
+    }
+    /// Return a `DynamoError` built from the stored message,
+    /// or `None` when the response is not the `Error` variant.
+    fn err(&self) -> Option<dynamo_runtime::error::DynamoError> {
+        if let Self::Error(message) = self {
+            Some(dynamo_runtime::error::DynamoError::msg(message.clone()))
+        } else {
+            None
+        }
+    }
+}
 /// A request to find matches in the Radix Tree.
 pub struct MatchRequest {
    /// A vector of `LocalBlockHash` representing the sequence to match.

--- a/lib/kv-router/src/scheduling/config.rs
+++ b/lib/kv-router/src/scheduling/config.rs
@@ -204,12 +204,16 @@ pub struct KvRouterConfig {
    /// "wspt": weighted shortest processing time (Smith's rule) — optimizes average TTFT.
    pub router_queue_policy: RouterQueuePolicy,
-    /// Component name of a standalone KV indexer to use for overlap scoring.
+    /// Whether to query a remote KV indexer served from the worker component
-    /// When set, the router creates a `Remote` indexer that queries the standalone
+    /// instead of maintaining a local radix tree for overlap scoring.
-    /// indexer via the request plane instead of maintaining a local radix tree.
-    /// The standalone indexer handles its own event subscription and discovery.
    #[serde(default)]
-    pub remote_indexer_component: Option<String>,
+    pub use_remote_indexer: bool,
+    /// Whether this router should serve its local indexer from the worker component.
+    /// This enables other routers/frontends in the same namespace to query
+    /// overlap scores remotely over the request plane by component + endpoint.
+    #[serde(default)]
+    pub serve_indexer: bool,
 }
 impl Default for KvRouterConfig {
@@ -234,7 +238,8 @@ impl Default for KvRouterConfig {
            router_event_threads: 4,
            skip_initial_worker_wait: false,
            router_queue_policy: RouterQueuePolicy::default(),
-            remote_indexer_component: None,
+            use_remote_indexer: false,
+            serve_indexer: false,
        }
    }
 }
@@ -268,6 +273,16 @@ fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationEr
            "router_prefill_load_model currently requires router_queue_policy='fcfs'",
        ));
    }
+    if config.use_remote_indexer && config.serve_indexer {
+        return Err(ValidationError::new(
+            "use_remote_indexer and serve_indexer are mutually exclusive",
+        ));
+    }
+    if config.serve_indexer && config.overlap_score_weight == 0.0 {
+        return Err(ValidationError::new(
+            "serve_indexer requires overlap_score_weight > 0",
+        ));
+    }
    Ok(())
 }

--- a/lib/kv-router/src/standalone_indexer/mod.rs
+++ b/lib/kv-router/src/standalone_indexer/mod.rs
@@ -6,8 +6,6 @@ pub mod listener;
 pub mod metrics;
 pub mod recovery;
 pub mod registry;
-#[cfg(feature = "indexer-runtime")]
-pub mod runtime;
 pub mod server;
 mod zmq;
@@ -31,13 +29,6 @@ pub struct IndexerConfig {
    pub peers: Option<String>,
 }
-#[cfg(feature = "indexer-runtime")]
-pub struct RuntimeConfig {
-    pub namespace: String,
-    pub component_name: String,
-    pub worker_component: String,
-}
 pub(super) fn validate_zmq_endpoint(endpoint: &str) -> anyhow::Result<()> {
    let (scheme, address) = endpoint
        .split_once("://")
@@ -155,81 +146,6 @@ pub async fn run_server(config: IndexerConfig) -> anyhow::Result<()> {
    run_common(&config, &registry, cancel_token).await
 }
-#[cfg(feature = "indexer-runtime")]
-pub async fn run_with_runtime(
-    runtime: dynamo_runtime::Runtime,
-    config: IndexerConfig,
-    runtime_config: RuntimeConfig,
-) -> anyhow::Result<()> {
-    use dynamo_runtime::{
-        DistributedRuntime,
-        pipeline::{ManyOut, SingleIn, network::Ingress},
-    };
-    use crate::indexer::{IndexerQueryRequest, IndexerQueryResponse, KV_INDEXER_QUERY_ENDPOINT};
-    let distributed_runtime = DistributedRuntime::from_settings(runtime).await?;
-    let cancel_token = distributed_runtime.primary_token();
-    let component = distributed_runtime
-        .namespace(&runtime_config.namespace)?
-        .component(&runtime_config.component_name)?;
-    tracing::info!(
-        namespace = %runtime_config.namespace,
-        component = %runtime_config.component_name,
-        block_size = ?config.block_size,
-        port = config.port,
-        threads = config.threads,
-        model_name = %config.model_name,
-        tenant_id = %config.tenant_id,
-        worker_component = %runtime_config.worker_component,
-        num_peers = config.peers.as_ref().map(|p| p.split(',').count()).unwrap_or(0),
-        "Starting standalone KV cache indexer (Dynamo runtime mode)"
-    );
-    let registry = Arc::new(WorkerRegistry::new(config.threads));
-    let engine = Arc::new(runtime::query_engine::IndexerQueryEngine {
-        registry: registry.clone(),
-    });
-    let ingress =
-        Ingress::<SingleIn<IndexerQueryRequest>, ManyOut<IndexerQueryResponse>>::for_engine(
-            engine,
-        )?;
-    let query_endpoint = component
-        .endpoint(KV_INDEXER_QUERY_ENDPOINT)
-        .endpoint_builder()
-        .handler(ingress)
-        .graceful_shutdown(true);
-    distributed_runtime.runtime().secondary().spawn(async move {
-        if let Err(err) = query_endpoint.start().await {
-            tracing::error!(error = %err, "Query endpoint failed");
-        }
-    });
-    tracing::info!(
-        endpoint = KV_INDEXER_QUERY_ENDPOINT,
-        "Query endpoint registered"
-    );
-    runtime::discovery::spawn_discovery_watcher(
-        &distributed_runtime,
-        registry.clone(),
-        cancel_token.clone(),
-    )
-    .await?;
-    runtime::subscriber::spawn_event_subscriber(
-        &distributed_runtime,
-        &runtime_config.namespace,
-        &runtime_config.worker_component,
-        registry.clone(),
-        cancel_token.clone(),
-    )
-    .await?;
-    run_common(&config, &registry, cancel_token).await
-}
 async fn wait_for_min_initial_workers(
    registry: &WorkerRegistry,
    cancel_token: &CancellationToken,

--- a/lib/kv-router/src/standalone_indexer/registry.rs
+++ b/lib/kv-router/src/standalone_indexer/registry.rs
@@ -314,8 +314,6 @@ pub struct WorkerRegistry {
    indexers: DashMap<IndexerKey, IndexerEntry>,
    peers: DashMap<String, ()>,
    watermarks: DashMap<(WorkerId, u32), Arc<AtomicU64>>,
-    #[cfg(feature = "indexer-runtime")]
-    discovered_workers: DashMap<WorkerId, IndexerKey>,
    num_threads: usize,
    ready_tx: watch::Sender<bool>,
    ready_rx: watch::Receiver<bool>,
@@ -329,8 +327,6 @@ impl WorkerRegistry {
            indexers: DashMap::new(),
            peers: DashMap::new(),
            watermarks: DashMap::new(),
-            #[cfg(feature = "indexer-runtime")]
-            discovered_workers: DashMap::new(),
            num_threads,
            ready_tx,
            ready_rx,
@@ -360,16 +356,7 @@ impl WorkerRegistry {
    #[cfg(feature = "metrics")]
    pub fn refresh_metrics(&self) {
        let models = self.indexers.len();
-        let workers = self.workers.len() + {
+        let workers = self.workers.len();
-            #[cfg(feature = "indexer-runtime")]
-            {
-                self.discovered_workers.len()
-            }
-            #[cfg(not(feature = "indexer-runtime"))]
-            {
-                0
-            }
-        };
        let mut listener_counts = [0_i64; 4];
        for entry in self.workers.iter() {
@@ -392,14 +379,6 @@ impl WorkerRegistry {
        block_size: u32,
        replay_endpoint: Option<String>,
    ) -> Result<()> {
-        #[cfg(feature = "indexer-runtime")]
-        if self.discovered_workers.contains_key(&instance_id) {
-            bail!(
-                "instance {instance_id} is already registered via discovery; \
-                 use the Dynamo runtime to manage it"
-            );
-        }
        let key = IndexerKey {
            model_name,
            tenant_id,
@@ -495,24 +474,10 @@ impl WorkerRegistry {
                    entry.key.tenant_id
                );
            }
-        } else {
-            #[cfg(feature = "indexer-runtime")]
-            if let Some(discovered_key) = self.discovered_workers.get(&instance_id) {
-                if discovered_key.value() != &key {
-                    bail!(
-                        "instance {instance_id} is registered for model={} tenant={}",
-                        discovered_key.value().model_name,
-                        discovered_key.value().tenant_id
-                    );
-                }
        } else {
            bail!("instance {instance_id} not found");
        }
-            #[cfg(not(feature = "indexer-runtime"))]
-            bail!("instance {instance_id} not found");
-        }
        if let Some((_, entry)) = self.workers.remove(&instance_id) {
            for record in entry.listeners.values() {
                if let Some(cancel_token) = record.take_cancel() {
@@ -522,11 +487,6 @@ impl WorkerRegistry {
            for &dp_rank in entry.listeners.keys() {
                self.watermarks.remove(&(instance_id, dp_rank));
            }
-        } else {
-            #[cfg(feature = "indexer-runtime")]
-            {
-                self.discovered_workers.remove(&instance_id);
-            }
        }
        if let Some(ie) = self.indexers.get(&key) {
@@ -602,21 +562,6 @@ impl WorkerRegistry {
            }
            entry.key.clone()
        } else {
-            #[cfg(feature = "indexer-runtime")]
-            if let Some(discovered_key) = self.discovered_workers.get(&instance_id) {
-                if discovered_key.value().model_name != model_name {
-                    bail!(
-                        "instance {instance_id} is registered for model={} tenant={}",
-                        discovered_key.value().model_name,
-                        discovered_key.value().tenant_id
-                    );
-                }
-                discovered_key.value().clone()
-            } else {
-                bail!("instance {instance_id} not found");
-            }
-            #[cfg(not(feature = "indexer-runtime"))]
            bail!("instance {instance_id} not found");
        };
@@ -629,11 +574,6 @@ impl WorkerRegistry {
            for &dp_rank in entry.listeners.keys() {
                self.watermarks.remove(&(instance_id, dp_rank));
            }
-        } else {
-            #[cfg(feature = "indexer-runtime")]
-            {
-                self.discovered_workers.remove(&instance_id);
-            }
        }
        if let Some(ie) = self.indexers.get(&key) {
@@ -656,11 +596,6 @@ impl WorkerRegistry {
                },
            )?
        } else {
-            #[cfg(feature = "indexer-runtime")]
-            if self.discovered_workers.contains_key(&instance_id) {
-                return Err(ListenerControlError::DiscoveryManaged { instance_id });
-            }
            return Err(ListenerControlError::WorkerNotFound { instance_id });
        };
@@ -683,11 +618,6 @@ impl WorkerRegistry {
                },
            )?
        } else {
-            #[cfg(feature = "indexer-runtime")]
-            if self.discovered_workers.contains_key(&instance_id) {
-                return Err(ListenerControlError::DiscoveryManaged { instance_id });
-            }
            return Err(ListenerControlError::WorkerNotFound { instance_id });
        };
@@ -724,21 +654,6 @@ impl WorkerRegistry {
            })
            .collect();
-        #[cfg(feature = "indexer-runtime")]
-        for entry in self.discovered_workers.iter() {
-            let worker_id = *entry.key();
-            if self.workers.contains_key(&worker_id) {
-                continue;
-            }
-            result.push(WorkerInfo {
-                instance_id: worker_id,
-                source: WorkerSource::Discovery,
-                status: ListenerStatus::Active,
-                endpoints: HashMap::new(),
-                listeners: HashMap::new(),
-            });
-        }
        result
    }
@@ -784,97 +699,6 @@ impl WorkerRegistry {
            .collect()
    }
-    #[cfg(feature = "indexer-runtime")]
-    pub fn add_worker_from_discovery(
-        &self,
-        instance_id: WorkerId,
-        model_name: String,
-        tenant_id: String,
-        block_size: u32,
-    ) -> Result<()> {
-        if self.workers.contains_key(&instance_id) {
-            bail!(
-                "instance {instance_id} is already manually registered; \
-                 cannot add via discovery"
-            );
-        }
-        let key = IndexerKey {
-            model_name,
-            tenant_id,
-        };
-        if let Some(existing) = self.discovered_workers.get(&instance_id) {
-            if existing.value() != &key {
-                bail!(
-                    "instance {instance_id} is already registered for model={} tenant={}",
-                    existing.value().model_name,
-                    existing.value().tenant_id
-                );
-            }
-            return Ok(());
-        }
-        let indexer_entry = self.indexers.entry(key.clone()).or_insert_with(|| {
-            tracing::info!(
-                model_name = %key.model_name,
-                tenant_id = %key.tenant_id,
-                block_size,
-                "Creating new indexer (discovery)"
-            );
-            IndexerEntry {
-                indexer: create_indexer(block_size, self.num_threads),
-                block_size,
-            }
-        });
-        if indexer_entry.block_size != block_size {
-            bail!(
-                "block_size mismatch for model={} tenant={}: existing={}, requested={}",
-                key.model_name,
-                key.tenant_id,
-                indexer_entry.block_size,
-                block_size
-            );
-        }
-        drop(indexer_entry);
-        self.discovered_workers.insert(instance_id, key);
-        Ok(())
-    }
-    #[cfg(feature = "indexer-runtime")]
-    pub async fn remove_worker_from_discovery(&self, instance_id: WorkerId) {
-        if let Some((_, key)) = self.discovered_workers.remove(&instance_id) {
-            if let Some(ie) = self.indexers.get(&key) {
-                ie.indexer.remove_worker(instance_id).await;
-            }
-            self.maybe_remove_indexer(&key);
-        } else {
-            tracing::debug!(
-                instance_id,
-                "remove_worker_from_discovery: worker not in discovered_workers map"
-            );
-        }
-    }
-    #[cfg(feature = "indexer-runtime")]
-    pub fn get_indexer_for_worker(&self, worker_id: WorkerId) -> Option<Indexer> {
-        if let Some(key) = self.discovered_workers.get(&worker_id)
-            && let Some(ie) = self.indexers.get(key.value())
-        {
-            return Some(ie.indexer.clone());
-        }
-        if let Some(entry) = self.workers.get(&worker_id)
-            && let Some(ie) = self.indexers.get(&entry.key)
-        {
-            return Some(ie.indexer.clone());
-        }
-        None
-    }
    fn spawn_listener(
        &self,
        instance_id: WorkerId,
@@ -897,15 +721,6 @@ impl WorkerRegistry {
            return;
        }
-        #[cfg(feature = "indexer-runtime")]
-        if self
-            .discovered_workers
-            .iter()
-            .any(|entry| entry.value() == key)
-        {
-            return;
-        }
        self.indexers.remove(key);
    }
 }

--- a/lib/kv-router/src/standalone_indexer/runtime/discovery.rs
+++ b/lib/kv-router/src/standalone_indexer/runtime/discovery.rs
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-use std::sync::Arc;
-use dynamo_runtime::stream::StreamExt;
-use dynamo_runtime::{
-    DistributedRuntime,
-    discovery::{
-        DiscoveryEvent, DiscoveryInstance, DiscoveryInstanceId, DiscoveryQuery, DiscoveryStream,
-    },
-};
-use serde::Deserialize;
-use tokio_util::sync::CancellationToken;
-use crate::standalone_indexer::registry::WorkerRegistry;
-#[derive(Deserialize, Debug)]
-struct PartialModelCard {
-    pub display_name: String,
-    #[serde(default)]
-    pub kv_cache_block_size: u32,
-}
-pub async fn spawn_discovery_watcher(
-    drt: &DistributedRuntime,
-    registry: Arc<WorkerRegistry>,
-    cancel_token: CancellationToken,
-) -> anyhow::Result<()> {
-    let discovery = drt.discovery();
-    let mut stream: DiscoveryStream = discovery
-        .list_and_watch(DiscoveryQuery::AllModels, Some(cancel_token.clone()))
-        .await?;
-    tokio::spawn(async move {
-        tracing::info!("Discovery watcher started");
-        while let Some(result) = stream.next().await {
-            let event = match result {
-                Ok(event) => event,
-                Err(err) => {
-                    tracing::error!(%err, "Error in discovery stream");
-                    continue;
-                }
-            };
-            match event {
-                DiscoveryEvent::Added(instance) => {
-                    let (instance_id, namespace, card) = match &instance {
-                        DiscoveryInstance::Model {
-                            instance_id,
-                            namespace,
-                            ..
-                        } => match instance.deserialize_model::<PartialModelCard>() {
-                            Ok(card) => (*instance_id, namespace.clone(), card),
-                            Err(err) => {
-                                tracing::error!(%err, instance_id, "Failed to deserialize model card");
-                                continue;
-                            }
-                        },
-                        _ => {
-                            tracing::debug!("Ignoring non-model discovery instance");
-                            continue;
-                        }
-                    };
-                    let model_name = card.display_name.clone();
-                    let block_size = card.kv_cache_block_size;
-                    let tenant_id = namespace;
-                    if block_size == 0 {
-                        tracing::warn!(
-                            instance_id,
-                            model_name,
-                            "Skipping worker with kv_cache_block_size=0"
-                        );
-                        continue;
-                    }
-                    tracing::info!(
-                        instance_id,
-                        model_name,
-                        tenant_id,
-                        block_size,
-                        "Discovery: adding worker"
-                    );
-                    if let Err(err) = registry.add_worker_from_discovery(
-                        instance_id,
-                        model_name.clone(),
-                        tenant_id,
-                        block_size,
-                    ) {
-                        tracing::error!(
-                            instance_id,
-                            model_name,
-                            error = %err,
-                            "Failed to add discovered worker"
-                        );
-                    }
-                }
-                DiscoveryEvent::Removed(id) => {
-                    let instance_id = match &id {
-                        DiscoveryInstanceId::Model(mcid) => mcid.instance_id,
-                        _ => {
-                            tracing::debug!("Ignoring non-model discovery removal");
-                            continue;
-                        }
-                    };
-                    tracing::info!(instance_id, "Discovery: removing worker");
-                    registry.remove_worker_from_discovery(instance_id).await;
-                }
-            }
-        }
-        tracing::info!("Discovery watcher exiting");
-    });
-    Ok(())
-}