"ssh:/git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "78dcf56cb31f8abb7d9ae3ba02ce9822eba34820"
Unverified Commit 49eb397a authored by Yan Ru Pei, committed by GitHub
Browse files

feat(kv-router): split Dynamo-native remote indexer [DYN-2593] (#7973)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent d232b450
...@@ -36,7 +36,8 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = ( ...@@ -36,7 +36,8 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
"router_queue_threshold", "router_queue_threshold",
"router_event_threads", "router_event_threads",
"router_queue_policy", "router_queue_policy",
"remote_indexer_component", "use_remote_indexer",
"serve_indexer",
) )
...@@ -61,7 +62,8 @@ class KvRouterConfigBase(ConfigBase): ...@@ -61,7 +62,8 @@ class KvRouterConfigBase(ConfigBase):
router_queue_threshold: Optional[float] router_queue_threshold: Optional[float]
router_event_threads: int router_event_threads: int
router_queue_policy: str router_queue_policy: str
remote_indexer_component: Optional[str] use_remote_indexer: bool = False
serve_indexer: bool = False
def kv_router_kwargs(self) -> dict: def kv_router_kwargs(self) -> dict:
"""Return a dict suitable for ``KvRouterConfig(**kwargs)``.""" """Return a dict suitable for ``KvRouterConfig(**kwargs)``."""
...@@ -286,15 +288,14 @@ class KvRouterArgGroup(ArgGroup): ...@@ -286,15 +288,14 @@ class KvRouterArgGroup(ArgGroup):
arg_type=str, arg_type=str,
choices=["fcfs", "wspt"], choices=["fcfs", "wspt"],
) )
add_argument( add_negatable_bool_argument(
g, g,
flag_name="--remote-indexer-component", flag_name="--use-remote-indexer",
env_var="DYN_REMOTE_INDEXER_COMPONENT", env_var="DYN_USE_REMOTE_INDEXER",
default=None, default=False,
help=( help=(
"[EXPERIMENTAL] KV Router: Component name of a standalone KV indexer to use for overlap scoring. " "[EXPERIMENTAL] KV Router: Query a remote KV indexer served from the worker "
"When set, the router queries the standalone indexer via the request plane instead " "component via the request plane instead of maintaining a local radix tree."
"of maintaining a local radix tree (e.g. 'kv-indexer')."
), ),
arg_type=str, dest="use_remote_indexer",
) )
...@@ -130,6 +130,13 @@ class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase): ...@@ -130,6 +130,13 @@ class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase):
"--router-prefill-load-model=aic requires " "--router-prefill-load-model=aic requires "
"--router-track-prefill-tokens" "--router-track-prefill-tokens"
) )
if self.serve_indexer:
if self.router_mode != "kv":
raise ValueError("--serve-indexer requires --router-mode=kv")
if self.use_remote_indexer:
raise ValueError(
"--serve-indexer and --use-remote-indexer are mutually exclusive"
)
@register_encoder(FrontendConfig) @register_encoder(FrontendConfig)
...@@ -193,6 +200,14 @@ class FrontendArgGroup(ArgGroup): ...@@ -193,6 +200,14 @@ class FrontendArgGroup(ArgGroup):
help="HTTP port for the engine (u16).", help="HTTP port for the engine (u16).",
arg_type=int, arg_type=int,
) )
add_negatable_bool_argument(
g,
flag_name="--serve-indexer",
env_var="DYN_SERVE_INDEXER",
default=False,
help="Serve this frontend's local KV indexers over the request plane.",
dest="serve_indexer",
)
add_argument( add_argument(
g, g,
flag_name="--tls-cert-path", flag_name="--tls-cert-path",
......
...@@ -15,7 +15,7 @@ from dynamo.common.configuration.groups.kv_router_args import ( ...@@ -15,7 +15,7 @@ from dynamo.common.configuration.groups.kv_router_args import (
KvRouterArgGroup, KvRouterArgGroup,
KvRouterConfigBase, KvRouterConfigBase,
) )
from dynamo.common.configuration.utils import add_argument from dynamo.common.configuration.utils import add_argument, add_negatable_bool_argument
from dynamo.llm import AicPerfConfig, KvRouterConfig from dynamo.llm import AicPerfConfig, KvRouterConfig
...@@ -25,6 +25,7 @@ class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase): ...@@ -25,6 +25,7 @@ class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase):
namespace: str namespace: str
endpoint: str endpoint: str
router_block_size: int router_block_size: int
serve_indexer: bool = False
def validate(self) -> None: def validate(self) -> None:
"""Validate config invariants (aligned with Rust KvRouterConfig where applicable).""" """Validate config invariants (aligned with Rust KvRouterConfig where applicable)."""
...@@ -40,6 +41,10 @@ class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase): ...@@ -40,6 +41,10 @@ class DynamoRouterConfig(KvRouterConfigBase, AicPerfConfigBase):
"Expected format: namespace.component.endpoint" "Expected format: namespace.component.endpoint"
) )
self.namespace = parts[0] self.namespace = parts[0]
if self.serve_indexer and self.use_remote_indexer:
raise ValueError(
"--serve-indexer and --use-remote-indexer are mutually exclusive"
)
if self.router_prefill_load_model == "aic": if self.router_prefill_load_model == "aic":
missing = [ missing = [
flag flag
...@@ -89,6 +94,15 @@ class DynamoRouterArgGroup(ArgGroup): ...@@ -89,6 +94,15 @@ class DynamoRouterArgGroup(ArgGroup):
obsolete_flag="--block-size", obsolete_flag="--block-size",
) )
add_negatable_bool_argument(
g,
flag_name="--serve-indexer",
env_var="DYN_SERVE_INDEXER",
default=False,
help="Serve this router's local KV indexer over the request plane.",
dest="serve_indexer",
)
# KV router options (shared with dynamo.frontend) # KV router options (shared with dynamo.frontend)
KvRouterArgGroup().add_arguments(parser) KvRouterArgGroup().add_arguments(parser)
AicPerfArgGroup().add_arguments(parser) AicPerfArgGroup().add_arguments(parser)
......
...@@ -439,9 +439,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \ ...@@ -439,9 +439,9 @@ RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
uv build --wheel --out-dir /opt/dynamo/dist && \ uv build --wheel --out-dir /opt/dynamo/dist && \
cd /opt/dynamo/lib/bindings/python && \ cd /opt/dynamo/lib/bindings/python && \
if [ "$ENABLE_MEDIA_FFMPEG" = "true" ]; then \ if [ "$ENABLE_MEDIA_FFMPEG" = "true" ]; then \
maturin build --release --features "media-ffmpeg,kv-indexer,kv-indexer-runtime" --out /opt/dynamo/dist; \ maturin build --release --features "media-ffmpeg,kv-indexer" --out /opt/dynamo/dist; \
else \ else \
maturin build --release --features "kv-indexer,kv-indexer-runtime" --out /opt/dynamo/dist; \ maturin build --release --features "kv-indexer" --out /opt/dynamo/dist; \
fi && \ fi && \
/tmp/use-sccache.sh show-stats "Dynamo Runtime" /tmp/use-sccache.sh show-stats "Dynamo Runtime"
......
...@@ -42,7 +42,7 @@ When using KV routing, the router needs to know what each worker has cached. The ...@@ -42,7 +42,7 @@ When using KV routing, the router needs to know what each worker has cached. The
|------------|---------------|-------------| |------------|---------------|-------------|
| **NATS Core (local indexer)** | Default (no extra flags) | Workers maintain a local indexer; router queries workers on startup and receives events via NATS Core | | **NATS Core (local indexer)** | Default (no extra flags) | Workers maintain a local indexer; router queries workers on startup and receives events via NATS Core |
| **JetStream (durable)** | `--router-durable-kv-events` | Events persisted in NATS JetStream; supports snapshots and durable consumers. *Deprecated.* | | **JetStream (durable)** | `--router-durable-kv-events` | Events persisted in NATS JetStream; supports snapshots and durable consumers. *Deprecated.* |
| **ZMQ** | `--event-plane zmq` | Workers publish via ZMQ PUB sockets; standalone indexer aggregates events | | **ZMQ** | `--event-plane zmq` | Workers publish via ZMQ PUB sockets; the standalone `dynamo.indexer` service aggregates events |
| **Approximate (no events)** | `--no-router-kv-events` | No events consumed; router predicts cache state from its own routing decisions with TTL-based expiration | | **Approximate (no events)** | `--no-router-kv-events` | No events consumed; router predicts cache state from its own routing decisions with TTL-based expiration |
### Aggregated vs. Disaggregated Topology ### Aggregated vs. Disaggregated Topology
...@@ -93,6 +93,8 @@ Backend workers register themselves using the `register_model` API, after which ...@@ -93,6 +93,8 @@ Backend workers register themselves using the `register_model` API, after which
| `--router-prefill-load-model <none\|aic>` | `none` | Prompt-side load model. `aic` decays only the oldest active prefill using an AIC-predicted duration | | `--router-prefill-load-model <none\|aic>` | `none` | Prompt-side load model. `aic` decays only the oldest active prefill using an AIC-predicted duration |
| `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` | | `--router-queue-threshold <float>` | `4.0` | Queue threshold fraction; enables priority scheduling via `priority` |
| `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) | | `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT), `wspt` (avg TTFT), or `lcfs` (comparison-only reverse ordering) |
| `--serve-indexer` | `false` | Serve the Dynamo-native remote indexer from this frontend/router on the worker component |
| `--use-remote-indexer` | `false` | Query the worker component's served remote indexer instead of maintaining a local overlap indexer |
For all available options: `python -m dynamo.frontend --help` For all available options: `python -m dynamo.frontend --help`
...@@ -444,6 +446,63 @@ graph TD ...@@ -444,6 +446,63 @@ graph TD
For improved fault tolerance, you can launch multiple frontend + router replicas. If multiple `dynamo.frontend` processes share the same host or network namespace, give each instance a different HTTP port. In Kubernetes or on separate hosts, replicas can usually reuse the same container port. Alternatively, you can deploy the router separately as the standalone `python -m dynamo.router` service; see the [Standalone Router README](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/router/README.md). For improved fault tolerance, you can launch multiple frontend + router replicas. If multiple `dynamo.frontend` processes share the same host or network namespace, give each instance a different HTTP port. In Kubernetes or on separate hosts, replicas can usually reuse the same container port. Alternatively, you can deploy the router separately as the standalone `python -m dynamo.router` service; see the [Standalone Router README](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/router/README.md).
### Dynamo-Native Remote Indexer
For Dynamo-native deployments, the remote indexer is served by `dynamo.frontend` or `dynamo.router`, not by `dynamo.indexer`.
- Use `--serve-indexer` on router/frontend replicas that should expose the `kv_indexer_query` endpoint on the worker component.
- Use `--use-remote-indexer` on consumer routers/frontends that should query that served endpoint instead of maintaining a local overlap indexer.
- `dynamo.indexer` remains the standalone HTTP + ZMQ microservice for non-Dynamo / direct-ZMQ deployments.
Frontend example:
```bash
# Serving frontend (anchor): serves its local KV indexers over the request plane
python -m dynamo.frontend --router-mode kv --serve-indexer
# Consumer frontend
python -m dynamo.frontend --router-mode kv --use-remote-indexer
```
The served indexer is request-plane only. Each serving router/frontend keeps its normal local KV event ingestion, gap detection, and worker-query recovery path; remote consumers only issue hash-based overlap queries.
Approximate mode (`--no-router-kv-events`) is singleton-only for remote serving: only one `--serve-indexer` replica may exist for a given worker component. Event-driven mode allows multiple serving replicas behind the same worker component.
```mermaid
graph TD
subgraph "Workers"
W1["Worker 1"]
W2["Worker 2"]
end
subgraph "Event Plane"
EP["KV Events"]
end
subgraph "Serving Routers / Frontends"
S1["Router / Frontend A<br/>--serve-indexer"]
S2["Router / Frontend B<br/>--serve-indexer"]
I1["Local Indexer"]
I2["Local Indexer"]
end
subgraph "Request Plane"
RP["backend.kv_indexer_query"]
end
C["Consumer Router / Frontend<br/>--use-remote-indexer"]
W1 --> EP
W2 --> EP
EP --> S1
EP --> S2
S1 --> I1
S2 --> I2
C --> RP
RP --> S1
RP --> S2
```
### Router State Management ### Router State Management
The KV Router tracks two types of state (see [Router Design](../../design-docs/router-design.md) for details): The KV Router tracks two types of state (see [Router Design](../../design-docs/router-design.md) for details):
......
...@@ -7,13 +7,16 @@ subtitle: Run the KV cache indexer as an independent HTTP service for querying b ...@@ -7,13 +7,16 @@ subtitle: Run the KV cache indexer as an independent HTTP service for querying b
## Overview ## Overview
The standalone KV indexer (`python -m dynamo.indexer`) is a lightweight service that maintains a radix tree of cached blocks and exposes HTTP endpoints for querying and managing workers. It supports two operational modes: The standalone KV indexer (`python -m dynamo.indexer`) is a lightweight service that maintains a radix tree of cached blocks and exposes HTTP endpoints for querying and managing workers.
- **Standalone mode** (default): subscribes to ZMQ KV event streams directly from workers. No Dynamo runtime discovery, registration, or event-plane integration required. - It subscribes to ZMQ KV event streams directly from workers.
- **Dynamo runtime mode** (`--dynamo-runtime`): integrates with the Dynamo runtime for automatic worker discovery via MDC, KV event ingestion via the event plane (NATS or ZMQ), and overlap queries over the request plane for remote frontends. - It exposes an HTTP API for registration, inspection, and overlap queries.
- It preserves P2P recovery and gap detection/replay for the standalone ZMQ path.
This is distinct from the [Standalone Router](../../../components/src/dynamo/router/README.md), which is a full routing service. The standalone indexer provides only the indexing and query layer without routing logic. This is distinct from the [Standalone Router](../../../components/src/dynamo/router/README.md), which is a full routing service. The standalone indexer provides only the indexing and query layer without routing logic.
For Dynamo-native remote indexing, use `--serve-indexer` on `dynamo.frontend` or `dynamo.router` and `--use-remote-indexer` on consumers instead. That request-plane service reuses the router's existing event ingestion and recovery machinery; it is not implemented by `dynamo.indexer`.
The HTTP API follows the [Mooncake KV Indexer RFC](https://github.com/kvcache-ai/Mooncake/issues/1403) conventions. The HTTP API follows the [Mooncake KV Indexer RFC](https://github.com/kvcache-ai/Mooncake/issues/1403) conventions.
`DYN_ROUTER_MIN_INITIAL_WORKERS` is also honored here. When set to a positive integer, the `DYN_ROUTER_MIN_INITIAL_WORKERS` is also honored here. When set to a positive integer, the
...@@ -30,9 +33,7 @@ The indexer maintains one radix tree per `(model_name, tenant_id)` pair. Workers ...@@ -30,9 +33,7 @@ The indexer maintains one radix tree per `(model_name, tenant_id)` pair. Workers
## Compatibility ## Compatibility
In standalone mode, the indexer works with any engine that publishes KV cache events over ZMQ in the expected msgpack format. This includes bare vLLM and SGLang engines, which emit ZMQ KV events natively — no Dynamo-specific wrapper is required. The standalone indexer works with any engine that publishes KV cache events over ZMQ in the expected msgpack format. This includes bare vLLM and SGLang engines, which emit ZMQ KV events natively — no Dynamo-specific wrapper is required.
In Dynamo runtime mode, the indexer discovers workers automatically via MDC and receives KV events through the event plane. It also registers a query endpoint on the request plane, allowing frontends to query overlap scores remotely without needing direct HTTP access.
## Use Cases ## Use Cases
...@@ -40,7 +41,7 @@ In Dynamo runtime mode, the indexer discovers workers automatically via MDC and ...@@ -40,7 +41,7 @@ In Dynamo runtime mode, the indexer discovers workers automatically via MDC and
- **State verification**: Confirm that the indexer's view of KV cache state matches the router's internal state (used in integration tests). - **State verification**: Confirm that the indexer's view of KV cache state matches the router's internal state (used in integration tests).
- **Custom routing**: Build external routing logic that queries the indexer for overlap scores and makes its own worker selection decisions. - **Custom routing**: Build external routing logic that queries the indexer for overlap scores and makes its own worker selection decisions.
- **Monitoring**: Observe KV cache distribution across workers without running a full router. - **Monitoring**: Observe KV cache distribution across workers without running a full router.
- **Remote indexing**: In Dynamo runtime mode, frontends can offload KV cache indexing to a dedicated service and query it over the request plane. - **Standalone microservice**: Run an indexer independently of the router/frontend when you want direct HTTP inspection and ZMQ-based ingestion.
## P2P Recovery ## P2P Recovery
...@@ -91,7 +92,6 @@ The service is exposed through the Python bindings package and launched with `py ...@@ -91,7 +92,6 @@ The service is exposed through the Python bindings package and launched with `py
|---------|-------------| |---------|-------------|
| `kv-indexer` | Core standalone indexer service path (`python -m dynamo.indexer`: HTTP API, ZMQ listeners, P2P recovery) | | `kv-indexer` | Core standalone indexer service path (`python -m dynamo.indexer`: HTTP API, ZMQ listeners, P2P recovery) |
| `kv-indexer-metrics` | Optional `/metrics` endpoint | | `kv-indexer-metrics` | Optional `/metrics` endpoint |
| `kv-indexer-runtime` | Dynamo runtime integration (`--dynamo-runtime`, discovery, event plane, request plane) |
### Standalone build ### Standalone build
...@@ -109,30 +109,12 @@ cd lib/bindings/python && VIRTUAL_ENV=../../.venv ../../.venv/bin/maturin develo ...@@ -109,30 +109,12 @@ cd lib/bindings/python && VIRTUAL_ENV=../../.venv ../../.venv/bin/maturin develo
This keeps the default `kv-indexer` build lean while still allowing Prometheus metrics when needed. This keeps the default `kv-indexer` build lean while still allowing Prometheus metrics when needed.
### Runtime-enabled build
```bash
cd lib/bindings/python && VIRTUAL_ENV=../../.venv ../../.venv/bin/maturin develop --uv --features kv-indexer,kv-indexer-runtime
```
This enables the `--dynamo-runtime` CLI flag for MDC discovery, event-plane subscription, and request-plane queries. It also includes the metrics endpoint.
## CLI ## CLI
### Standalone mode (default)
```bash ```bash
python -m dynamo.indexer --port 8090 [--threads 4] [--block-size 16 --model-name my-model --tenant-id default --workers "1=tcp://host:5557,2:1=tcp://host:5558"] [--peers "http://peer1:8090,http://peer2:8091"] python -m dynamo.indexer --port 8090 [--threads 4] [--block-size 16 --model-name my-model --tenant-id default --workers "1=tcp://host:5557,2:1=tcp://host:5558"] [--peers "http://peer1:8090,http://peer2:8091"]
``` ```
### Dynamo runtime mode
```bash
python -m dynamo.indexer --dynamo-runtime --namespace default --component-name kv-indexer --worker-component backend --port 8090 [--threads 4]
```
In runtime mode, workers are discovered automatically via MDC. The `--workers` flag can still be used to register additional static workers alongside discovered ones.
| Flag | Default | Description | | Flag | Default | Description |
|------|---------|-------------| |------|---------|-------------|
| `--block-size` | (none) | KV cache block size for initial `--workers` (required when `--workers` is set) | | `--block-size` | (none) | KV cache block size for initial `--workers` (required when `--workers` is set) |
...@@ -142,10 +124,6 @@ In runtime mode, workers are discovered automatically via MDC. The `--workers` f ...@@ -142,10 +124,6 @@ In runtime mode, workers are discovered automatically via MDC. The `--workers` f
| `--model-name` | `default` | Model name for initial `--workers` | | `--model-name` | `default` | Model name for initial `--workers` |
| `--tenant-id` | `default` | Tenant ID for initial `--workers` | | `--tenant-id` | `default` | Tenant ID for initial `--workers` |
| `--peers` | (none) | Comma-separated peer indexer URLs for P2P recovery on startup | | `--peers` | (none) | Comma-separated peer indexer URLs for P2P recovery on startup |
| `--dynamo-runtime` | `false` | Enable Dynamo runtime integration (requires `kv-indexer-runtime`) |
| `--namespace` | `default` | Dynamo namespace to register the indexer component under |
| `--component-name` | `kv-indexer` | Component name for this indexer in the Dynamo runtime |
| `--worker-component` | `backend` | Component name that workers register under for event-plane subscription |
### Shared Startup Gate ### Shared Startup Gate
...@@ -165,7 +143,7 @@ curl http://localhost:8090/health ...@@ -165,7 +143,7 @@ curl http://localhost:8090/health
### `GET /metrics` — Prometheus metrics ### `GET /metrics` — Prometheus metrics
Returns metrics in Prometheus text exposition format. Available when the Python bindings are built with the `kv-indexer-metrics` or `kv-indexer-runtime` feature. Returns metrics in Prometheus text exposition format. Available when the Python bindings are built with the `kv-indexer-metrics` feature.
```bash ```bash
curl http://localhost:8090/metrics curl http://localhost:8090/metrics
...@@ -400,38 +378,9 @@ If no `replay_endpoint` is configured, gaps are logged as warnings but not recov ...@@ -400,38 +378,9 @@ If no `replay_endpoint` is configured, gaps are logged as warnings but not recov
The sequence counter (`last_seq`) persists across unregister/register cycles, so re-registering a worker after a gap will trigger replay on the first batch received by the new listener. The sequence counter (`last_seq`) persists across unregister/register cycles, so re-registering a worker after a gap will trigger replay on the first batch received by the new listener.
## Dynamo Runtime Mode
When started with `--dynamo-runtime`, the indexer integrates with the Dynamo distributed runtime:
### Worker Discovery
The indexer watches MDC (Model Discovery Catalog) for worker additions and removals. When a worker registers with MDC, the indexer automatically creates an indexer for its model and block size. Workers discovered via MDC are tracked separately from those registered via `--workers` or the `/register` HTTP API; a worker cannot be registered through both paths simultaneously.
### Event Plane Subscription
Instead of connecting directly to ZMQ PUB sockets on each worker, the indexer subscribes to KV events through the Dynamo event plane. The transport (NATS or ZMQ) is determined by the `DYNAMO_EVENT_TRANSPORT` environment variable. Events are routed to the appropriate indexer based on the worker ID.
### Request Plane Query Endpoint
The indexer registers a query endpoint on the Dynamo request plane, allowing frontends to send `IndexerQueryRequest` messages containing a model name, namespace, and block hashes. The indexer looks up the appropriate radix tree and returns overlap scores. This enables frontends to use a remote indexer for KV-aware routing without direct HTTP access.
### Example
```bash
# Start the indexer with runtime integration
python -m dynamo.indexer --dynamo-runtime \
--namespace my-namespace \
--component-name kv-indexer \
--worker-component backend \
--port 8090 --threads 4
```
The HTTP API remains fully available in runtime mode. Static workers can be added via `--workers` alongside discovered workers.
## Limitations ## Limitations
- **Standalone mode is ZMQ only**: In standalone mode, workers must publish KV events via ZMQ PUB sockets. Build with `kv-indexer-runtime` and use `--dynamo-runtime` to receive events via the event plane (NATS or ZMQ). - **Standalone mode is ZMQ only**: Workers must publish KV events via ZMQ PUB sockets.
- **No routing logic**: The indexer only maintains the radix tree and answers queries. It does not track active blocks, manage request lifecycle, or perform worker selection. - **No routing logic**: The indexer only maintains the radix tree and answers queries. It does not track active blocks, manage request lifecycle, or perform worker selection.
## Architecture ## Architecture
...@@ -471,62 +420,6 @@ graph TD ...@@ -471,62 +420,6 @@ graph TD
style CLIENT fill:#fff3e0,stroke:#333,color:#333 style CLIENT fill:#fff3e0,stroke:#333,color:#333
``` ```
### Dynamo Runtime Mode
```mermaid
graph TD
subgraph Workers
W1[Worker 1]
W2[Worker 2]
end
subgraph "Dynamo Runtime"
MDC[MDC Discovery]
EP[Event Plane<br/>NATS / ZMQ]
RP[Request Plane]
end
subgraph "Standalone Indexer"
DISC[Discovery Watcher]
SUB[Event Subscriber]
REG[Worker Registry]
IDX["Indexer Map<br/>(model, tenant) → Radix Tree"]
QE[Query Endpoint]
HTTP[HTTP API<br/>/query /dump /register /metrics]
end
FRONTEND[Frontend / Router]
CLIENT[External Client]
W1 -->|register| MDC
W2 -->|register| MDC
MDC -->|added/removed| DISC
DISC -->|add/remove workers| REG
W1 -->|KV events| EP
W2 -->|KV events| EP
EP -->|RouterEvent| SUB
SUB -->|apply events| IDX
FRONTEND -->|IndexerQueryRequest| RP
RP --> QE
QE -->|query| IDX
CLIENT -->|POST /query, GET /dump| HTTP
HTTP -->|query| IDX
style W1 fill:#f3e5f5,stroke:#333,color:#333
style W2 fill:#f3e5f5,stroke:#333,color:#333
style MDC fill:#e3f2fd,stroke:#333,color:#333
style EP fill:#e3f2fd,stroke:#333,color:#333
style RP fill:#e3f2fd,stroke:#333,color:#333
style IDX fill:#2e8b57,stroke:#333,color:#fff
style SUB fill:#2e8b57,stroke:#333,color:#fff
style DISC fill:#2e8b57,stroke:#333,color:#fff
style REG fill:#2e8b57,stroke:#333,color:#fff
style QE fill:#2e8b57,stroke:#333,color:#fff
style HTTP fill:#2e8b57,stroke:#333,color:#fff
style FRONTEND fill:#fff3e0,stroke:#333,color:#333
style CLIENT fill:#fff3e0,stroke:#333,color:#333
```
### P2P Recovery Flow ### P2P Recovery Flow
```mermaid ```mermaid
......
...@@ -25,7 +25,6 @@ crate-type = ["cdylib", "rlib"] ...@@ -25,7 +25,6 @@ crate-type = ["cdylib", "rlib"]
default = [] default = []
media-ffmpeg = ["dynamo-llm/media-ffmpeg"] media-ffmpeg = ["dynamo-llm/media-ffmpeg"]
kv-indexer = ["dep:clap", "dep:tracing-subscriber"] kv-indexer = ["dep:clap", "dep:tracing-subscriber"]
kv-indexer-runtime = ["kv-indexer", "dynamo-kv-router/indexer-runtime"]
kv-indexer-metrics = ["kv-indexer", "dynamo-kv-router/metrics"] kv-indexer-metrics = ["kv-indexer", "dynamo-kv-router/metrics"]
nvtx = ["dynamo-runtime/nvtx"] nvtx = ["dynamo-runtime/nvtx"]
......
...@@ -126,7 +126,7 @@ impl AicPerfConfig { ...@@ -126,7 +126,7 @@ impl AicPerfConfig {
#[pymethods] #[pymethods]
impl KvRouterConfig { impl KvRouterConfig {
#[new] #[new]
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_prefill_load_model="none", router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_queue_policy="fcfs", remote_indexer_component=None))] #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_prefill_load_model="none", router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_queue_policy="fcfs", use_remote_indexer=false, serve_indexer=false))]
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
fn new( fn new(
overlap_score_weight: f64, overlap_score_weight: f64,
...@@ -147,7 +147,8 @@ impl KvRouterConfig { ...@@ -147,7 +147,8 @@ impl KvRouterConfig {
router_queue_threshold: Option<f64>, router_queue_threshold: Option<f64>,
router_event_threads: u32, router_event_threads: u32,
router_queue_policy: &str, router_queue_policy: &str,
remote_indexer_component: Option<String>, use_remote_indexer: bool,
serve_indexer: bool,
) -> Self { ) -> Self {
KvRouterConfig { KvRouterConfig {
inner: RsKvRouterConfig { inner: RsKvRouterConfig {
...@@ -176,7 +177,8 @@ impl KvRouterConfig { ...@@ -176,7 +177,8 @@ impl KvRouterConfig {
router_queue_policy: router_queue_policy.parse().unwrap_or_else(|_| { router_queue_policy: router_queue_policy.parse().unwrap_or_else(|_| {
panic!("invalid router_queue_policy: {router_queue_policy:?}") panic!("invalid router_queue_policy: {router_queue_policy:?}")
}), }),
remote_indexer_component, use_remote_indexer,
serve_indexer,
}, },
} }
} }
......
...@@ -16,8 +16,6 @@ use clap::Parser; ...@@ -16,8 +16,6 @@ use clap::Parser;
use dynamo_kv_router::config::{KvRouterConfig, RouterConfigOverride}; use dynamo_kv_router::config::{KvRouterConfig, RouterConfigOverride};
use dynamo_kv_router::protocols::compute_block_hash_for_seq; use dynamo_kv_router::protocols::compute_block_hash_for_seq;
use dynamo_kv_router::protocols::*; use dynamo_kv_router::protocols::*;
#[cfg(feature = "kv-indexer-runtime")]
use dynamo_kv_router::standalone_indexer::RuntimeConfig;
#[cfg(feature = "kv-indexer")] #[cfg(feature = "kv-indexer")]
use dynamo_kv_router::standalone_indexer::{self, IndexerConfig}; use dynamo_kv_router::standalone_indexer::{self, IndexerConfig};
use rs::pipeline::{AsyncEngine, SingleIn}; use rs::pipeline::{AsyncEngine, SingleIn};
...@@ -71,26 +69,6 @@ struct KvIndexerCli { ...@@ -71,26 +69,6 @@ struct KvIndexerCli {
/// Comma-separated peer URLs for P2P recovery (e.g. "http://host1:8090,http://host2:8091") /// Comma-separated peer URLs for P2P recovery (e.g. "http://host1:8090,http://host2:8091")
#[arg(long)] #[arg(long)]
peers: Option<String>, peers: Option<String>,
/// Enable Dynamo runtime integration (discovery, event plane, request plane).
#[cfg(feature = "kv-indexer-runtime")]
#[arg(long)]
dynamo_runtime: bool,
/// Dynamo namespace to register the indexer component under.
#[cfg(feature = "kv-indexer-runtime")]
#[arg(long, default_value = "default")]
namespace: String,
/// Component name for this indexer in the Dynamo runtime.
#[cfg(feature = "kv-indexer-runtime")]
#[arg(long, default_value = "kv-indexer")]
component_name: String,
/// Component name that workers register under.
#[cfg(feature = "kv-indexer-runtime")]
#[arg(long, default_value = "backend")]
worker_component: String,
} }
pub fn run_kv_indexer_cli<I, T>(args: I) -> anyhow::Result<()> pub fn run_kv_indexer_cli<I, T>(args: I) -> anyhow::Result<()>
...@@ -105,31 +83,6 @@ where ...@@ -105,31 +83,6 @@ where
.chain(args.into_iter().map(Into::into)), .chain(args.into_iter().map(Into::into)),
)?; )?;
#[cfg(feature = "kv-indexer-runtime")]
if cli.dynamo_runtime {
dynamo_runtime::logging::init();
let worker = dynamo_runtime::Worker::from_settings()?;
return worker.execute(move |runtime| {
standalone_indexer::run_with_runtime(
runtime,
IndexerConfig {
block_size: cli.block_size,
port: cli.port,
threads: cli.threads,
workers: cli.workers,
model_name: cli.model_name,
tenant_id: cli.tenant_id,
peers: cli.peers,
},
RuntimeConfig {
namespace: cli.namespace,
component_name: cli.component_name,
worker_component: cli.worker_component,
},
)
});
}
init_standalone_logging(); init_standalone_logging();
let rt = tokio::runtime::Runtime::new()?; let rt = tokio::runtime::Runtime::new()?;
...@@ -732,11 +685,11 @@ async fn create_kv_router_from_endpoint( ...@@ -732,11 +685,11 @@ async fn create_kv_router_from_endpoint(
llm_rs::discovery::WORKER_TYPE_DECODE llm_rs::discovery::WORKER_TYPE_DECODE
}; };
// Query discovery once so we can derive both model_name (for remote indexer) // Query discovery once so we can derive both model_name (for remote/served indexer)
// and Eagle routing semantics from the model card. // and Eagle routing semantics from the model card.
let needs_model_name = kv_router_config let needs_model_name = kv_router_config
.as_ref() .as_ref()
.map(|cfg| cfg.remote_indexer_component.is_some()) .map(|cfg| cfg.use_remote_indexer || cfg.serve_indexer)
.unwrap_or(false); .unwrap_or(false);
let (model_name, enable_eagle) = { let (model_name, enable_eagle) = {
let discovery = endpoint.inner.component().drt().discovery(); let discovery = endpoint.inner.component().drt().discovery();
......
...@@ -287,6 +287,10 @@ class router: ...@@ -287,6 +287,10 @@ class router:
# Total number of requests processed by the router # Total number of requests processed by the router
REQUESTS_TOTAL = "router_requests_total" REQUESTS_TOTAL = "router_requests_total"
# Total number of remote indexer overlap queries that failed
REMOTE_INDEXER_QUERY_FAILURES_TOTAL = "router_remote_indexer_query_failures_total"
# Total number of remote indexer routing-decision writes that failed
REMOTE_INDEXER_WRITE_FAILURES_TOTAL = "router_remote_indexer_write_failures_total"
# Time to first token observed at the router (seconds) # Time to first token observed at the router (seconds)
TIME_TO_FIRST_TOKEN_SECONDS = "router_time_to_first_token_seconds" TIME_TO_FIRST_TOKEN_SECONDS = "router_time_to_first_token_seconds"
# Average inter-token latency observed at the router (seconds) # Average inter-token latency observed at the router (seconds)
......
...@@ -86,7 +86,7 @@ def _router_config_payload(): ...@@ -86,7 +86,7 @@ def _router_config_payload():
"router_prune_target_ratio": 0.8, "router_prune_target_ratio": 0.8,
"router_enable_cache_control": False, "router_enable_cache_control": False,
"skip_initial_worker_wait": False, "skip_initial_worker_wait": False,
"remote_indexer_component": None, "use_remote_indexer": False,
} }
......
...@@ -18,7 +18,6 @@ metrics = ["dep:prometheus"] ...@@ -18,7 +18,6 @@ metrics = ["dep:prometheus"]
runtime-protocols = ["dep:dynamo-runtime"] runtime-protocols = ["dep:dynamo-runtime"]
bench = [] bench = []
standalone-indexer = ["dep:axum", "dep:serde_json", "dep:reqwest", "dep:zmq"] standalone-indexer = ["dep:axum", "dep:serde_json", "dep:reqwest", "dep:zmq"]
indexer-runtime = ["metrics", "runtime-protocols", "standalone-indexer"]
[dependencies] [dependencies]
# repo # repo
......
...@@ -510,7 +510,7 @@ impl KvIndexerInterface for KvIndexer { ...@@ -510,7 +510,7 @@ impl KvIndexerInterface for KvIndexer {
let local_hashes = tokens_with_hashes.get_or_compute_block_hashes().to_vec(); let local_hashes = tokens_with_hashes.get_or_compute_block_hashes().to_vec();
let sequence_hashes = tokens_with_hashes.get_or_compute_seq_hashes().to_vec(); let sequence_hashes = tokens_with_hashes.get_or_compute_seq_hashes().to_vec();
self.process_routing_decision_internal(worker, local_hashes, sequence_hashes) self.process_routing_decision_with_hashes(worker, local_hashes, sequence_hashes)
.await .await
} }
async fn flush(&self) -> usize { async fn flush(&self) -> usize {
...@@ -526,8 +526,8 @@ impl KvIndexerInterface for KvIndexer { ...@@ -526,8 +526,8 @@ impl KvIndexerInterface for KvIndexer {
} }
impl KvIndexer { impl KvIndexer {
/// Internal method to process a routing decision with pre-computed hashes. /// Process a routing decision with pre-computed hashes.
async fn process_routing_decision_internal( pub async fn process_routing_decision_with_hashes(
&self, &self,
worker: WorkerWithDpRank, worker: WorkerWithDpRank,
local_hashes: Vec<LocalBlockHash>, local_hashes: Vec<LocalBlockHash>,
......
...@@ -353,6 +353,21 @@ impl KvIndexerSharded { ...@@ -353,6 +353,21 @@ impl KvIndexerSharded {
) -> Self { ) -> Self {
Self::new_with_frequency(token, num_shards, None, kv_block_size, metrics, None) Self::new_with_frequency(token, num_shards, None, kv_block_size, metrics, None)
} }
/// Returns the shard index assigned to `worker_id`, assigning one on first sight.
///
/// A worker that has no entry in `worker_assignments` is placed on the shard
/// whose entry in `worker_counts` is smallest, and that count is incremented.
/// Later calls for the same worker return the remembered index.
///
/// # Panics
/// Panics if the `worker_counts` mutex is poisoned or `worker_counts` is empty.
fn shard_for_worker(&self, worker_id: WorkerId) -> usize {
    *self.worker_assignments.entry(worker_id).or_insert_with(|| {
        // Pick the least-loaded shard and bump its count under ONE lock
        // acquisition. Releasing the lock between the read and the increment
        // lets two first-seen workers observe the same minimum and both land
        // on the same shard.
        let mut worker_counts = self.worker_counts.lock().unwrap();
        let selected_shard = worker_counts
            .iter()
            .enumerate()
            .min_by_key(|&(_, count)| count)
            .expect("worker_counts must contain at least one shard")
            .0;
        worker_counts[selected_shard] += 1;
        selected_shard
    })
}
} }
#[async_trait] #[async_trait]
...@@ -439,26 +454,8 @@ impl KvIndexerInterface for KvIndexerSharded { ...@@ -439,26 +454,8 @@ impl KvIndexerInterface for KvIndexerSharded {
} }
async fn apply_event(&self, event: RouterEvent) { async fn apply_event(&self, event: RouterEvent) {
let shard = self let shard = self.shard_for_worker(event.worker_id);
.worker_assignments self.event_tx[shard].send(event).await.unwrap();
.entry(event.worker_id)
.or_insert_with(|| {
// Get the shard with the smallest amount of workers.
let worker_counts = self.worker_counts.lock().unwrap();
let selected_shard = worker_counts
.iter()
.enumerate()
.min_by_key(|&(_, value)| value)
.unwrap()
.0;
drop(worker_counts);
// Increment the count for this shard
self.worker_counts.lock().unwrap()[selected_shard] += 1;
selected_shard
});
self.event_tx[*shard].send(event).await.unwrap();
} }
async fn remove_worker(&self, worker: WorkerId) { async fn remove_worker(&self, worker: WorkerId) {
...@@ -525,7 +522,7 @@ impl KvIndexerInterface for KvIndexerSharded { ...@@ -525,7 +522,7 @@ impl KvIndexerInterface for KvIndexerSharded {
let local_hashes = tokens_with_hashes.get_or_compute_block_hashes().to_vec(); let local_hashes = tokens_with_hashes.get_or_compute_block_hashes().to_vec();
let sequence_hashes = tokens_with_hashes.get_or_compute_seq_hashes().to_vec(); let sequence_hashes = tokens_with_hashes.get_or_compute_seq_hashes().to_vec();
self.process_routing_decision_internal(worker, local_hashes, sequence_hashes) self.process_routing_decision_with_hashes(worker, local_hashes, sequence_hashes)
.await .await
} }
...@@ -550,19 +547,14 @@ impl KvIndexerInterface for KvIndexerSharded { ...@@ -550,19 +547,14 @@ impl KvIndexerInterface for KvIndexerSharded {
} }
impl KvIndexerSharded { impl KvIndexerSharded {
/// Internal method to process a routing decision with pre-computed hashes. /// Process a routing decision with pre-computed hashes.
async fn process_routing_decision_internal( pub async fn process_routing_decision_with_hashes(
&self, &self,
worker: WorkerWithDpRank, worker: WorkerWithDpRank,
local_hashes: Vec<LocalBlockHash>, local_hashes: Vec<LocalBlockHash>,
sequence_hashes: Vec<SequenceHash>, sequence_hashes: Vec<SequenceHash>,
) -> Result<(), KvRouterError> { ) -> Result<(), KvRouterError> {
// Route to the appropriate shard based on worker assignment let shard_idx = self.shard_for_worker(worker.worker_id);
let shard_idx = self
.worker_assignments
.get(&worker.worker_id)
.map(|shard_idx| *shard_idx)
.unwrap_or_default();
self.routing_tx[shard_idx] self.routing_tx[shard_idx]
.send(RoutingDecisionRequest { .send(RoutingDecisionRequest {
......
...@@ -13,6 +13,7 @@ use super::concurrent_radix_tree::ConcurrentRadixTree; ...@@ -13,6 +13,7 @@ use super::concurrent_radix_tree::ConcurrentRadixTree;
use super::concurrent_radix_tree_compressed::ConcurrentRadixTreeCompressed; use super::concurrent_radix_tree_compressed::ConcurrentRadixTreeCompressed;
use super::positional::PositionalIndexer; use super::positional::PositionalIndexer;
use super::*; use super::*;
use crate::indexer::pruning::PruneConfig;
use crate::protocols::*; use crate::protocols::*;
use crate::test_utils::{remove_event, router_event, stored_blocks_with_sequence_hashes}; use crate::test_utils::{remove_event, router_event, stored_blocks_with_sequence_hashes};
...@@ -1889,6 +1890,37 @@ fn make_tree_indexer_with_frequency( ...@@ -1889,6 +1890,37 @@ fn make_tree_indexer_with_frequency(
} }
} }
/// Records a routing decision for a worker the sharded indexer has not seen
/// before, checks that the decision is scored, and then checks that removing
/// the worker clears its score.
#[tokio::test]
async fn test_sharded_routing_decision_assigns_first_seen_worker() {
    let cancel = CancellationToken::new();
    let indexer_metrics = Arc::new(KvIndexerMetrics::new_unregistered());
    // Positional arguments: 4 shards and a kv block size of 32.
    let sharded = KvIndexerSharded::new_with_frequency(
        cancel,
        4,
        Some(Duration::from_secs(60)),
        32,
        indexer_metrics,
        Some(PruneConfig::default()),
    );

    let target = WorkerWithDpRank::new(42, 0);
    let block_hashes = vec![LocalBlockHash(11), LocalBlockHash(22)];
    let seq_hashes = compute_seq_hash_for_block(&block_hashes);

    // No event was applied for `target` beforehand, so this call is the first
    // time the indexer hears about the worker.
    let recorded = sharded
        .process_routing_decision_with_hashes(target, block_hashes, seq_hashes)
        .await;
    recorded.unwrap();
    flush_and_settle(&sharded).await;

    // Both routed blocks must now be attributed to the worker.
    assert_score(&sharded, &[11, 22], target, 2).await;

    // After removal the worker must no longer appear in the overlap scores.
    sharded.remove_worker(target.worker_id).await;
    flush_and_settle(&sharded).await;
    let after_removal = query_scores(&sharded, &[11, 22]).await;
    assert!(!after_removal.scores.contains_key(&target));
}
mod tree_specific_tests { mod tree_specific_tests {
use super::*; use super::*;
use rstest_reuse::apply; use rstest_reuse::apply;
......
...@@ -110,14 +110,14 @@ impl dynamo_runtime::protocols::maybe_error::MaybeError for WorkerKvQueryRespons ...@@ -110,14 +110,14 @@ impl dynamo_runtime::protocols::maybe_error::MaybeError for WorkerKvQueryRespons
/// Endpoint name for the standalone KV indexer query service. /// Endpoint name for the standalone KV indexer query service.
pub const KV_INDEXER_QUERY_ENDPOINT: &str = "kv_indexer_query"; pub const KV_INDEXER_QUERY_ENDPOINT: &str = "kv_indexer_query";
/// Endpoint name for recording approximate-mode routing decisions on a remote indexer.
pub const KV_INDEXER_RECORD_ROUTING_DECISION_ENDPOINT: &str = "kv_indexer_record_routing_decision";
/// Request to query the standalone KV indexer for overlap scores. /// Request to query a served KV indexer for overlap scores.
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Clone)]
pub struct IndexerQueryRequest { pub struct IndexerQueryRequest {
/// Model name to query the indexer for. /// Model name to query the indexer for.
pub model_name: String, pub model_name: String,
/// Dynamo namespace (used as tenant_id for indexer lookup).
pub namespace: String,
/// Block hashes to find matches for in the radix tree. /// Block hashes to find matches for in the radix tree.
pub block_hashes: Vec<LocalBlockHash>, pub block_hashes: Vec<LocalBlockHash>,
} }
...@@ -153,7 +153,7 @@ impl From<WireOverlapScores> for OverlapScores { ...@@ -153,7 +153,7 @@ impl From<WireOverlapScores> for OverlapScores {
} }
} }
/// Response from the standalone KV indexer. /// Response from a served KV indexer query.
#[derive(Serialize, Deserialize, Debug, Clone)] #[derive(Serialize, Deserialize, Debug, Clone)]
pub enum IndexerQueryResponse { pub enum IndexerQueryResponse {
/// Overlap scores per worker. /// Overlap scores per worker.
...@@ -191,6 +191,57 @@ impl dynamo_runtime::protocols::maybe_error::MaybeError for IndexerQueryResponse ...@@ -191,6 +191,57 @@ impl dynamo_runtime::protocols::maybe_error::MaybeError for IndexerQueryResponse
} }
} }
/// Request to record a routing decision on a served approximate-mode indexer.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct IndexerRecordRoutingDecisionRequest {
/// Model name to update.
pub model_name: String,
/// Selected worker for this routing decision.
pub worker: WorkerWithDpRank,
/// Locally-computed block hashes for the routed request.
pub local_hashes: Vec<LocalBlockHash>,
/// Locally-computed rolling sequence hashes for the routed request.
pub sequence_hashes: Vec<SequenceHash>,
}
/// Response from a served approximate-mode routing-decision endpoint.
///
/// The `MaybeError` implementations for this type treat `Error` as the only
/// failing variant.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub enum IndexerRecordRoutingDecisionResponse {
/// Non-error reply; presumably sent once the decision has been applied
/// (confirm against the serving handler).
Recorded,
/// Failure reply carrying a human-readable message.
Error(String),
}
impl MaybeError for IndexerRecordRoutingDecisionResponse {
    /// Wraps any error into the `Error` variant, keeping only its display text.
    fn from_err(err: impl std::error::Error + 'static) -> Self {
        Self::Error(err.to_string())
    }

    /// Returns the carried message as a boxed error, or `None` when the
    /// response is not the `Error` variant.
    fn err(&self) -> Option<Box<dyn std::error::Error + Send + Sync>> {
        let Self::Error(message) = self else {
            return None;
        };
        Some(Box::new(std::io::Error::other(message.clone())))
    }
}
#[cfg(feature = "runtime-protocols")]
impl dynamo_runtime::protocols::maybe_error::MaybeError for IndexerRecordRoutingDecisionResponse {
    /// Wraps any error into the `Error` variant, keeping only its display text.
    fn from_err(err: impl std::error::Error + 'static) -> Self {
        Self::Error(err.to_string())
    }

    /// Converts an `Error` response into a `DynamoError`; `Recorded` yields `None`.
    fn err(&self) -> Option<dynamo_runtime::error::DynamoError> {
        match self {
            Self::Error(message) => Some(dynamo_runtime::error::DynamoError::msg(
                message.clone(),
            )),
            Self::Recorded => None,
        }
    }
}
/// A request to find matches in the Radix Tree. /// A request to find matches in the Radix Tree.
pub struct MatchRequest { pub struct MatchRequest {
/// A vector of `LocalBlockHash` representing the sequence to match. /// A vector of `LocalBlockHash` representing the sequence to match.
......
...@@ -204,12 +204,16 @@ pub struct KvRouterConfig { ...@@ -204,12 +204,16 @@ pub struct KvRouterConfig {
/// "wspt": weighted shortest processing time (Smith's rule) — optimizes average TTFT. /// "wspt": weighted shortest processing time (Smith's rule) — optimizes average TTFT.
pub router_queue_policy: RouterQueuePolicy, pub router_queue_policy: RouterQueuePolicy,
/// Component name of a standalone KV indexer to use for overlap scoring. /// Whether to query a remote KV indexer served from the worker component
/// When set, the router creates a `Remote` indexer that queries the standalone /// instead of maintaining a local radix tree for overlap scoring.
/// indexer via the request plane instead of maintaining a local radix tree.
/// The standalone indexer handles its own event subscription and discovery.
#[serde(default)] #[serde(default)]
pub remote_indexer_component: Option<String>, pub use_remote_indexer: bool,
/// Whether this router should serve its local indexer from the worker component.
/// This enables other routers/frontends in the same namespace to query
/// overlap scores remotely over the request plane by component + endpoint.
#[serde(default)]
pub serve_indexer: bool,
} }
impl Default for KvRouterConfig { impl Default for KvRouterConfig {
...@@ -234,7 +238,8 @@ impl Default for KvRouterConfig { ...@@ -234,7 +238,8 @@ impl Default for KvRouterConfig {
router_event_threads: 4, router_event_threads: 4,
skip_initial_worker_wait: false, skip_initial_worker_wait: false,
router_queue_policy: RouterQueuePolicy::default(), router_queue_policy: RouterQueuePolicy::default(),
remote_indexer_component: None, use_remote_indexer: false,
serve_indexer: false,
} }
} }
} }
...@@ -268,6 +273,16 @@ fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationEr ...@@ -268,6 +273,16 @@ fn validate_kv_router_config(config: &KvRouterConfig) -> Result<(), ValidationEr
"router_prefill_load_model currently requires router_queue_policy='fcfs'", "router_prefill_load_model currently requires router_queue_policy='fcfs'",
)); ));
} }
if config.use_remote_indexer && config.serve_indexer {
return Err(ValidationError::new(
"use_remote_indexer and serve_indexer are mutually exclusive",
));
}
if config.serve_indexer && config.overlap_score_weight == 0.0 {
return Err(ValidationError::new(
"serve_indexer requires overlap_score_weight > 0",
));
}
Ok(()) Ok(())
} }
......
...@@ -6,8 +6,6 @@ pub mod listener; ...@@ -6,8 +6,6 @@ pub mod listener;
pub mod metrics; pub mod metrics;
pub mod recovery; pub mod recovery;
pub mod registry; pub mod registry;
#[cfg(feature = "indexer-runtime")]
pub mod runtime;
pub mod server; pub mod server;
mod zmq; mod zmq;
...@@ -31,13 +29,6 @@ pub struct IndexerConfig { ...@@ -31,13 +29,6 @@ pub struct IndexerConfig {
pub peers: Option<String>, pub peers: Option<String>,
} }
#[cfg(feature = "indexer-runtime")]
pub struct RuntimeConfig {
pub namespace: String,
pub component_name: String,
pub worker_component: String,
}
pub(super) fn validate_zmq_endpoint(endpoint: &str) -> anyhow::Result<()> { pub(super) fn validate_zmq_endpoint(endpoint: &str) -> anyhow::Result<()> {
let (scheme, address) = endpoint let (scheme, address) = endpoint
.split_once("://") .split_once("://")
...@@ -155,81 +146,6 @@ pub async fn run_server(config: IndexerConfig) -> anyhow::Result<()> { ...@@ -155,81 +146,6 @@ pub async fn run_server(config: IndexerConfig) -> anyhow::Result<()> {
run_common(&config, &registry, cancel_token).await run_common(&config, &registry, cancel_token).await
} }
/// Runs the standalone KV indexer attached to a Dynamo distributed runtime.
///
/// Steps, in order:
/// 1. Build a `DistributedRuntime` from `runtime` and resolve the component
///    `runtime_config.namespace` / `runtime_config.component_name`.
/// 2. Create a `WorkerRegistry` and expose it through an `IndexerQueryEngine`
///    on the `KV_INDEXER_QUERY_ENDPOINT` endpoint of that component.
/// 3. Start the discovery watcher and the event subscriber.
/// 4. Hand over to `run_common` with the runtime's primary cancellation token.
///
/// # Errors
/// Propagates failures from runtime construction, namespace/component lookup,
/// ingress creation, the two `spawn_*` helpers and `run_common`. A failure of
/// the query endpoint itself is only logged, because it runs in a spawned task.
#[cfg(feature = "indexer-runtime")]
pub async fn run_with_runtime(
runtime: dynamo_runtime::Runtime,
config: IndexerConfig,
runtime_config: RuntimeConfig,
) -> anyhow::Result<()> {
use dynamo_runtime::{
DistributedRuntime,
pipeline::{ManyOut, SingleIn, network::Ingress},
};
use crate::indexer::{IndexerQueryRequest, IndexerQueryResponse, KV_INDEXER_QUERY_ENDPOINT};
let distributed_runtime = DistributedRuntime::from_settings(runtime).await?;
// The same token is handed to both watchers and to `run_common` below.
let cancel_token = distributed_runtime.primary_token();
let component = distributed_runtime
.namespace(&runtime_config.namespace)?
.component(&runtime_config.component_name)?;
tracing::info!(
namespace = %runtime_config.namespace,
component = %runtime_config.component_name,
block_size = ?config.block_size,
port = config.port,
threads = config.threads,
model_name = %config.model_name,
tenant_id = %config.tenant_id,
worker_component = %runtime_config.worker_component,
// `peers` is a comma-separated list; an absent value counts as zero peers.
num_peers = config.peers.as_ref().map(|p| p.split(',').count()).unwrap_or(0),
"Starting standalone KV cache indexer (Dynamo runtime mode)"
);
// The registry is shared by the query engine, both watchers and `run_common`.
let registry = Arc::new(WorkerRegistry::new(config.threads));
let engine = Arc::new(runtime::query_engine::IndexerQueryEngine {
registry: registry.clone(),
});
let ingress =
Ingress::<SingleIn<IndexerQueryRequest>, ManyOut<IndexerQueryResponse>>::for_engine(
engine,
)?;
let query_endpoint = component
.endpoint(KV_INDEXER_QUERY_ENDPOINT)
.endpoint_builder()
.handler(ingress)
.graceful_shutdown(true);
// The endpoint is served from a task on the secondary runtime; a start
// failure is logged there and does not abort this function.
distributed_runtime.runtime().secondary().spawn(async move {
if let Err(err) = query_endpoint.start().await {
tracing::error!(error = %err, "Query endpoint failed");
}
});
// NOTE(review): this is logged right after the spawn, not after `start()`
// has completed, so it does not prove the endpoint is actually up.
tracing::info!(
endpoint = KV_INDEXER_QUERY_ENDPOINT,
"Query endpoint registered"
);
runtime::discovery::spawn_discovery_watcher(
&distributed_runtime,
registry.clone(),
cancel_token.clone(),
)
.await?;
runtime::subscriber::spawn_event_subscriber(
&distributed_runtime,
&runtime_config.namespace,
&runtime_config.worker_component,
registry.clone(),
cancel_token.clone(),
)
.await?;
run_common(&config, &registry, cancel_token).await
}
async fn wait_for_min_initial_workers( async fn wait_for_min_initial_workers(
registry: &WorkerRegistry, registry: &WorkerRegistry,
cancel_token: &CancellationToken, cancel_token: &CancellationToken,
......
...@@ -314,8 +314,6 @@ pub struct WorkerRegistry { ...@@ -314,8 +314,6 @@ pub struct WorkerRegistry {
indexers: DashMap<IndexerKey, IndexerEntry>, indexers: DashMap<IndexerKey, IndexerEntry>,
peers: DashMap<String, ()>, peers: DashMap<String, ()>,
watermarks: DashMap<(WorkerId, u32), Arc<AtomicU64>>, watermarks: DashMap<(WorkerId, u32), Arc<AtomicU64>>,
#[cfg(feature = "indexer-runtime")]
discovered_workers: DashMap<WorkerId, IndexerKey>,
num_threads: usize, num_threads: usize,
ready_tx: watch::Sender<bool>, ready_tx: watch::Sender<bool>,
ready_rx: watch::Receiver<bool>, ready_rx: watch::Receiver<bool>,
...@@ -329,8 +327,6 @@ impl WorkerRegistry { ...@@ -329,8 +327,6 @@ impl WorkerRegistry {
indexers: DashMap::new(), indexers: DashMap::new(),
peers: DashMap::new(), peers: DashMap::new(),
watermarks: DashMap::new(), watermarks: DashMap::new(),
#[cfg(feature = "indexer-runtime")]
discovered_workers: DashMap::new(),
num_threads, num_threads,
ready_tx, ready_tx,
ready_rx, ready_rx,
...@@ -360,16 +356,7 @@ impl WorkerRegistry { ...@@ -360,16 +356,7 @@ impl WorkerRegistry {
#[cfg(feature = "metrics")] #[cfg(feature = "metrics")]
pub fn refresh_metrics(&self) { pub fn refresh_metrics(&self) {
let models = self.indexers.len(); let models = self.indexers.len();
let workers = self.workers.len() + { let workers = self.workers.len();
#[cfg(feature = "indexer-runtime")]
{
self.discovered_workers.len()
}
#[cfg(not(feature = "indexer-runtime"))]
{
0
}
};
let mut listener_counts = [0_i64; 4]; let mut listener_counts = [0_i64; 4];
for entry in self.workers.iter() { for entry in self.workers.iter() {
...@@ -392,14 +379,6 @@ impl WorkerRegistry { ...@@ -392,14 +379,6 @@ impl WorkerRegistry {
block_size: u32, block_size: u32,
replay_endpoint: Option<String>, replay_endpoint: Option<String>,
) -> Result<()> { ) -> Result<()> {
#[cfg(feature = "indexer-runtime")]
if self.discovered_workers.contains_key(&instance_id) {
bail!(
"instance {instance_id} is already registered via discovery; \
use the Dynamo runtime to manage it"
);
}
let key = IndexerKey { let key = IndexerKey {
model_name, model_name,
tenant_id, tenant_id,
...@@ -495,24 +474,10 @@ impl WorkerRegistry { ...@@ -495,24 +474,10 @@ impl WorkerRegistry {
entry.key.tenant_id entry.key.tenant_id
); );
} }
} else {
#[cfg(feature = "indexer-runtime")]
if let Some(discovered_key) = self.discovered_workers.get(&instance_id) {
if discovered_key.value() != &key {
bail!(
"instance {instance_id} is registered for model={} tenant={}",
discovered_key.value().model_name,
discovered_key.value().tenant_id
);
}
} else { } else {
bail!("instance {instance_id} not found"); bail!("instance {instance_id} not found");
} }
#[cfg(not(feature = "indexer-runtime"))]
bail!("instance {instance_id} not found");
}
if let Some((_, entry)) = self.workers.remove(&instance_id) { if let Some((_, entry)) = self.workers.remove(&instance_id) {
for record in entry.listeners.values() { for record in entry.listeners.values() {
if let Some(cancel_token) = record.take_cancel() { if let Some(cancel_token) = record.take_cancel() {
...@@ -522,11 +487,6 @@ impl WorkerRegistry { ...@@ -522,11 +487,6 @@ impl WorkerRegistry {
for &dp_rank in entry.listeners.keys() { for &dp_rank in entry.listeners.keys() {
self.watermarks.remove(&(instance_id, dp_rank)); self.watermarks.remove(&(instance_id, dp_rank));
} }
} else {
#[cfg(feature = "indexer-runtime")]
{
self.discovered_workers.remove(&instance_id);
}
} }
if let Some(ie) = self.indexers.get(&key) { if let Some(ie) = self.indexers.get(&key) {
...@@ -602,21 +562,6 @@ impl WorkerRegistry { ...@@ -602,21 +562,6 @@ impl WorkerRegistry {
} }
entry.key.clone() entry.key.clone()
} else { } else {
#[cfg(feature = "indexer-runtime")]
if let Some(discovered_key) = self.discovered_workers.get(&instance_id) {
if discovered_key.value().model_name != model_name {
bail!(
"instance {instance_id} is registered for model={} tenant={}",
discovered_key.value().model_name,
discovered_key.value().tenant_id
);
}
discovered_key.value().clone()
} else {
bail!("instance {instance_id} not found");
}
#[cfg(not(feature = "indexer-runtime"))]
bail!("instance {instance_id} not found"); bail!("instance {instance_id} not found");
}; };
...@@ -629,11 +574,6 @@ impl WorkerRegistry { ...@@ -629,11 +574,6 @@ impl WorkerRegistry {
for &dp_rank in entry.listeners.keys() { for &dp_rank in entry.listeners.keys() {
self.watermarks.remove(&(instance_id, dp_rank)); self.watermarks.remove(&(instance_id, dp_rank));
} }
} else {
#[cfg(feature = "indexer-runtime")]
{
self.discovered_workers.remove(&instance_id);
}
} }
if let Some(ie) = self.indexers.get(&key) { if let Some(ie) = self.indexers.get(&key) {
...@@ -656,11 +596,6 @@ impl WorkerRegistry { ...@@ -656,11 +596,6 @@ impl WorkerRegistry {
}, },
)? )?
} else { } else {
#[cfg(feature = "indexer-runtime")]
if self.discovered_workers.contains_key(&instance_id) {
return Err(ListenerControlError::DiscoveryManaged { instance_id });
}
return Err(ListenerControlError::WorkerNotFound { instance_id }); return Err(ListenerControlError::WorkerNotFound { instance_id });
}; };
...@@ -683,11 +618,6 @@ impl WorkerRegistry { ...@@ -683,11 +618,6 @@ impl WorkerRegistry {
}, },
)? )?
} else { } else {
#[cfg(feature = "indexer-runtime")]
if self.discovered_workers.contains_key(&instance_id) {
return Err(ListenerControlError::DiscoveryManaged { instance_id });
}
return Err(ListenerControlError::WorkerNotFound { instance_id }); return Err(ListenerControlError::WorkerNotFound { instance_id });
}; };
...@@ -724,21 +654,6 @@ impl WorkerRegistry { ...@@ -724,21 +654,6 @@ impl WorkerRegistry {
}) })
.collect(); .collect();
#[cfg(feature = "indexer-runtime")]
for entry in self.discovered_workers.iter() {
let worker_id = *entry.key();
if self.workers.contains_key(&worker_id) {
continue;
}
result.push(WorkerInfo {
instance_id: worker_id,
source: WorkerSource::Discovery,
status: ListenerStatus::Active,
endpoints: HashMap::new(),
listeners: HashMap::new(),
});
}
result result
} }
...@@ -784,97 +699,6 @@ impl WorkerRegistry { ...@@ -784,97 +699,6 @@ impl WorkerRegistry {
.collect() .collect()
} }
/// Registers a worker that was found through discovery.
///
/// The worker is keyed by `(model_name, tenant_id)`. If no indexer exists for
/// that key yet, one is created with `block_size` and `self.num_threads`.
/// Calling this again for an already-discovered instance with the same key is
/// a no-op.
///
/// # Errors
/// Fails when
/// - `instance_id` is already present in the manually registered `workers`,
/// - `instance_id` was already discovered under a different model/tenant key,
/// - an indexer for the key exists with a different `block_size`.
#[cfg(feature = "indexer-runtime")]
pub fn add_worker_from_discovery(
&self,
instance_id: WorkerId,
model_name: String,
tenant_id: String,
block_size: u32,
) -> Result<()> {
// Manual registration takes precedence: never shadow it with discovery.
if self.workers.contains_key(&instance_id) {
bail!(
"instance {instance_id} is already manually registered; \
cannot add via discovery"
);
}
let key = IndexerKey {
model_name,
tenant_id,
};
// Re-announcement of a known instance: accept it only under the same key.
if let Some(existing) = self.discovered_workers.get(&instance_id) {
if existing.value() != &key {
bail!(
"instance {instance_id} is already registered for model={} tenant={}",
existing.value().model_name,
existing.value().tenant_id
);
}
return Ok(());
}
// Get the indexer for this key, creating it on first use.
let indexer_entry = self.indexers.entry(key.clone()).or_insert_with(|| {
tracing::info!(
model_name = %key.model_name,
tenant_id = %key.tenant_id,
block_size,
"Creating new indexer (discovery)"
);
IndexerEntry {
indexer: create_indexer(block_size, self.num_threads),
block_size,
}
});
// A freshly created entry always matches; this only fires for an existing
// indexer that was built with another block size.
if indexer_entry.block_size != block_size {
bail!(
"block_size mismatch for model={} tenant={}: existing={}, requested={}",
key.model_name,
key.tenant_id,
indexer_entry.block_size,
block_size
);
}
// Release the `indexers` entry guard before touching `discovered_workers`.
drop(indexer_entry);
self.discovered_workers.insert(instance_id, key);
Ok(())
}
/// Removes a discovery-registered worker and its data from the indexer.
///
/// If `instance_id` is not in `discovered_workers` this only emits a debug
/// log. Otherwise the worker is removed from the indexer registered under its
/// key (when that indexer still exists) and `maybe_remove_indexer` is called
/// for the key afterwards.
#[cfg(feature = "indexer-runtime")]
pub async fn remove_worker_from_discovery(&self, instance_id: WorkerId) {
    let Some((_, key)) = self.discovered_workers.remove(&instance_id) else {
        tracing::debug!(
            instance_id,
            "remove_worker_from_discovery: worker not in discovered_workers map"
        );
        return;
    };

    // Clone the indexer handle out of the map so the `indexers` guard is
    // released BEFORE awaiting. Holding a map guard across `.await` keeps the
    // map's internal lock taken while this task is suspended, which can stall
    // or deadlock other tasks that need to write to the same map.
    // The clone is the same kind of handle `get_indexer_for_worker` returns.
    let indexer = self.indexers.get(&key).map(|ie| ie.indexer.clone());
    if let Some(indexer) = indexer {
        indexer.remove_worker(instance_id).await;
    }

    self.maybe_remove_indexer(&key);
}
/// Looks up the indexer that serves `worker_id`.
///
/// The discovery-registered workers are consulted first; if that lookup does
/// not produce an indexer, the manually registered workers are tried. Returns
/// `None` when neither path resolves to an existing indexer.
#[cfg(feature = "indexer-runtime")]
pub fn get_indexer_for_worker(&self, worker_id: WorkerId) -> Option<Indexer> {
    // Path 1: worker known through discovery.
    let via_discovery = || -> Option<Indexer> {
        let discovered_key = self.discovered_workers.get(&worker_id)?;
        let indexer_entry = self.indexers.get(discovered_key.value())?;
        Some(indexer_entry.indexer.clone())
    };

    // Path 2: worker registered manually.
    let via_registration = || -> Option<Indexer> {
        let worker_entry = self.workers.get(&worker_id)?;
        let indexer_entry = self.indexers.get(&worker_entry.key)?;
        Some(indexer_entry.indexer.clone())
    };

    via_discovery().or_else(via_registration)
}
fn spawn_listener( fn spawn_listener(
&self, &self,
instance_id: WorkerId, instance_id: WorkerId,
...@@ -897,15 +721,6 @@ impl WorkerRegistry { ...@@ -897,15 +721,6 @@ impl WorkerRegistry {
return; return;
} }
#[cfg(feature = "indexer-runtime")]
if self
.discovered_workers
.iter()
.any(|entry| entry.value() == key)
{
return;
}
self.indexers.remove(key); self.indexers.remove(key);
} }
} }
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::sync::Arc;
use dynamo_runtime::stream::StreamExt;
use dynamo_runtime::{
DistributedRuntime,
discovery::{
DiscoveryEvent, DiscoveryInstance, DiscoveryInstanceId, DiscoveryQuery, DiscoveryStream,
},
};
use serde::Deserialize;
use tokio_util::sync::CancellationToken;
use crate::standalone_indexer::registry::WorkerRegistry;
/// The subset of a model card that the discovery watcher needs.
///
/// Deserialized from a discovered model instance; other card fields are not
/// declared here.
#[derive(Deserialize, Debug)]
struct PartialModelCard {
/// Used by the watcher as the model name of the worker.
pub display_name: String,
/// KV cache block size. Defaults to 0 when the field is missing, and the
/// watcher skips workers whose value is 0.
#[serde(default)]
pub kv_cache_block_size: u32,
}
/// Starts a background task that mirrors model discovery events into `registry`.
///
/// The function subscribes to `DiscoveryQuery::AllModels` with `cancel_token`,
/// spawns the processing loop on the Tokio runtime and returns immediately.
///
/// Inside the loop:
/// - `Added` model instances are registered through
///   `WorkerRegistry::add_worker_from_discovery`, with the card's
///   `display_name` as model name and the instance namespace as tenant id.
/// - `Removed` model ids are passed to
///   `WorkerRegistry::remove_worker_from_discovery`.
/// - Non-model events, undecodable cards and cards with a block size of 0 are
///   skipped.
///
/// # Errors
/// Returns an error only if the initial `list_and_watch` subscription fails.
/// Errors while processing events are logged and do not stop the task.
pub async fn spawn_discovery_watcher(
drt: &DistributedRuntime,
registry: Arc<WorkerRegistry>,
cancel_token: CancellationToken,
) -> anyhow::Result<()> {
let discovery = drt.discovery();
let mut stream: DiscoveryStream = discovery
.list_and_watch(DiscoveryQuery::AllModels, Some(cancel_token.clone()))
.await?;
// `stream` and `registry` are moved into the task; the handle is not kept.
tokio::spawn(async move {
tracing::info!("Discovery watcher started");
// Runs until the stream yields `None`.
while let Some(result) = stream.next().await {
let event = match result {
Ok(event) => event,
Err(err) => {
// A failed stream item is logged and skipped.
tracing::error!(%err, "Error in discovery stream");
continue;
}
};
match event {
DiscoveryEvent::Added(instance) => {
// Only model instances carry a card to decode.
let (instance_id, namespace, card) = match &instance {
DiscoveryInstance::Model {
instance_id,
namespace,
..
} => match instance.deserialize_model::<PartialModelCard>() {
Ok(card) => (*instance_id, namespace.clone(), card),
Err(err) => {
tracing::error!(%err, instance_id, "Failed to deserialize model card");
continue;
}
},
_ => {
tracing::debug!("Ignoring non-model discovery instance");
continue;
}
};
let model_name = card.display_name.clone();
let block_size = card.kv_cache_block_size;
// The discovery namespace is used as the tenant id.
let tenant_id = namespace;
// 0 is also the serde default for a missing `kv_cache_block_size`.
if block_size == 0 {
tracing::warn!(
instance_id,
model_name,
"Skipping worker with kv_cache_block_size=0"
);
continue;
}
tracing::info!(
instance_id,
model_name,
tenant_id,
block_size,
"Discovery: adding worker"
);
// A registration failure is logged; the watcher keeps running.
if let Err(err) = registry.add_worker_from_discovery(
instance_id,
model_name.clone(),
tenant_id,
block_size,
) {
tracing::error!(
instance_id,
model_name,
error = %err,
"Failed to add discovered worker"
);
}
}
DiscoveryEvent::Removed(id) => {
// Only model removals map to a worker instance id.
let instance_id = match &id {
DiscoveryInstanceId::Model(mcid) => mcid.instance_id,
_ => {
tracing::debug!("Ignoring non-model discovery removal");
continue;
}
};
tracing::info!(instance_id, "Discovery: removing worker");
registry.remove_worker_from_discovery(instance_id).await;
}
}
}
tracing::info!("Discovery watcher exiting");
});
Ok(())
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment