Unverified Commit 2ec1c3f5 authored by dagil-nvidia's avatar dagil-nvidia Committed by GitHub
Browse files

docs: add Discovery Plane, refactor Event Plane with D2 diagrams (#6229)


Signed-off-by: default avatarDan Gil <dagil@nvidia.com>
Co-authored-by: default avatarCursor <cursoragent@cursor.com>
parent 30750284
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
direction: down
title: "Discovery Plane" {
style.font-size: 36
shape: text
near: top-center
}
components: "Components" {
style.font-size: 32
direction: right
frontend: Frontend {
style.font-size: 24
shape: rectangle
}
router: Router {
style.font-size: 24
shape: rectangle
}
planner: Planner {
style.font-size: 24
shape: rectangle
}
}
discovery: "Discovery Layer" {
style.font-size: 32
direction: right
k8s: "Kubernetes Backend" {
style.font-size: 28
crds: "DynamoWorkerMetadata\nCRDs" {
style.font-size: 22
shape: rectangle
}
eps: "EndpointSlices" {
style.font-size: 22
shape: rectangle
}
api: "K8s API Watch" {
style.font-size: 22
shape: rectangle
}
}
etcd_backend: "etcd Backend (default)" {
style.font-size: 28
store: "etcd" {
style.font-size: 22
shape: cylinder
}
keys: "Key hierarchy\n/services/{ns}/{comp}/{ep}" {
style.font-size: 22
shape: text
}
lease: "Lease-based cleanup\nTTL: 10s" {
style.font-size: 22
shape: text
}
}
}
workers: "Workers" {
style.font-size: 32
direction: right
w1: "Worker 1" {
style.font-size: 24
shape: rectangle
}
w2: "Worker 2" {
style.font-size: 24
shape: rectangle
}
w3: "Worker N" {
style.font-size: 24
shape: rectangle
}
}
components -> discovery: "discover" {
style.font-size: 22
}
workers -> discovery: "register" {
style.font-size: 22
}
direction: right
dr: "DistributedRuntime" {
style.font-size: 28
ns: "• Namespace" {
style.font-size: 22
shape: text
}
comp: "• Components" {
style.font-size: 22
shape: text
}
ep: "• Endpoints" {
style.font-size: 22
shape: text
}
}
lease: "Primary Lease\nTTL: 10s" {
style.font-size: 24
shape: rectangle
style.bold: true
}
etcd: etcd {
style.font-size: 28
shape: cylinder
}
dr -> lease
lease -> etcd: "Keep-Alive\nHeartbeat" {
style.font-size: 22
}
direction: down
title: "Event Plane" {
style.font-size: 36
shape: text
near: top-center
}
components: "Subscribers" {
style.font-size: 28
direction: right
frontend: Frontend {
style.font-size: 24
shape: rectangle
}
router: Router {
style.font-size: 24
shape: rectangle
}
planner: Planner {
style.font-size: 24
shape: rectangle
}
}
transport: "Transport (choose one)" {
style.font-size: 32
direction: right
nats: "NATS Transport" {
style.font-size: 28
server: "NATS Server" {
style.font-size: 22
shape: cylinder
}
subjects: "Subject-scoped\npub/sub" {
style.font-size: 22
shape: text
}
}
zmq: "ZMQ Transport" {
style.font-size: 28
sockets: "PUB/SUB\nSockets" {
style.font-size: 22
shape: rectangle
}
peer: "Peer-to-peer\n(no server)" {
style.font-size: 22
shape: text
}
}
}
workers: "Publishers" {
style.font-size: 28
direction: right
w1: "Worker 1" {
style.font-size: 24
shape: rectangle
}
w2: "Worker 2" {
style.font-size: 24
shape: rectangle
}
w3: "Worker N" {
style.font-size: 24
shape: rectangle
}
}
events: "Event Types" {
style.font-size: 28
direction: right
kv: "KV Cache Events" {
style.font-size: 22
shape: rectangle
}
load: "Load Metrics" {
style.font-size: 22
shape: rectangle
}
seq: "Sequence Tracking" {
style.font-size: 22
shape: rectangle
}
}
components -> transport: "subscribe" {
style.font-size: 22
}
workers -> transport: "publish" {
style.font-size: 22
}
events -> workers: "" {
style.stroke-dash: 3
}
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
---
# Discovery Plane
Dynamo's service discovery layer lets components find each other at runtime. Workers register their endpoints when they start, and frontends discover them automatically.
The discovery backend adapts to the deployment environment.
![Discovery plane architecture showing Kubernetes and etcd backends](../../assets/img/discovery-plane.svg)
## Discovery Backends
| Deployment | Discovery Backend | Configuration |
|------------|-------------------|---------------|
| **Kubernetes** (with Dynamo operator) | Native K8s (CRDs, EndpointSlices) | Operator sets `DYN_DISCOVERY_BACKEND=kubernetes` |
| **Bare metal / Local** (default) | etcd | `ETCD_ENDPOINTS` (defaults to `http://localhost:2379`) |
> **Note:** The runtime always defaults to etcd (`kv_store`). Kubernetes discovery must be explicitly enabled -- the Dynamo operator handles this automatically.
## Kubernetes Discovery
When running on Kubernetes with the Dynamo operator, service discovery uses native Kubernetes resources instead of etcd.
### How It Works
1. Workers register their endpoints by creating **DynamoWorkerMetadata** custom resources.
2. **EndpointSlices** signal pod readiness to the system.
3. Components watch for CRD changes to discover available workers.
### Benefits
- No external etcd cluster required.
- Native integration with Kubernetes pod lifecycle.
- Automatic cleanup when pods terminate.
- Works with standard Kubernetes RBAC.
### Environment Variables (Injected by Operator)
| Variable | Description |
|----------|-------------|
| `DYN_DISCOVERY_BACKEND` | Set to `kubernetes` |
| `POD_NAME` | Current pod name |
| `POD_NAMESPACE` | Current namespace |
| `POD_UID` | Pod unique identifier |
## etcd Discovery (Default)
When `DYN_DISCOVERY_BACKEND` is not set (or set to `kv_store`), etcd is used for service discovery.
### Connection Configuration
| Variable | Description | Default |
|----------|-------------|---------|
| `ETCD_ENDPOINTS` | Comma-separated etcd URLs | `http://localhost:2379` |
| `ETCD_AUTH_USERNAME` | Basic auth username | None |
| `ETCD_AUTH_PASSWORD` | Basic auth password | None |
| `ETCD_AUTH_CA` | CA certificate path (TLS) | None |
| `ETCD_AUTH_CLIENT_CERT` | Client certificate path | None |
| `ETCD_AUTH_CLIENT_KEY` | Client key path | None |
Example:
```bash
export ETCD_ENDPOINTS=http://etcd-0:2379,http://etcd-1:2379,http://etcd-2:2379
```
### Service Registration
Workers register their endpoints in etcd with a key hierarchy:
```
/services/{namespace}/{component}/{endpoint}/{instance_id}
```
For example:
```
/services/vllm-agg/backend/generate/694d98147d54be25
```
Frontends and routers discover available workers by watching the relevant prefix and receiving real-time updates when workers join or leave.
### Lease-Based Cleanup
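Each runtime maintains a lease with etcd (default TTL: 10 seconds) and keeps it alive with periodic heartbeats. If a worker crashes or loses connectivity, the lease is no longer renewed and the steps listed after the diagram take place.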
![Lease lifecycle showing DistributedRuntime keep-alive heartbeat to etcd](../../assets/img/discovery-plane-lease.svg)
1. Keep-alive heartbeats stop.
2. The lease expires after the TTL.
3. All registered endpoints are automatically deleted.
4. Clients receive removal events and reroute traffic to healthy workers.
This ensures stale endpoints are cleaned up without manual intervention.
## KV Store
Dynamo provides a KV store abstraction for storing metadata (endpoint instances, model deployment cards, event channels). Multiple backends are supported:
| Backend | Use Case |
|---------|----------|
| etcd | Production deployments |
| Memory | Testing and development |
| NATS | NATS-only deployments |
| File | Local persistence |
## Operational Guidance
### Use Kubernetes Discovery on K8s
The Dynamo operator automatically sets `DYN_DISCOVERY_BACKEND=kubernetes` for pods. No additional setup is required.
### Deploy an etcd Cluster for Bare Metal
For bare-metal production deployments, deploy a 3-node etcd cluster for high availability.
### Tune Lease TTLs
Balance failure-detection speed against keep-alive overhead:
- **Short TTL (5s)** -- Faster failure detection, more keep-alive traffic.
- **Long TTL (30s)** -- Less overhead, slower detection.
The default (10s) is a reasonable starting point for most deployments.
## Related Documentation
- [Event Plane](event-plane.md) -- Pub/sub for KV cache events and worker metrics
- [Distributed Runtime](distributed-runtime.md) -- Runtime architecture
- [Request Plane](request-plane.md) -- Request transport configuration
- [Fault Tolerance](../fault-tolerance/README.md) -- Failure handling
...@@ -3,430 +3,137 @@ ...@@ -3,430 +3,137 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
--- ---
# Event Plane Architecture # Dynamo Event Plane
This document describes Dynamo's event plane architecture, which handles service discovery, coordination, and event distribution using etcd and NATS. The event plane provides Dynamo with a pub/sub layer for near real-time event exchange between components. It delivers KV cache updates, worker load metrics, and sequence tracking events, enabling features like KV-aware routing and disaggregated serving.
## Overview ## When Is the Event Plane Used?
Dynamo's coordination layer adapts to the deployment environment: Key use cases:
| Deployment | Service Discovery | KV Events | Request Plane | - **KV cache events** -- Workers publish cache state so the router can make cache-aware scheduling decisions.
|------------|-------------------|-----------|---------------| - **Worker load metrics** -- Workers report utilization so the router can balance load.
| **Kubernetes** (with operator) | Native K8s (CRDs, EndpointSlices) | NATS (optional) | TCP | - **Sequence tracking** -- Coordinates active sequences across router replicas for fault-tolerant routing.
| **Bare metal / Local** (default) | etcd | NATS (optional) | TCP |
> **Note:** The runtime always defaults to `kv_store` (etcd) for service discovery. Kubernetes deployments must explicitly set `DYN_DISCOVERY_BACKEND=kubernetes` - the Dynamo operator handles this automatically. ![Event plane architecture showing NATS and ZMQ transport options connecting Frontend, Planner, and Worker](../../assets/img/event-plane-transport.svg)
![Coordination Layer showing Service Discovery and NATS connecting to Frontend, Planner, and Worker](/assets/img/event-plane-coordination.svg) ## Choosing a Transport
## Kubernetes-Native Service Discovery The event plane supports two transports:
When running on Kubernetes with the Dynamo operator, service discovery uses native Kubernetes resources instead of etcd. | | NATS (default) | ZMQ |
|---|---|---|
| **External infrastructure** | Requires a NATS server | None (peer-to-peer) |
| **Setup complexity** | Simple -- point at a NATS server | Automatic -- workers bind sockets and register via discovery |
| **Best for** | Large-scale deployments | Low operational overhead |
### Configuration ## Configuration
The operator explicitly sets: ### Transport Selection
```bash
DYN_DISCOVERY_BACKEND=kubernetes
```
> **Important:** This must be explicitly configured. The runtime defaults to `kv_store` in all environments.
### How It Works
1. **DynamoWorkerMetadata CRD**: Workers register their endpoints by creating/updating DynamoWorkerMetadata custom resources
2. **EndpointSlices**: Used to signal readiness status to the system
3. **K8s API Watches**: Components watch for CRD changes to discover available endpoints
### Benefits
- No external etcd cluster required
- Native integration with Kubernetes lifecycle
- Automatic cleanup when pods terminate
- Works with standard K8s RBAC
### Environment Variables (Injected by Operator)
| Variable | Description | Set the `DYN_EVENT_PLANE` environment variable to choose a transport:
|----------|-------------|
| `DYN_DISCOVERY_BACKEND` | Set to `kubernetes` |
| `POD_NAME` | Current pod name |
| `POD_NAMESPACE` | Current namespace |
| `POD_UID` | Pod unique identifier |
---
## etcd Architecture (Default for All Deployments)
When `DYN_DISCOVERY_BACKEND=kv_store` (the global default), etcd is used for service discovery.
### Connection Configuration
etcd connection is configured via environment variables:
| Variable | Description | Default |
|----------|-------------|---------|
| `ETCD_ENDPOINTS` | Comma-separated etcd URLs | `http://localhost:2379` |
| `ETCD_AUTH_USERNAME` | Basic auth username | None |
| `ETCD_AUTH_PASSWORD` | Basic auth password | None |
| `ETCD_AUTH_CA` | CA certificate path (TLS) | None |
| `ETCD_AUTH_CLIENT_CERT` | Client certificate path | None |
| `ETCD_AUTH_CLIENT_KEY` | Client key path | None |
Example:
```bash ```bash
export ETCD_ENDPOINTS=http://etcd-0:2379,http://etcd-1:2379,http://etcd-2:2379 # Use NATS (default -- no need to set explicitly)
``` export DYN_EVENT_PLANE=nats
### Lease Management
Each `DistributedRuntime` maintains a primary lease with etcd:
![DistributedRuntime connected to Primary Lease with Keep-Alive Heartbeat to etcd](/assets/img/event-plane-lease.svg)
**Lease Lifecycle:**
1. **Creation**: Lease created during `DistributedRuntime` initialization
2. **Keep-Alive**: Background task sends heartbeats at 50% of remaining TTL
3. **Expiration**: If heartbeats stop, lease expires after TTL (10 seconds default)
4. **Cleanup**: All keys associated with the lease are automatically deleted
**Automatic Recovery:** # Use ZMQ
export DYN_EVENT_PLANE=zmq
- Reconnection with exponential backoff (50ms to 5s)
- Deadline-based retry logic
- Cancellation token propagation
### Service Discovery
Endpoints are registered in etcd for dynamic discovery:
**Key Format:**
```
/services/{namespace}/{component}/{endpoint}/{instance_id}
``` ```
**Example:** Python components also accept this as a CLI flag:
```
/services/vllm-agg/backend/generate/694d98147d54be25
```
**Registration Data:** ```bash
```json # vLLM backend
{ python3 -m dynamo.vllm --event-plane zmq --model Qwen/Qwen3-0.6B
"namespace": "vllm-agg",
"component": "backend",
"endpoint": "generate",
"instance_id": 7587888160958628000,
"transport": {
"tcp": "192.168.1.10:9999"
}
}
```
### Discovery Queries
The discovery system supports multiple query patterns:
| Query Type | Pattern | Use Case |
|------------|---------|----------|
| `AllEndpoints` | `/services/` | List all services |
| `NamespacedEndpoints` | `/services/{namespace}/` | Filter by namespace |
| `ComponentEndpoints` | `/services/{namespace}/{component}/` | Filter by component |
| `Endpoint` | `/services/{namespace}/{component}/{endpoint}/` | Specific endpoint |
### Watch Functionality
Clients watch etcd prefixes for real-time updates:
```python
# Client watches for endpoint changes
watcher = etcd.watch_prefix("/services/vllm-agg/backend/generate/")
for event in watcher:
if event.type == "PUT":
# New endpoint registered
add_endpoint(event.value)
elif event.type == "DELETE":
# Endpoint removed (worker died)
remove_endpoint(event.key)
```
**Watch Features:**
- Initial state retrieval with `get_and_watch_prefix()`
- Automatic reconnection on stream failure
- Revision tracking for no-event-loss guarantees
- Event types: `PUT` (create/update) and `DELETE`
### Distributed Locks
etcd provides distributed locking for coordination:
**Lock Types:**
| Type | Key Pattern | Behavior |
|------|-------------|----------|
| Write Lock | `v1/{prefix}/writer` | Exclusive (no readers/writers) |
| Read Lock | `v1/{prefix}/readers/{id}` | Shared (multiple readers) |
**Operations:**
```rust
// Non-blocking write lock
let lock = client.try_write_lock("my_resource").await?;
// Blocking read lock with polling (100ms intervals) # SGLang backend
let lock = client.read_lock_with_wait("my_resource").await?; python3 -m dynamo.sglang --event-plane zmq --model Qwen/Qwen3-0.6B
``` ```
## NATS Architecture ### Environment Variables
### When NATS is Used
NATS is used for:
1. **KV Cache Events**: Real-time KV cache state updates for routing
2. **Router Replica Sync**: Synchronizing router state across replicas
3. **Legacy Request Plane**: NATS-based request transport (optional)
### Configuration
| Variable | Description | Default | | Variable | Description | Default |
|----------|-------------|---------| |----------|-------------|---------|
| `NATS_SERVER` | NATS server URL | `nats://localhost:4222` | | `DYN_EVENT_PLANE` | Transport: `nats` or `zmq` | `nats` |
| `NATS_SERVER` | NATS server URL (NATS transport only) | `nats://localhost:4222` |
### Disabling NATS
For deployments without KV-aware routing: ## NATS Transport
```bash When using NATS (`DYN_EVENT_PLANE=nats` or unset):
# Disable NATS and KV events
python -m dynamo.frontend --no-kv-events
```
This enables "approximate mode" for KV routing without event persistence. - Requires a running NATS server. Set `NATS_SERVER` if it is not on `localhost:4222`.
- Events are published to NATS subjects scoped by namespace and component.
- Built-in reconnection and message buffering during brief disconnections.
### Event Publishing Example setup:
Components publish events to NATS subjects:
```rust
pub trait EventPublisher {
async fn publish(&self, event: &str, data: &[u8]) -> Result<()>;
async fn publish_serialized<T: Serialize>(&self, event: &str, data: &T) -> Result<()>;
}
```
**Subject Naming:**
```
{base_subject}.{event_name}
```
Example:
```
vllm-agg.backend.kv_cache_update
```
### Event Subscription
Components subscribe to events:
```rust
pub trait EventSubscriber {
async fn subscribe(&self, topic: &str) -> Result<Subscriber>;
async fn subscribe_typed<T: DeserializeOwned>(&self, topic: &str) -> Result<TypedSubscriber<T>>;
}
```
### JetStream Persistence
For durable event delivery, NATS JetStream provides:
- Message persistence
- Replay from offset
- Consumer groups for load balancing
- Acknowledgment tracking
## Key-Value Store Abstraction
Dynamo provides a unified KV store interface supporting multiple backends:
### Supported Backends
| Backend | Use Case | Configuration |
|---------|----------|---------------|
| `EtcdStore` | Production deployments | `ETCD_ENDPOINTS` |
| `MemoryStore` | Testing, development | Default |
| `NatsStore` | NATS-only deployments | `NATS_SERVER` |
| `FileStore` | Local persistence | File path |
### Store Interface
```rust
pub trait KvStore {
async fn get(&self, bucket: &str, key: &str) -> Result<Option<Vec<u8>>>;
async fn put(&self, bucket: &str, key: &str, value: &[u8]) -> Result<()>;
async fn delete(&self, bucket: &str, key: &str) -> Result<()>;
async fn watch(&self, bucket: &str) -> Result<WatchStream>;
}
```
### Buckets
Data is organized into logical buckets:
| Bucket | Purpose |
|--------|---------|
| `v1/instances` | Endpoint instance registry |
| `v1/mdc` | Model deployment cards |
## Typed Prefix Watcher
For type-safe watching of etcd prefixes:
```rust
// Watch and maintain HashMap of deserialized values
let watcher = watch_prefix_with_extraction::<DiscoveryInstance>(
&etcd_client,
"/services/vllm-agg/",
lease_id_extractor,
value_extractor,
).await?;
// Receive updates via watch channel
let instances = watcher.borrow();
```
**Key Extractors:**
| Extractor | Description |
|-----------|-------------|
| `lease_id()` | Use lease ID as key |
| `key_string()` | Extract key with prefix stripping |
| `full_key_string()` | Use full etcd key |
## Reliability Features
### Connection Resilience
**etcd Reconnection:**
- Exponential backoff: 50ms to 5s
- Deadline-based retry logic
- Mutex ensures single concurrent reconnect
**NATS Reconnection:**
- Built-in reconnection in NATS client
- Configurable max reconnect attempts
- Buffering during disconnection
### Lease-Based Cleanup
When a worker crashes or loses connectivity:
1. Keep-alive heartbeats stop
2. Lease expires after TTL (10 seconds)
3. All registered endpoints automatically deleted
4. Clients receive DELETE watch events
5. Traffic reroutes to healthy workers
### Transaction Safety
etcd transactions ensure atomic operations:
```rust
// Atomic create-if-not-exists
let txn = Txn::new()
.when([Compare::create_revision(key, CompareOp::Equal, 0)])
.and_then([Op::put(key, value, options)]);
etcd_client.txn(txn).await?;
```
This prevents race conditions in concurrent service registration.
## Operational Modes
### Kubernetes Mode (Requires Explicit Configuration)
Native Kubernetes service discovery:
```bash ```bash
# Operator explicitly sets this (not auto-detected): export NATS_SERVER=nats://nats-server:4222
export DYN_DISCOVERY_BACKEND=kubernetes export DYN_EVENT_PLANE=nats
# Workers register via K8s CRDs # Start workers -- they publish events to NATS automatically
python -m dynamo.vllm --model Qwen/Qwen3-0.6B python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
# Frontend discovers workers via K8s API # Start frontend -- it subscribes to events from NATS automatically
python -m dynamo.frontend python3 -m dynamo.frontend --router-mode kv
``` ```
No etcd or NATS required for basic operation when using K8s discovery. ## ZMQ Transport
### KV Store Mode (Global Default) When using ZMQ (`DYN_EVENT_PLANE=zmq`):
Full service discovery with etcd: - No external server required. Each worker binds a ZMQ PUB socket and advertises its address through the discovery system.
- Subscribers automatically discover and connect to all active publishers.
- When publishers come and go (e.g., workers scaling up/down), subscribers dynamically adjust their connections.
Example setup:
```bash ```bash
# This is the default - no configuration needed export DYN_EVENT_PLANE=zmq
# export DYN_DISCOVERY_BACKEND=kv_store # (implicit)
# Workers register with etcd # Start workers -- each binds a ZMQ socket, registers with discovery
python -m dynamo.vllm --model Qwen/Qwen3-0.6B python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B
# Frontend discovers workers via etcd # Start frontend -- discovers workers and connects directly
python -m dynamo.frontend python3 -m dynamo.frontend --router-mode kv
``` ```
### KV-Aware Routing (Optional) ## Disabling the Event Plane
Enable NATS for KV cache event tracking: If you do not need event-driven KV cache tracking, you can disable the event plane entirely and let the router estimate cache state instead:
```bash ```bash
# Default: KV events enabled (requires NATS) python3 -m dynamo.frontend --router-mode kv --no-kv-events
python -m dynamo.frontend --router-mode kv
# Disable KV events for prediction-based routing (no NATS)
python -m dynamo.frontend --router-mode kv --no-kv-events
``` ```
With `--no-kv-events`: With `--no-kv-events`:
- Router predicts cache state based on routing decisions
- TTL-based expiration and LRU pruning
- No NATS infrastructure required
## Best Practices
### 1. Use Kubernetes Discovery on K8s - The router falls back to prediction-based cache-aware routing (estimates cache state from routing decisions).
- No NATS server or ZMQ sockets are needed.
- TTL-based expiration and LRU pruning keep predicted state from growing stale.
The Dynamo operator automatically sets `DYN_DISCOVERY_BACKEND=kubernetes` for pods. No additional setup required when using the operator. ## Deployment Modes
### 2. For Bare Metal: Deploy etcd Cluster ### Bare Metal / Local
For bare-metal production deployments, deploy a 3-node etcd cluster for high availability. Both transports work out of the box:
### 3. Configure Appropriate TTLs (etcd mode)
Balance between detection speed and overhead:
- **Short TTL (5s)**: Faster failure detection, more keep-alive traffic
- **Long TTL (30s)**: Less overhead, slower detection
### 4. KV Routing Without NATS
For simpler deployments without NATS:
```bash ```bash
# Use prediction-based KV routing # NATS (requires nats-server running)
python -m dynamo.frontend --router-mode kv --no-kv-events export NATS_SERVER=nats://localhost:4222
# OR ZMQ (no extra infrastructure)
export DYN_EVENT_PLANE=zmq
``` ```
This provides KV-aware routing with reduced accuracy but no NATS dependency. ### Kubernetes (with Dynamo Operator)
The operator can inject `DYN_EVENT_PLANE` into pods. The same transport options apply. If using NATS, deploy a NATS server in the cluster and set `NATS_SERVER` accordingly.
## Related Documentation ## Related Documentation
- [Distributed Runtime](distributed-runtime.md) - Runtime architecture - [Discovery Plane](discovery-plane.md) -- Service discovery and coordination (etcd, Kubernetes)
- [Request Plane](request-plane.md) - Request transport configuration - [Distributed Runtime](distributed-runtime.md) -- Runtime architecture
- [Fault Tolerance](../fault-tolerance/README.md) - Failure handling - [Request Plane](request-plane.md) -- Request transport configuration
- [Fault Tolerance](../fault-tolerance/README.md) -- Failure handling
...@@ -128,4 +128,5 @@ See [Fault Tolerance Testing](testing.md) for details. ...@@ -128,4 +128,5 @@ See [Fault Tolerance Testing](testing.md) for details.
- [Observability](../observability/README.md) - Metrics and monitoring - [Observability](../observability/README.md) - Metrics and monitoring
- [Distributed Runtime](../design-docs/distributed-runtime.md) - Service discovery architecture - [Distributed Runtime](../design-docs/distributed-runtime.md) - Service discovery architecture
- [Event Plane](../design-docs/event-plane.md) - etcd and NATS coordination - [Event Plane](../design-docs/event-plane.md) - Pub/sub for KV cache events and worker metrics
- [Discovery Plane](../design-docs/discovery-plane.md) - Service discovery and coordination
...@@ -194,6 +194,8 @@ navigation: ...@@ -194,6 +194,8 @@ navigation:
path: ../pages/design-docs/disagg-serving.md path: ../pages/design-docs/disagg-serving.md
- page: Distributed Runtime - page: Distributed Runtime
path: ../pages/design-docs/distributed-runtime.md path: ../pages/design-docs/distributed-runtime.md
- page: Discovery Plane
path: ../pages/design-docs/discovery-plane.md
- page: Request Plane - page: Request Plane
path: ../pages/design-docs/request-plane.md path: ../pages/design-docs/request-plane.md
- page: Event Plane - page: Event Plane
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment