feat: adding kvbm-engine (#6773)

Signed-off-by: Ryan Olson <rolson@nvidia.com>

feat: adding kvbm-engine (#6773)
Signed-off-by: Ryan Olson <rolson@nvidia.com>
008683d6 · Ryan Olson · GitHub · cf79c4fc · 008683d6 · 008683d6
Unverified Commit 008683d6 authored Apr 08, 2026 by Ryan Olson Committed by GitHub Apr 08, 2026
20 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,8 @@ members = [
    "lib/kv-router",
    "lib/memory",
    "lib/kvbm-common",
+    "lib/kvbm-config",
+    "lib/kvbm-engine",
    "lib/kvbm-kernels",
    "lib/kvbm-logical",
    "lib/kvbm-physical",
@@ -19,9 +21,6 @@ members = [
    "lib/bench",
    "lib/bindings/c",
    "lib/bindings/python/codegen",
-    "lib/velo-common",
-    "lib/velo-transports",
-    "lib/velo-events",
 ]
 resolver = "3"
@@ -48,16 +47,17 @@ dynamo-protocols = { path = "lib/protocols", version = "1.0.0" }
 dynamo-parsers = { path = "lib/parsers", version = "1.0.0" }
 fastokens = { version = "0.1.0" }
 # kvbm
-kvbm-common = { path = "lib/kvbm-common", version = "0.1.0" }
+kvbm-common = { path = "lib/kvbm-common", version = "1.0.0" }
-kvbm-kernels = { path = "lib/kvbm-kernels", version = "0.1.0" }
+kvbm-config = { path = "lib/kvbm-config", version = "1.0.0" }
-kvbm-logical = { path = "lib/kvbm-logical", version = "0.1.0" }
+kvbm-engine = { path = "lib/kvbm-engine", version = "1.0.0" }
-kvbm-physical = { path = "lib/kvbm-physical", version = "0.1.0" }
+kvbm-kernels = { path = "lib/kvbm-kernels", version = "1.0.0" }
+kvbm-logical = { path = "lib/kvbm-logical", version = "1.0.0" }
+kvbm-physical = { path = "lib/kvbm-physical", version = "1.0.0" }
 # velo
-velo-common = { path = "lib/velo-common", version = "0.1.0" }
+velo = { version = "0.1.0" }
-velo-transports = { path = "lib/velo-transports", version = "0.1.0" }
-velo-events = { path = "lib/velo-events", version = "0.1.0" }
 # External dependencies
 anyhow = { version = "1" }

--- a/lib/bindings/kvbm/Cargo.lock
+++ b/lib/bindings/kvbm/Cargo.lock
@@ -1249,19 +1249,6 @@ dependencies = [
 "serde",
 ]
-[[package]]
-name = "dashmap"
-version = "5.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
-dependencies = [
- "cfg-if",
- "hashbrown 0.14.5",
- "lock_api",
- "once_cell",
- "parking_lot_core",
-]
 [[package]]
 name = "dashmap"
 version = "6.1.0"
@@ -1523,7 +1510,7 @@ version = "1.0.0"
 dependencies = [
 "anyhow",
 "async-trait",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dynamo-runtime",
@@ -1566,10 +1553,11 @@ dependencies = [
 "bytes",
 "chrono",
 "cudarc",
- "dashmap 5.5.3",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dialoguer",
+ "dynamo-config",
 "dynamo-kv-router",
 "dynamo-memory",
 "dynamo-mocker",
@@ -1636,6 +1624,7 @@ dependencies = [
 "anyhow",
 "cudarc",
 "libc",
+ "libloading 0.8.9",
 "nix 0.30.1",
 "nixl-sys",
 "offset-allocator",
@@ -1649,7 +1638,7 @@ name = "dynamo-mocker"
 version = "1.0.0"
 dependencies = [
 "anyhow",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dynamo-kv-router",
@@ -1718,7 +1707,7 @@ dependencies = [
 "blake3",
 "bytes",
 "chrono",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dynamo-config",
@@ -1775,7 +1764,7 @@ version = "1.0.0"
 dependencies = [
 "bs58",
 "bytemuck",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "serde",
 "thiserror 2.0.18",
@@ -2014,9 +2003,9 @@ dependencies = [
 [[package]]
 name = "fastrand"
-version = "2.4.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
 dependencies = [
 "getrandom 0.3.4",
 ]

--- a/lib/bindings/python/Cargo.lock
+++ b/lib/bindings/python/Cargo.lock
@@ -1267,19 +1267,6 @@ dependencies = [
 "serde",
 ]
-[[package]]
-name = "dashmap"
-version = "5.5.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
-dependencies = [
- "cfg-if",
- "hashbrown 0.14.5",
- "lock_api",
- "once_cell",
- "parking_lot_core",
-]
 [[package]]
 name = "dashmap"
 version = "6.1.0"
@@ -1532,7 +1519,7 @@ dependencies = [
 "anyhow",
 "async-trait",
 "axum",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dynamo-runtime",
@@ -1578,10 +1565,11 @@ dependencies = [
 "bytes",
 "chrono",
 "cudarc",
- "dashmap 5.5.3",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dialoguer",
+ "dynamo-config",
 "dynamo-kv-router",
 "dynamo-memory",
 "dynamo-mocker",
@@ -1651,6 +1639,7 @@ dependencies = [
 "anyhow",
 "cudarc",
 "libc",
+ "libloading 0.8.9",
 "nix 0.30.1",
 "nixl-sys",
 "offset-allocator",
@@ -1664,7 +1653,7 @@ name = "dynamo-mocker"
 version = "1.0.0"
 dependencies = [
 "anyhow",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dynamo-kv-router",
@@ -1725,7 +1714,7 @@ dependencies = [
 "anyhow",
 "async-trait",
 "clap",
- "dashmap 6.1.0",
+ "dashmap",
 "dynamo-kv-router",
 "dynamo-llm",
 "dynamo-mocker",
@@ -1765,7 +1754,7 @@ dependencies = [
 "bytes",
 "chrono",
 "cudarc",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "derive_builder",
 "dynamo-config",
@@ -1822,7 +1811,7 @@ version = "1.0.0"
 dependencies = [
 "bs58",
 "bytemuck",
- "dashmap 6.1.0",
+ "dashmap",
 "derive-getters",
 "serde",
 "thiserror 2.0.18",
@@ -2061,9 +2050,9 @@ dependencies = [
 [[package]]
 name = "fastrand"
-version = "2.4.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
 dependencies = [
 "getrandom 0.3.4",
 ]

--- a/lib/kvbm-common/Cargo.toml
+++ b/lib/kvbm-common/Cargo.toml
@@ -3,7 +3,7 @@
 [package]
 name = "kvbm-common"
-version = "0.1.0"
+version = "1.0.0"
 edition.workspace = true
 description.workspace = true
 authors.workspace = true

--- a/lib/kvbm-config/Cargo.toml
+++ b/lib/kvbm-config/Cargo.toml
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+[package]
+name = "kvbm-config"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+description = "KVBM Configuration Library for Tokio, Rayon, and Messenger runtimes"
+# Both features are opt-in (`default` is empty); each one only enables the
+# optional dependency of the same name.
+[features]
+default = []
+rayon = ["dep:rayon"]
+nvtx = ["dep:nvtx"]
+[dependencies]
+anyhow = { workspace = true }
+# figment and nix are pinned here rather than inherited from the workspace.
+figment = { version = "0.10", features = ["env", "toml", "json"] }
+nix = { version = "0.30.1", features = ["net"] }
+serde = { workspace = true }
+thiserror = { workspace = true }
+tokio = { workspace = true }
+tracing = { workspace = true }
+validator = { workspace = true }
+# Optional dependencies
+rayon = { version = "1.10", optional = true }
+nvtx = { version = "1.3", optional = true }
+# Velo dependencies
+velo = { workspace = true }
+# Memory dependencies (for NixL)
+dynamo-memory = { workspace = true }
+# Tests additionally enable tokio's multi-threaded runtime and macros.
+[dev-dependencies]
+serde_json = { workspace = true }
+temp-env = { version = "0.3.6" }
+tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
--- a/lib/kvbm-config/src/cache.rs
+++ b/lib/kvbm-config/src/cache.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Cache tier configuration for KVBM.
+//!
+//! Defines configuration for G2 (host/pinned memory) and G3 (disk) cache tiers,
+//! as well as the parallelism mode for distributed workers.
+//!
+//! The leader uses this configuration to coordinate cache tier creation on workers.
+use std::path::PathBuf;
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+/// Parallelism strategy for KV cache across workers.
+///
+/// This determines how KV blocks are distributed and transferred across
+/// multiple workers in a distributed inference setup.
+///
+/// On the wire the variants are spelled in snake_case
+/// (`"tensor_parallel"` / `"replicated_data"`), and the `Default` value is
+/// [`ParallelismMode::TensorParallel`].
+#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ParallelismMode {
+    /// Tensor parallel: each worker has a shard of each KV block.
+    ///
+    /// This is the standard approach for tensor-parallel inference where
+    /// attention heads are split across workers. Each worker stores and
+    /// transfers only its portion of each KV block.
+    ///
+    /// All workers have G1, G2, and G3 tiers. Operations execute on all
+    /// workers simultaneously (SPMD).
+    #[default]
+    TensorParallel,
+    /// Replicated data: all workers have full KV blocks (MLA scenario).
+    ///
+    /// In MLA (Multi-head Latent Attention) architectures, KV blocks are
+    /// replicated rather than sharded. Only rank 0 has G2/G3 storage;
+    /// data is broadcast to other ranks after loading to G1.
+    ///
+    /// This reduces storage requirements on non-rank-0 workers and is
+    /// suitable when the model's KV representation is the same across
+    /// all attention heads.
+    ReplicatedData,
+}
+/// Host cache configuration (G2 tier - pinned CPU memory).
+///
+/// The host cache provides a staging area for KV blocks between GPU and disk.
+/// Memory is allocated as pinned (page-locked) for efficient DMA transfers.
+///
+/// Both fields are optional; the derived `Default` leaves both as `None`.
+// NOTE(review): `Validate` is derived but no field carries a `#[validate(...)]`
+// rule, so validation accepts any value here (including a negative or
+// non-finite `cache_size_gb`) — confirm that is intended.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate, Default)]
+pub struct HostCacheConfig {
+    /// Cache size in gigabytes.
+    /// Used to compute num_blocks if not explicitly set.
+    pub cache_size_gb: Option<f64>,
+    /// Explicit number of blocks for the host cache.
+    /// Takes priority over cache_size_gb if set.
+    pub num_blocks: Option<usize>,
+}
+impl HostCacheConfig {
+    /// Compute the number of blocks based on configuration and block size.
+    ///
+    /// Priority: explicit num_blocks > computed from cache_size_gb
+    ///
+    /// # Arguments
+    /// * `bytes_per_block` - Size of each block in bytes
+    ///
+    /// # Returns
+    /// Number of blocks, or None if neither num_blocks nor cache_size_gb is set,
+    /// or if bytes_per_block is zero.
+    ///
+    /// The size-derived count is rounded down to a whole number of blocks.
+    pub fn compute_num_blocks(&self, bytes_per_block: usize) -> Option<usize> {
+        // The zero check runs first, so an explicit `num_blocks` is also
+        // discarded when the block size is zero.
+        if bytes_per_block == 0 {
+            return None;
+        }
+        self.num_blocks.or_else(|| {
+            self.cache_size_gb.map(|gb| {
+                // Convert GB to bytes and divide by block size.
+                // "GB" is decimal here (1 GB = 10^9 bytes). The float-to-usize
+                // `as` cast truncates toward zero and saturates, so a negative
+                // or NaN size produces 0 blocks rather than an error.
+                ((gb * 1_000_000_000.0) / bytes_per_block as f64) as usize
+            })
+        })
+    }
+    /// Check if host cache is enabled (has any configuration).
+    ///
+    /// Only the presence of a setting is checked, not its value: a configured
+    /// `num_blocks` of 0 still counts as enabled.
+    pub fn is_enabled(&self) -> bool {
+        self.num_blocks.is_some() || self.cache_size_gb.is_some()
+    }
+}
+/// Disk cache configuration (G3 tier - persistent storage).
+///
+/// The disk cache provides extended capacity for KV blocks beyond GPU and host memory.
+/// Can use either GPU Direct Storage (GDS) for direct GPU-disk transfers or POSIX
+/// for regular file I/O.
+///
+/// The derived `Default` leaves every optional field as `None` and `use_gds`
+/// as `false`.
+// NOTE(review): `Validate` is derived but no field carries a `#[validate(...)]`
+// rule, so validation accepts any value here — confirm that is intended.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate, Default)]
+pub struct DiskCacheConfig {
+    /// Cache size in gigabytes.
+    /// Used to compute num_blocks if not explicitly set.
+    pub cache_size_gb: Option<f64>,
+    /// Explicit number of blocks for the disk cache.
+    /// Takes priority over cache_size_gb if set.
+    pub num_blocks: Option<usize>,
+    /// Use GPU Direct Storage (GDS) if available.
+    /// When true, enables GDS_MT backend for direct GPU-disk transfers.
+    /// When false or GDS unavailable, falls back to POSIX backend.
+    /// Omitting the key during deserialization yields `false`.
+    #[serde(default)]
+    pub use_gds: bool,
+    /// Storage path for disk cache files.
+    /// If None, a default path will be used.
+    pub storage_path: Option<PathBuf>,
+}
+impl DiskCacheConfig {
+    /// Compute the number of blocks based on configuration and block size.
+    ///
+    /// Priority: explicit num_blocks > computed from cache_size_gb
+    ///
+    /// # Arguments
+    /// * `bytes_per_block` - Size of each block in bytes
+    ///
+    /// # Returns
+    /// Number of blocks, or None if neither num_blocks nor cache_size_gb is set,
+    /// or if bytes_per_block is zero.
+    ///
+    /// The size-derived count is rounded down to a whole number of blocks.
+    /// This is the same computation as `HostCacheConfig::compute_num_blocks`.
+    pub fn compute_num_blocks(&self, bytes_per_block: usize) -> Option<usize> {
+        // The zero check runs first, so an explicit `num_blocks` is also
+        // discarded when the block size is zero.
+        if bytes_per_block == 0 {
+            return None;
+        }
+        self.num_blocks.or_else(|| {
+            self.cache_size_gb.map(|gb| {
+                // Convert GB to bytes and divide by block size.
+                // "GB" is decimal here (1 GB = 10^9 bytes). The float-to-usize
+                // `as` cast truncates toward zero and saturates, so a negative
+                // or NaN size produces 0 blocks rather than an error.
+                ((gb * 1_000_000_000.0) / bytes_per_block as f64) as usize
+            })
+        })
+    }
+    /// Check if disk cache is enabled (has any configuration).
+    ///
+    /// Only `num_blocks` and `cache_size_gb` are consulted; `use_gds` and
+    /// `storage_path` on their own do not enable the tier.
+    pub fn is_enabled(&self) -> bool {
+        self.num_blocks.is_some() || self.cache_size_gb.is_some()
+    }
+}
+/// Top-level cache configuration.
+///
+/// Groups host (G2) and disk (G3) cache configurations together,
+/// plus the parallelism mode for distributed workers.
+///
+/// Use Figment profiles to configure different cache settings for leader vs worker.
+///
+/// The derived `Default` is: host cache with nothing configured, no disk
+/// cache, and `ParallelismMode::TensorParallel`.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
+pub struct CacheConfig {
+    /// Host cache (G2 tier) - pinned CPU memory.
+    /// Falls back to `HostCacheConfig::default()` when the key is omitted.
+    #[serde(default)]
+    #[validate(nested)]
+    pub host: HostCacheConfig,
+    /// Disk cache (G3 tier) - persistent storage.
+    /// Optional - only configure if disk caching is needed.
+    #[validate(nested)]
+    pub disk: Option<DiskCacheConfig>,
+    /// Parallelism mode for distributed workers.
+    ///
+    /// - `TensorParallel` (default): Each worker has a shard of each KV block
+    /// - `ReplicatedData`: Only rank 0 has G2/G3; data is broadcast on load
+    ///
+    /// Can be set via env var: `KVBM_CACHE_PARALLELISM=tensor_parallel|replicated_data`
+    #[serde(default)]
+    pub parallelism: ParallelismMode,
+}
+// Unit tests for the cache tier configuration: struct defaults, the
+// num_blocks-over-cache_size_gb priority, and ParallelismMode serde names.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // A default host cache has nothing configured and reports disabled.
+    #[test]
+    fn test_host_cache_default() {
+        let config = HostCacheConfig::default();
+        assert!(config.cache_size_gb.is_none());
+        assert!(config.num_blocks.is_none());
+        assert!(!config.is_enabled());
+    }
+    // When both settings are present, the explicit block count wins.
+    #[test]
+    fn test_host_cache_explicit_blocks() {
+        let config = HostCacheConfig {
+            num_blocks: Some(1000),
+            cache_size_gb: Some(10.0), // Should be ignored
+        };
+        // With 1MB blocks, explicit num_blocks takes priority
+        let bytes_per_block = 1_000_000;
+        assert_eq!(config.compute_num_blocks(bytes_per_block), Some(1000));
+        assert!(config.is_enabled());
+    }
+    // Without an explicit count, the block count is derived from the size
+    // using decimal units (10 * 10^9 bytes / 10^6 bytes per block).
+    #[test]
+    fn test_host_cache_from_size_gb() {
+        let config = HostCacheConfig {
+            num_blocks: None,
+            cache_size_gb: Some(10.0), // 10 GB
+        };
+        // With 1MB blocks: 10GB / 1MB = 10,000 blocks
+        let bytes_per_block = 1_000_000;
+        assert_eq!(config.compute_num_blocks(bytes_per_block), Some(10_000));
+        assert!(config.is_enabled());
+    }
+    // A default disk cache has nothing configured, GDS off, and is disabled.
+    #[test]
+    fn test_disk_cache_default() {
+        let config = DiskCacheConfig::default();
+        assert!(config.cache_size_gb.is_none());
+        assert!(config.num_blocks.is_none());
+        assert!(!config.use_gds);
+        assert!(config.storage_path.is_none());
+        assert!(!config.is_enabled());
+    }
+    // Field values round-trip through a hand-built struct; enabled because
+    // num_blocks is set.
+    #[test]
+    fn test_disk_cache_with_gds() {
+        let config = DiskCacheConfig {
+            num_blocks: Some(5000),
+            cache_size_gb: None,
+            use_gds: true,
+            storage_path: Some(PathBuf::from("/mnt/nvme/kv_cache")),
+        };
+        assert!(config.use_gds);
+        assert_eq!(
+            config.storage_path,
+            Some(PathBuf::from("/mnt/nvme/kv_cache"))
+        );
+        assert!(config.is_enabled());
+    }
+    #[test]
+    fn test_parallelism_mode_default() {
+        let mode = ParallelismMode::default();
+        assert_eq!(mode, ParallelismMode::TensorParallel);
+    }
+    // Pins the snake_case wire names in both directions.
+    #[test]
+    fn test_parallelism_mode_serde() {
+        // Test serialization
+        let tp = ParallelismMode::TensorParallel;
+        let json = serde_json::to_string(&tp).unwrap();
+        assert_eq!(json, "\"tensor_parallel\"");
+        let rd = ParallelismMode::ReplicatedData;
+        let json = serde_json::to_string(&rd).unwrap();
+        assert_eq!(json, "\"replicated_data\"");
+        // Test deserialization
+        let mode: ParallelismMode = serde_json::from_str("\"tensor_parallel\"").unwrap();
+        assert_eq!(mode, ParallelismMode::TensorParallel);
+        let mode: ParallelismMode = serde_json::from_str("\"replicated_data\"").unwrap();
+        assert_eq!(mode, ParallelismMode::ReplicatedData);
+    }
+    #[test]
+    fn test_cache_config_with_parallelism() {
+        let config = CacheConfig {
+            host: HostCacheConfig::default(),
+            disk: None,
+            parallelism: ParallelismMode::ReplicatedData,
+        };
+        assert_eq!(config.parallelism, ParallelismMode::ReplicatedData);
+    }
+    #[test]
+    fn test_cache_config_default_parallelism() {
+        let config = CacheConfig::default();
+        assert_eq!(config.parallelism, ParallelismMode::TensorParallel);
+    }
+}
--- a/lib/kvbm-config/src/discovery.rs
+++ b/lib/kvbm-config/src/discovery.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Discovery configuration for Nova peer discovery.
+//!
+//! Supports three discovery backends:
+//! - **Etcd**: Centralized discovery using etcd key-value store
+//! - **P2P**: Decentralized discovery using libp2p DHT
+//! - **Filesystem**: File-based discovery for development/testing
+use std::path::PathBuf;
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+/// Discovery configuration - only one type can be active at a time.
+///
+/// The enum is internally tagged: the `"type"` key selects the variant using
+/// its lowercase name (`"etcd"`, `"p2p"`, `"filesystem"`), and the remaining
+/// keys are the fields of that variant's config struct.
+///
+/// # JSON Configuration Examples
+///
+/// ## Etcd Discovery
+/// ```json
+/// {
+///   "type": "etcd",
+///   "cluster_id": "my-cluster",
+///   "endpoints": ["http://etcd1:2379", "http://etcd2:2379"],
+///   "ttl_secs": 60
+/// }
+/// ```
+///
+/// ## P2P Discovery
+/// ```json
+/// {
+///   "type": "p2p",
+///   "cluster_id": "my-cluster",
+///   "listen_port": 0,
+///   "bootstrap_peers": ["192.168.1.10:4001"],
+///   "enable_mdns": true
+/// }
+/// ```
+///
+/// ## Filesystem Discovery
+/// ```json
+/// {
+///   "type": "filesystem",
+///   "path": "/tmp/discovery.json"
+/// }
+/// ```
+// NOTE(review): this enum does not derive `Validate`, so the range rules
+// declared on the inner config structs only run if a caller invokes
+// `validate()` on the variant payload itself — confirm that happens.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "lowercase")]
+pub enum DiscoveryConfig {
+    /// Etcd-based discovery (centralized).
+    Etcd(EtcdDiscoveryConfig),
+    /// P2P discovery using libp2p DHT (decentralized).
+    P2p(P2pDiscoveryConfig),
+    /// Filesystem-based discovery (for dev/testing).
+    Filesystem(FilesystemDiscoveryConfig),
+}
+/// Etcd discovery configuration.
+///
+/// Every field except `cluster_id` has a serde default, so a minimal config
+/// only needs to supply the cluster ID.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct EtcdDiscoveryConfig {
+    /// Cluster ID / key prefix for discovery (required).
+    pub cluster_id: String,
+    /// Etcd endpoints (default: ["http://localhost:2379"]).
+    #[serde(default = "default_etcd_endpoints")]
+    pub endpoints: Vec<String>,
+    /// Lease TTL in seconds (default: 60, range: 10-600).
+    #[serde(default = "default_etcd_ttl")]
+    #[validate(range(min = 10, max = 600))]
+    pub ttl_secs: u64,
+    /// Operation timeout in seconds (default: 30).
+    // No range rule is attached, so any u64 (including 0) passes validation.
+    #[serde(default = "default_operation_timeout")]
+    pub operation_timeout_secs: u64,
+    /// Max retries for operations (default: 3).
+    // NOTE(review): `min = 0` can never fail for a `u32`; only the upper
+    // bound of 10 is effective.
+    #[serde(default = "default_max_retries")]
+    #[validate(range(min = 0, max = 10))]
+    pub max_retries: u32,
+}
+/// P2P discovery configuration.
+///
+/// Every field except `cluster_id` has a serde default.
+// NOTE(review): `Validate` is derived but no field carries a
+// `#[validate(...)]` rule, so validation currently accepts any values.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct P2pDiscoveryConfig {
+    /// Cluster ID / swarm key (required).
+    pub cluster_id: String,
+    /// Listen port (default: 0 = OS-assigned).
+    #[serde(default)]
+    pub listen_port: u16,
+    /// Bootstrap peer addresses.
+    /// Defaults to an empty list when omitted.
+    #[serde(default)]
+    pub bootstrap_peers: Vec<String>,
+    /// DHT replication factor (default: 3).
+    #[serde(default = "default_replication_factor")]
+    pub replication_factor: usize,
+    /// Enable mDNS for local network discovery (default: false).
+    #[serde(default)]
+    pub enable_mdns: bool,
+    /// Record TTL in seconds (default: 600).
+    #[serde(default = "default_record_ttl")]
+    pub record_ttl_secs: u64,
+}
+/// Filesystem discovery configuration.
+///
+/// Unlike the etcd and P2P configs, this struct has no `Default` impl and
+/// its single field has no serde default, so `path` must always be supplied.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FilesystemDiscoveryConfig {
+    /// Path to the discovery JSON file.
+    pub path: PathBuf,
+}
+// Serde default providers. They are free functions because
+// `#[serde(default = "...")]` takes a function path; the `Default` impls
+// in this module call the same functions.
+
+/// Default etcd endpoint list: a single local endpoint on port 2379.
+fn default_etcd_endpoints() -> Vec<String> {
+    vec!["http://localhost:2379".to_string()]
+}
+/// Default etcd lease TTL, in seconds.
+fn default_etcd_ttl() -> u64 {
+    60
+}
+/// Default etcd operation timeout, in seconds.
+fn default_operation_timeout() -> u64 {
+    30
+}
+/// Default maximum number of retries for etcd operations.
+fn default_max_retries() -> u32 {
+    3
+}
+/// Default DHT replication factor for P2P discovery.
+fn default_replication_factor() -> usize {
+    3
+}
+/// Default P2P record TTL, in seconds.
+fn default_record_ttl() -> u64 {
+    600
+}
+/// Builds the config from the same helper functions serde uses for missing
+/// keys, so `default()` and a minimal deserialized config agree.
+///
+/// `cluster_id` is left empty even though it is documented as required;
+/// callers are presumably expected to fill it in before use.
+impl Default for EtcdDiscoveryConfig {
+    fn default() -> Self {
+        Self {
+            cluster_id: String::new(),
+            endpoints: default_etcd_endpoints(),
+            ttl_secs: default_etcd_ttl(),
+            operation_timeout_secs: default_operation_timeout(),
+            max_retries: default_max_retries(),
+        }
+    }
+}
+/// Mirrors the serde defaults declared on the struct fields: port 0, no
+/// bootstrap peers, mDNS off, and the helper-provided replication factor
+/// and record TTL.
+///
+/// `cluster_id` is left empty even though it is documented as required;
+/// callers are presumably expected to fill it in before use.
+impl Default for P2pDiscoveryConfig {
+    fn default() -> Self {
+        Self {
+            cluster_id: String::new(),
+            listen_port: 0,
+            bootstrap_peers: Vec::new(),
+            replication_factor: default_replication_factor(),
+            enable_mdns: false,
+            record_ttl_secs: default_record_ttl(),
+        }
+    }
+}
+// Unit tests for discovery configuration: tagged-enum (de)serialization for
+// each backend, serde defaults for omitted keys, and the `Default` impls.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // Omitted keys (operation_timeout_secs, max_retries) take serde defaults.
+    #[test]
+    fn test_deserialize_etcd_config() {
+        let json = r#"{
+            "type": "etcd",
+            "cluster_id": "test-cluster",
+            "endpoints": ["http://etcd1:2379"],
+            "ttl_secs": 120
+        }"#;
+        let config: DiscoveryConfig = serde_json::from_str(json).unwrap();
+        match config {
+            DiscoveryConfig::Etcd(etcd) => {
+                assert_eq!(etcd.cluster_id, "test-cluster");
+                assert_eq!(etcd.endpoints, vec!["http://etcd1:2379"]);
+                assert_eq!(etcd.ttl_secs, 120);
+                assert_eq!(etcd.operation_timeout_secs, 30); // default
+                assert_eq!(etcd.max_retries, 3); // default
+            }
+            _ => panic!("Expected Etcd config"),
+        }
+    }
+    // Omitted keys (replication_factor, record_ttl_secs) take serde defaults.
+    #[test]
+    fn test_deserialize_p2p_config() {
+        let json = r#"{
+            "type": "p2p",
+            "cluster_id": "test-cluster",
+            "listen_port": 4001,
+            "bootstrap_peers": ["192.168.1.10:4001"],
+            "enable_mdns": true
+        }"#;
+        let config: DiscoveryConfig = serde_json::from_str(json).unwrap();
+        match config {
+            DiscoveryConfig::P2p(p2p) => {
+                assert_eq!(p2p.cluster_id, "test-cluster");
+                assert_eq!(p2p.listen_port, 4001);
+                assert_eq!(p2p.bootstrap_peers, vec!["192.168.1.10:4001"]);
+                assert!(p2p.enable_mdns);
+                assert_eq!(p2p.replication_factor, 3); // default
+                assert_eq!(p2p.record_ttl_secs, 600); // default
+            }
+            _ => panic!("Expected P2p config"),
+        }
+    }
+    #[test]
+    fn test_deserialize_filesystem_config() {
+        let json = r#"{
+            "type": "filesystem",
+            "path": "/tmp/discovery.json"
+        }"#;
+        let config: DiscoveryConfig = serde_json::from_str(json).unwrap();
+        match config {
+            DiscoveryConfig::Filesystem(fs) => {
+                assert_eq!(fs.path, PathBuf::from("/tmp/discovery.json"));
+            }
+            _ => panic!("Expected Filesystem config"),
+        }
+    }
+    // Serialization emits the lowercase `type` tag alongside the fields.
+    #[test]
+    fn test_serialize_etcd_config() {
+        let config = DiscoveryConfig::Etcd(EtcdDiscoveryConfig {
+            cluster_id: "my-cluster".to_string(),
+            endpoints: vec!["http://localhost:2379".to_string()],
+            ttl_secs: 60,
+            operation_timeout_secs: 30,
+            max_retries: 3,
+        });
+        let json = serde_json::to_string(&config).unwrap();
+        assert!(json.contains(r#""type":"etcd""#));
+        assert!(json.contains(r#""cluster_id":"my-cluster""#));
+    }
+    #[test]
+    fn test_etcd_default() {
+        let config = EtcdDiscoveryConfig::default();
+        assert!(config.cluster_id.is_empty());
+        assert_eq!(config.endpoints, vec!["http://localhost:2379"]);
+        assert_eq!(config.ttl_secs, 60);
+        assert_eq!(config.operation_timeout_secs, 30);
+        assert_eq!(config.max_retries, 3);
+    }
+    #[test]
+    fn test_p2p_default() {
+        let config = P2pDiscoveryConfig::default();
+        assert!(config.cluster_id.is_empty());
+        assert_eq!(config.listen_port, 0);
+        assert!(config.bootstrap_peers.is_empty());
+        assert_eq!(config.replication_factor, 3);
+        assert!(!config.enable_mdns);
+        assert_eq!(config.record_ttl_secs, 600);
+    }
+}
--- a/lib/kvbm-config/src/events.rs
+++ b/lib/kvbm-config/src/events.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Event publishing configuration for KV cache coordination.
+//!
+//! This module defines the configuration for the event publishing pipeline
+//! that broadcasts block registration/removal events to distributed consumers
+//! (e.g., KvbmHub for radix tree maintenance).
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+/// Configuration for event publishing.
+///
+/// Events are broadcast when blocks are registered or removed from the cache.
+/// The pipeline batches events for efficient wire transmission.
+///
+/// Every field has a serde default, so an empty object deserializes to the
+/// same values as `EventsConfig::default()`.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct EventsConfig {
+    /// Whether event publishing is enabled.
+    ///
+    /// When disabled, no events are emitted and no publisher is started.
+    /// Default: false
+    #[serde(default)]
+    pub enabled: bool,
+    /// Batching configuration for the event pipeline.
+    ///
+    /// Validated recursively via `#[validate(nested)]`.
+    #[serde(default)]
+    #[validate(nested)]
+    pub batching: BatchingConfig,
+    /// Broadcast channel capacity for the EventsManager.
+    ///
+    /// This determines how many events can be buffered before slow
+    /// subscribers start lagging. Default: 1024
+    ///
+    /// Validation accepts values from 16 to 65536 inclusive.
+    #[serde(default = "default_channel_capacity")]
+    #[validate(range(min = 16, max = 65536))]
+    pub channel_capacity: usize,
+    /// Subject/topic pattern for publishing events.
+    ///
+    /// This is the NATS/messaging subject where events are published.
+    /// Default: "kvbm.events"
+    #[serde(default = "default_subject")]
+    pub subject: String,
+    /// Event emission policy.
+    ///
+    /// Determines which blocks trigger events:
+    /// - `power_of_two`: Only emit for blocks at power-of-2 positions (default)
+    /// - `all`: Emit for all blocks (testing/debugging)
+    #[serde(default)]
+    pub policy: EventPolicyConfig,
+}
+/// Hand-written rather than derived so that `channel_capacity` and `subject`
+/// get the same non-zero/non-empty values serde uses for missing keys
+/// (a derived `Default` would give 0 and an empty string).
+impl Default for EventsConfig {
+    fn default() -> Self {
+        Self {
+            enabled: false,
+            batching: BatchingConfig::default(),
+            channel_capacity: default_channel_capacity(),
+            subject: default_subject(),
+            policy: EventPolicyConfig::default(),
+        }
+    }
+}
+/// Default broadcast channel capacity (number of buffered events).
+fn default_channel_capacity() -> usize {
+    1024
+}
+/// Default subject that events are published on.
+fn default_subject() -> String {
+    "kvbm.events".to_string()
+}
+/// Batching configuration for the event pipeline.
+///
+/// Events are batched before publishing to reduce wire traffic.
+/// Batches are flushed when:
+/// - The window duration expires
+/// - The max batch size is reached
+/// - The event type switches (Create -> Remove or vice versa)
+///
+/// Both fields have serde defaults and inclusive range validation.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct BatchingConfig {
+    /// Maximum time to wait before flushing a batch (in milliseconds).
+    ///
+    /// Default: 10ms
+    /// Validation accepts 1 to 10000 ms.
+    #[serde(default = "default_window_duration_ms")]
+    #[validate(range(min = 1, max = 10000))]
+    pub window_duration_ms: u64,
+    /// Maximum number of events in a batch before flushing.
+    ///
+    /// Default: 1024
+    /// Validation accepts 1 to 65536 events.
+    #[serde(default = "default_max_batch_size")]
+    #[validate(range(min = 1, max = 65536))]
+    pub max_batch_size: usize,
+}
+/// Hand-written so the defaults match the serde default helpers; a derived
+/// `Default` would yield zeros, which fall outside the validated ranges.
+impl Default for BatchingConfig {
+    fn default() -> Self {
+        Self {
+            window_duration_ms: default_window_duration_ms(),
+            max_batch_size: default_max_batch_size(),
+        }
+    }
+}
+/// Default batch flush window, in milliseconds.
+fn default_window_duration_ms() -> u64 {
+    10
+}
+/// Default maximum number of events per batch.
+fn default_max_batch_size() -> usize {
+    1024
+}
+/// Event emission policy configuration.
+///
+/// Serialized in snake_case (`"power_of_two"` / `"all"`); the `Default` is
+/// [`EventPolicyConfig::PowerOfTwo`].
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum EventPolicyConfig {
+    /// Emit events only for blocks at power-of-2 positions (default).
+    ///
+    /// This creates sparse sampling at positions 16, 32, 64, ..., 65536
+    /// for efficient radix tree construction without tracking every block.
+    #[default]
+    PowerOfTwo,
+    /// Emit events for all blocks.
+    ///
+    /// Useful for testing or when complete block tracking is needed.
+    All,
+}
+// Unit tests for event publishing configuration: defaults, JSON round-trip,
+// serde defaults for missing keys, and validation of an in-range config.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_default_config() {
+        let config = EventsConfig::default();
+        assert!(!config.enabled);
+        assert_eq!(config.batching.window_duration_ms, 10);
+        assert_eq!(config.batching.max_batch_size, 1024);
+        assert_eq!(config.channel_capacity, 1024);
+        assert_eq!(config.subject, "kvbm.events");
+        assert_eq!(config.policy, EventPolicyConfig::PowerOfTwo);
+    }
+    #[test]
+    fn test_serde_roundtrip() {
+        let json = r#"{
+            "enabled": true,
+            "batching": {
+                "window_duration_ms": 50,
+                "max_batch_size": 512
+            },
+            "channel_capacity": 2048,
+            "subject": "my.events",
+            "policy": "all"
+        }"#;
+        let config: EventsConfig = serde_json::from_str(json).unwrap();
+        assert!(config.enabled);
+        assert_eq!(config.batching.window_duration_ms, 50);
+        assert_eq!(config.batching.max_batch_size, 512);
+        assert_eq!(config.channel_capacity, 2048);
+        assert_eq!(config.subject, "my.events");
+        assert_eq!(config.policy, EventPolicyConfig::All);
+        // Roundtrip
+        // Only `enabled` and `policy` are compared after the round-trip.
+        let serialized = serde_json::to_string(&config).unwrap();
+        let deserialized: EventsConfig = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(deserialized.enabled, config.enabled);
+        assert_eq!(deserialized.policy, config.policy);
+    }
+    // Every field has a serde default, so an empty object must deserialize.
+    #[test]
+    fn test_empty_json_uses_defaults() {
+        let json = r#"{}"#;
+        let config: EventsConfig = serde_json::from_str(json).unwrap();
+        assert!(!config.enabled);
+        assert_eq!(config.batching.window_duration_ms, 10);
+    }
+    #[test]
+    fn test_partial_config() {
+        // Only override enabled, everything else uses defaults
+        let json = r#"{"enabled": true}"#;
+        let config: EventsConfig = serde_json::from_str(json).unwrap();
+        assert!(config.enabled);
+        assert_eq!(config.batching.window_duration_ms, 10);
+        assert_eq!(config.channel_capacity, 1024);
+    }
+    // Happy path only: an in-range config validates. Out-of-range values
+    // are not exercised here.
+    #[test]
+    fn test_validation() {
+        let config = EventsConfig {
+            enabled: true,
+            batching: BatchingConfig {
+                window_duration_ms: 10,
+                max_batch_size: 1024,
+            },
+            channel_capacity: 1024,
+            subject: "test".to_string(),
+            policy: EventPolicyConfig::PowerOfTwo,
+        };
+        assert!(config.validate().is_ok());
+    }
+}
--- a/lib/kvbm-config/src/lib.rs
+++ b/lib/kvbm-config/src/lib.rs
--- a/lib/kvbm-config/src/messenger.rs
+++ b/lib/kvbm-config/src/messenger.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Messenger transport and discovery configuration.
+use std::net::{IpAddr, Ipv4Addr, SocketAddr};
+use anyhow::{Context, Result, bail};
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+use crate::discovery::DiscoveryConfig;
+/// Messenger configuration combining backend and discovery settings.
+///
+/// Both sections are optional in serialized form: a missing `backend` falls
+/// back to `MessengerBackendConfig::default()`, and a missing `discovery`
+/// leaves discovery disabled.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
+pub struct MessengerConfig {
+    /// Transport settings (bind address, interface, port).
+    #[serde(default)]
+    #[validate(nested)]
+    pub backend: MessengerBackendConfig,
+    /// Discovery configuration. None = discovery disabled.
+    #[serde(default)]
+    pub discovery: Option<DiscoveryConfig>,
+}
+impl MessengerConfig {
+    /// Build a Messenger instance from this configuration.
+    ///
+    /// Steps, in order:
+    /// 1. Reject discovery backends that are not supported yet (Etcd, P2P)
+    ///    before any socket is opened.
+    /// 2. Bind a TCP listener on the resolved address and build a TCP
+    ///    transport from it.
+    /// 3. Attach filesystem discovery, if configured.
+    /// 4. Build the Messenger from the transport and optional discovery.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the configured discovery type is unsupported, the
+    /// bind address cannot be resolved or bound, or building the transport,
+    /// the filesystem discovery or the Messenger fails.
+    pub async fn build_messenger(&self) -> Result<std::sync::Arc<velo::Messenger>> {
+        use std::net::TcpListener;
+        use std::sync::Arc;
+        use velo::Messenger;
+        use velo::backend::tcp::TcpTransportBuilder;
+        // 1. Fail fast on unsupported discovery types so that a configuration
+        //    error is reported without first binding a port.
+        let filesystem_discovery = match &self.discovery {
+            None => None,
+            Some(DiscoveryConfig::Etcd(_cfg)) => {
+                bail!("Etcd discovery not yet supported in velo");
+            }
+            Some(DiscoveryConfig::P2p(_cfg)) => {
+                bail!("P2P discovery not yet supported in velo");
+            }
+            Some(DiscoveryConfig::Filesystem(cfg)) => Some(cfg),
+        };
+        // 2. Build TCP transport
+        // Pre-bind listener to get OS-assigned port (if port is 0)
+        let bind_addr = self.backend.resolve_bind_addr()?;
+        let listener = TcpListener::bind(bind_addr)
+            .with_context(|| format!("Failed to bind TCP listener to {}", bind_addr))?;
+        // Extract actual bound address (with real port if OS-assigned)
+        let actual_addr = listener
+            .local_addr()
+            .context("Failed to get local address from listener")?;
+        // Build transport using from_listener to use the actual port
+        let tcp_transport = TcpTransportBuilder::new()
+            .from_listener(listener)?
+            .build()
+            .context("Failed to build TCP transport")?;
+        let tcp_transport = Arc::new(tcp_transport);
+        // Log only once the transport really exists.
+        tracing::info!("Built TCP transport bound to {}", actual_addr);
+        // 3. Attach the discovery backend, if one was configured
+        let mut builder = Messenger::builder().add_transport(tcp_transport);
+        if let Some(cfg) = filesystem_discovery {
+            use velo::discovery::FilesystemPeerDiscovery;
+            let peer_discovery = FilesystemPeerDiscovery::new(&cfg.path)
+                .context("Failed to build filesystem discovery")?;
+            builder = builder.discovery(Arc::new(peer_discovery));
+            tracing::info!("Built filesystem discovery from: {:?}", cfg.path);
+        }
+        // 4. Build Messenger
+        let messenger = builder.build().await.context("Failed to build Messenger")?;
+        Ok(messenger)
+    }
+}
+/// Messenger backend (transport) configuration: where the TCP listener binds.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
+pub struct MessengerBackendConfig {
+    /// IP address to bind, e.g. "0.0.0.0" or "192.168.1.100" (mutually exclusive with `tcp_interface`).
+    /// When neither this nor `tcp_interface` is set, `resolve_bind_addr` uses the IPv4 unspecified address.
+    pub tcp_addr: Option<String>,
+    /// Network interface whose address is bound, e.g. "eth0", "ens192"
+    /// (mutually exclusive with `tcp_addr`).
+    pub tcp_interface: Option<String>,
+    /// TCP port to bind. 0 (also the value used when the key is omitted) means OS-assigned (ephemeral port).
+    #[serde(default)]
+    pub tcp_port: u16,
+}
+impl MessengerBackendConfig {
+    /// Compute the socket address the TCP listener should bind to.
+    ///
+    /// The IP comes from `tcp_addr` (parsed as an IP literal), from the
+    /// address of `tcp_interface`, or is the IPv4 unspecified address when
+    /// neither is set. The port is always `tcp_port`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when both `tcp_addr` and `tcp_interface` are set, when
+    /// `tcp_addr` is not a valid IP address, or when no address can be
+    /// obtained for `tcp_interface`.
+    pub fn resolve_bind_addr(&self) -> Result<SocketAddr> {
+        // Guard: the two address sources may not be combined.
+        if self.tcp_addr.is_some() && self.tcp_interface.is_some() {
+            bail!("tcp_addr and tcp_interface are mutually exclusive")
+        }
+        let host = if let Some(literal) = &self.tcp_addr {
+            literal
+                .parse::<IpAddr>()
+                .with_context(|| format!("Invalid IP address: {}", literal))?
+        } else if let Some(nic) = &self.tcp_interface {
+            get_interface_ip(nic)
+                .with_context(|| format!("Failed to get IP for interface: {}", nic))?
+        } else {
+            IpAddr::V4(Ipv4Addr::UNSPECIFIED)
+        };
+        Ok(SocketAddr::new(host, self.tcp_port))
+    }
+}
+/// Get the IP address for a network interface.
+///
+/// Scans every address entry reported for `interface_name`. An IPv4 address
+/// is returned as soon as one is seen; the first IPv6 address is only
+/// remembered and used when the interface has no IPv4 address at all, so the
+/// result does not depend on the order in which the entries are listed.
+///
+/// # Errors
+///
+/// Fails when the interface addresses cannot be enumerated or when the
+/// interface has neither an IPv4 nor an IPv6 address.
+fn get_interface_ip(interface_name: &str) -> Result<IpAddr> {
+    use nix::ifaddrs::getifaddrs;
+    let addrs = getifaddrs().context("Failed to get interface addresses")?;
+    // First IPv6 address seen for the interface; used only as a fallback.
+    let mut ipv6_fallback: Option<IpAddr> = None;
+    for ifaddr in addrs {
+        if ifaddr.interface_name != interface_name {
+            continue;
+        }
+        // Entries without an address carry nothing we can bind to.
+        let Some(addr) = ifaddr.address else {
+            continue;
+        };
+        // Prefer IPv4 addresses: return immediately.
+        if let Some(sockaddr) = addr.as_sockaddr_in() {
+            return Ok(IpAddr::V4(sockaddr.ip()));
+        }
+        // Remember the first IPv6 address, but keep looking for IPv4.
+        if ipv6_fallback.is_none() {
+            if let Some(sockaddr) = addr.as_sockaddr_in6() {
+                ipv6_fallback = Some(IpAddr::V6(sockaddr.ip()));
+            }
+        }
+    }
+    match ipv6_fallback {
+        Some(ip) => Ok(ip),
+        None => bail!("No IP address found for interface: {}", interface_name),
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// The default backend config has no address, no interface and port 0.
+    #[test]
+    fn test_default_backend_config() {
+        let MessengerBackendConfig {
+            tcp_addr,
+            tcp_interface,
+            tcp_port,
+        } = MessengerBackendConfig::default();
+        assert!(tcp_addr.is_none());
+        assert!(tcp_interface.is_none());
+        assert_eq!(tcp_port, 0);
+    }
+    /// With nothing configured the bind address is the IPv4 unspecified address, port 0.
+    #[test]
+    fn test_resolve_bind_addr_default() {
+        let resolved = MessengerBackendConfig::default()
+            .resolve_bind_addr()
+            .unwrap();
+        let expected = SocketAddr::new(IpAddr::V4(Ipv4Addr::UNSPECIFIED), 0);
+        assert_eq!(resolved.ip(), expected.ip());
+        assert_eq!(resolved.port(), expected.port());
+    }
+    /// An explicit address and port are passed through unchanged.
+    #[test]
+    fn test_resolve_bind_addr_explicit() {
+        let explicit = MessengerBackendConfig {
+            tcp_port: 8080,
+            tcp_interface: None,
+            tcp_addr: Some("192.168.1.100".to_string()),
+        };
+        let resolved = explicit.resolve_bind_addr().unwrap();
+        let expected_ip = IpAddr::V4(Ipv4Addr::new(192, 168, 1, 100));
+        assert_eq!(resolved.ip(), expected_ip);
+        assert_eq!(resolved.port(), 8080);
+    }
+    /// Setting both the address and the interface is rejected.
+    #[test]
+    fn test_resolve_bind_addr_mutual_exclusivity() {
+        let conflicting = MessengerBackendConfig {
+            tcp_port: 0,
+            tcp_interface: Some("eth0".to_string()),
+            tcp_addr: Some("0.0.0.0".to_string()),
+        };
+        let outcome = conflicting.resolve_bind_addr();
+        assert!(outcome.is_err());
+        let message = outcome.unwrap_err().to_string();
+        assert!(message.contains("mutually exclusive"));
+    }
+}
--- a/lib/kvbm-config/src/nixl.rs
+++ b/lib/kvbm-config/src/nixl.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! NixL backend configuration.
+//!
+//! Configures which NixL backends (UCX, GDS, etc.) are enabled for RDMA transfers,
+//! along with optional parameters for each backend.
+use dynamo_memory::nixl::NixlBackendConfig;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use validator::Validate;
+/// NixL backend configuration.
+///
+/// Controls which NixL backends are enabled for RDMA memory transfers
+/// and their optional parameters.
+///
+/// # Backends
+///
+/// Common backends include:
+/// - `UCX` - Unified Communication X (enabled by default, together with `POSIX`)
+/// - `GDS` - GPUDirect Storage
+/// - `GDS_MT` - GPUDirect Storage (multi-threaded)
+///
+/// The `with_backend*` builders and the lookup helpers uppercase backend names.
+/// Keys read from serialized config are stored as written, so spell them in uppercase there.
+/// # Configuration
+///
+/// Each backend can have optional parameters as key-value pairs.
+/// If a backend has no parameters, use an empty map.
+///
+/// ## TOML Example
+///
+/// ```toml
+/// [nixl.backends.UCX]
+/// # UCX with default params (empty map)
+///
+/// [nixl.backends.GDS]
+/// threads = "4"
+/// buffer_size = "1048576"
+/// ```
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct NixlConfig {
+    /// Map of backend name (uppercase) -> optional parameters.
+    /// When this field is omitted during deserialization, `UCX` and `POSIX` are enabled with empty parameters.
+    /// If a backend is present in the map, it's enabled.
+    /// The inner HashMap contains optional override parameters.
+    /// An empty inner map means use default parameters.
+    #[serde(default = "default_backends")]
+    pub backends: HashMap<String, HashMap<String, String>>,
+}
+/// Backends enabled when none are configured: `UCX` and `POSIX`, each with
+/// an empty parameter map.
+fn default_backends() -> HashMap<String, HashMap<String, String>> {
+    ["UCX", "POSIX"]
+        .into_iter()
+        .map(|name| (name.to_string(), HashMap::new()))
+        .collect()
+}
+impl Default for NixlConfig {
+    /// Starts from the same backend set serde uses when `backends` is omitted.
+    fn default() -> Self {
+        let backends = default_backends();
+        Self { backends }
+    }
+}
+impl NixlConfig {
+    /// Create a configuration from an explicit backend map.
+    ///
+    /// Backend names are normalized to uppercase so that they can be found by
+    /// `has_backend` and `backend_params`, which uppercase the name they are
+    /// asked for. If two keys differ only in case they collapse into one
+    /// entry; which parameter map survives is unspecified.
+    pub fn new(backends: HashMap<String, HashMap<String, String>>) -> Self {
+        let backends = backends
+            .into_iter()
+            .map(|(name, params)| (name.to_uppercase(), params))
+            .collect();
+        Self { backends }
+    }
+    /// Create a configuration with no backends enabled.
+    pub fn empty() -> Self {
+        Self {
+            backends: HashMap::new(),
+        }
+    }
+    /// Build a configuration from a `NixlBackendConfig`, copying each
+    /// backend's parameters. Backend names are normalized to uppercase.
+    pub fn from_nixl_backend_config(config: NixlBackendConfig) -> Self {
+        let backends: HashMap<String, HashMap<String, String>> = config
+            .iter()
+            .map(|(backend, params)| (backend.to_string().to_uppercase(), params.clone()))
+            .collect();
+        Self { backends }
+    }
+    /// Add a backend with default parameters.
+    /// Backend name is normalized to uppercase.
+    pub fn with_backend(mut self, name: impl Into<String>) -> Self {
+        self.backends
+            .insert(name.into().to_uppercase(), HashMap::new());
+        self
+    }
+    /// Add a backend with custom parameters.
+    /// Backend name is normalized to uppercase. An existing entry with the
+    /// same name is replaced.
+    pub fn with_backend_params(
+        mut self,
+        name: impl Into<String>,
+        params: HashMap<String, String>,
+    ) -> Self {
+        self.backends.insert(name.into().to_uppercase(), params);
+        self
+    }
+    /// Get the list of enabled backend names (uppercase).
+    /// The order follows the map's iteration order and is not stable.
+    pub fn enabled_backends(&self) -> Vec<&String> {
+        self.backends.keys().collect()
+    }
+    /// Check if a specific backend is enabled.
+    /// Backend name is normalized to uppercase for lookup.
+    pub fn has_backend(&self, backend: &str) -> bool {
+        self.backends.contains_key(&backend.to_uppercase())
+    }
+    /// Get parameters for a specific backend.
+    /// Backend name is normalized to uppercase for lookup.
+    ///
+    /// Returns None if the backend is not enabled.
+    pub fn backend_params(&self, backend: &str) -> Option<&HashMap<String, String>> {
+        self.backends.get(&backend.to_uppercase())
+    }
+    /// Iterate over all enabled backends and their parameters.
+    pub fn iter(&self) -> impl Iterator<Item = (&String, &HashMap<String, String>)> {
+        self.backends.iter()
+    }
+}
+impl From<NixlConfig> for NixlBackendConfig {
+    /// Hands the backend map over to `NixlBackendConfig::new` unchanged.
+    fn from(config: NixlConfig) -> Self {
+        let NixlConfig { backends } = config;
+        NixlBackendConfig::new(backends)
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// The default configuration enables UCX but not GDS.
+    #[test]
+    fn test_default_config() {
+        let defaults = NixlConfig::default();
+        let ucx_enabled = defaults.has_backend("UCX");
+        let gds_enabled = defaults.has_backend("GDS");
+        assert!(ucx_enabled);
+        assert!(!gds_enabled);
+    }
+    /// The default configuration enables both UCX and POSIX.
+    #[test]
+    fn test_new_default() {
+        let defaults = NixlConfig::default();
+        for name in ["UCX", "POSIX"] {
+            assert!(defaults.has_backend(name));
+        }
+        assert!(!defaults.enabled_backends().is_empty());
+    }
+    /// `with_backend` enables exactly the requested backends and stores uppercase keys.
+    #[test]
+    fn test_with_backend() {
+        let built = NixlConfig::empty().with_backend("ucx").with_backend("gds");
+        for name in ["UCX", "GDS"] {
+            assert!(built.has_backend(name));
+        }
+        assert!(!built.has_backend("POSIX"));
+        // Keys are stored uppercase
+        for key in ["UCX", "GDS"] {
+            assert!(built.backends.contains_key(key));
+        }
+    }
+    /// Parameters given to `with_backend_params` are retrievable per backend.
+    #[test]
+    fn test_with_backend_params() {
+        let gds_overrides: HashMap<String, String> =
+            [("threads", "4"), ("buffer_size", "1048576")]
+                .into_iter()
+                .map(|(key, value)| (key.to_string(), value.to_string()))
+                .collect();
+        let built = NixlConfig::empty()
+            .with_backend("UCX")
+            .with_backend_params("GDS", gds_overrides);
+        // A backend added without parameters carries an empty map.
+        assert!(built.backend_params("UCX").unwrap().is_empty());
+        // The GDS entry holds the custom parameters.
+        let stored = built.backend_params("GDS").unwrap();
+        assert_eq!(stored.get("threads"), Some(&"4".to_string()));
+        assert_eq!(stored.get("buffer_size"), Some(&"1048576".to_string()));
+    }
+    /// Lookups succeed regardless of the case used in the query.
+    #[test]
+    fn test_lookup_normalizes_to_uppercase() {
+        let built = NixlConfig::empty().with_backend("ucx");
+        // All lookups normalize to uppercase
+        for spelling in ["ucx", "UCX", "Ucx"] {
+            assert!(built.has_backend(spelling));
+        }
+        for spelling in ["ucx", "UCX"] {
+            assert!(built.backend_params(spelling).is_some());
+        }
+    }
+    /// `enabled_backends` lists every enabled backend by its uppercase name.
+    #[test]
+    fn test_enabled_backends() {
+        let built = NixlConfig::empty().with_backend("ucx").with_backend("gds");
+        let names = built.enabled_backends();
+        assert_eq!(names.len(), 2);
+        for expected in ["UCX", "GDS"] {
+            assert!(names.iter().any(|name| name.as_str() == expected));
+        }
+    }
+    /// `iter` yields one item per enabled backend.
+    #[test]
+    fn test_iter() {
+        let overrides = HashMap::from([("key".to_string(), "value".to_string())]);
+        let built = NixlConfig::empty()
+            .with_backend("UCX")
+            .with_backend_params("GDS", overrides);
+        assert_eq!(built.iter().count(), 2);
+    }
+    /// Backends and their parameters survive a JSON roundtrip.
+    #[test]
+    fn test_serde_roundtrip() {
+        let overrides = HashMap::from([("threads".to_string(), "4".to_string())]);
+        let original = NixlConfig::empty()
+            .with_backend("UCX")
+            .with_backend_params("GDS", overrides);
+        let encoded = serde_json::to_string(&original).unwrap();
+        let decoded = serde_json::from_str::<NixlConfig>(&encoded).unwrap();
+        assert!(decoded.has_backend("UCX"));
+        assert!(decoded.has_backend("GDS"));
+        let threads = decoded.backend_params("GDS").unwrap().get("threads");
+        assert_eq!(threads, Some(&"4".to_string()));
+    }
+}
--- a/lib/kvbm-config/src/object.rs
+++ b/lib/kvbm-config/src/object.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Object storage configuration for KVBM.
+//!
+//! Defines configuration for object storage backends (S3, NIXL) used for
+//! the G4 tier (object storage) in the cache hierarchy.
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+/// Top-level object storage configuration.
+///
+/// When present, enables object storage operations on workers. No `#[validate]` rule is attached to `client` in this struct.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct ObjectConfig {
+    /// Selects the object client implementation (direct S3 or NIXL) and carries its settings.
+    pub client: ObjectClientConfig,
+}
+/// Object client implementation selector.
+///
+/// Chooses between the direct S3 client and the NIXL agent. Internally tagged: the `type` key holds the lowercase variant name (`"s3"` or `"nixl"`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "lowercase")]
+pub enum ObjectClientConfig {
+    /// Direct S3/MinIO client using AWS SDK.
+    S3(S3ObjectConfig),
+    /// NIXL agent with object storage backend; the backend itself is chosen by `NixlObjectConfig`.
+    Nixl(NixlObjectConfig),
+}
+/// S3-compatible object storage configuration.
+///
+/// Used for both direct S3 access and as a backend for NIXL.
+/// Compatible with AWS S3 and S3-compatible services like MinIO.
+///
+/// Validation rejects an empty bucket name and a concurrency limit of zero.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct S3ObjectConfig {
+    /// Custom endpoint URL for S3-compatible services (e.g., MinIO).
+    /// If None, uses the default AWS S3 endpoint.
+    #[serde(default)]
+    pub endpoint_url: Option<String>,
+    /// S3 bucket name for storing blocks. Required in serialized form and
+    /// must not be empty.
+    #[validate(length(min = 1))]
+    pub bucket: String,
+    /// AWS region. Defaults to "us-east-1" when omitted.
+    #[serde(default = "default_region")]
+    pub region: String,
+    /// Use path-style URLs instead of virtual-hosted-style.
+    /// Required for MinIO and some S3-compatible services.
+    #[serde(default)]
+    pub force_path_style: bool,
+    /// Maximum number of concurrent S3 requests. Defaults to 16 when omitted;
+    /// must be at least 1, since a limit of zero would admit no requests.
+    #[serde(default = "default_max_concurrent")]
+    #[validate(range(min = 1))]
+    pub max_concurrent_requests: usize,
+}
+/// Region used when the configuration does not name one.
+fn default_region() -> String {
+    String::from("us-east-1")
+}
+/// Concurrency limit used when the configuration does not set one.
+fn default_max_concurrent() -> usize {
+    const DEFAULT_MAX_CONCURRENT_REQUESTS: usize = 16;
+    DEFAULT_MAX_CONCURRENT_REQUESTS
+}
+impl Default for S3ObjectConfig {
+    /// AWS endpoint, bucket "kvbm-blocks", default region, virtual-hosted-style
+    /// URLs and the default concurrency limit.
+    fn default() -> Self {
+        let bucket = String::from("kvbm-blocks");
+        let region = default_region();
+        let max_concurrent_requests = default_max_concurrent();
+        Self {
+            endpoint_url: None,
+            bucket,
+            region,
+            force_path_style: false,
+            max_concurrent_requests,
+        }
+    }
+}
+impl S3ObjectConfig {
+    /// Configuration for AWS S3 proper: no custom endpoint, virtual-hosted-style
+    /// URLs, the given bucket and region, default concurrency limit.
+    pub fn aws(bucket: String, region: String) -> Self {
+        let max_concurrent_requests = default_max_concurrent();
+        Self {
+            bucket,
+            region,
+            endpoint_url: None,
+            force_path_style: false,
+            max_concurrent_requests,
+        }
+    }
+    /// Configuration for MinIO or another S3-compatible service: custom
+    /// endpoint, path-style URLs, default region and concurrency limit.
+    pub fn minio(endpoint_url: String, bucket: String) -> Self {
+        let endpoint_url = Some(endpoint_url);
+        let region = default_region();
+        let max_concurrent_requests = default_max_concurrent();
+        Self {
+            endpoint_url,
+            bucket,
+            region,
+            force_path_style: true,
+            max_concurrent_requests,
+        }
+    }
+}
+/// NIXL object storage backend configuration.
+///
+/// NIXL can use various object storage backends. Each variant specifies the backend type and its configuration.
+/// Internally tagged: the `backend` key holds the lowercase variant name (currently only `"s3"`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "backend", rename_all = "lowercase")]
+pub enum NixlObjectConfig {
+    /// S3-compatible backend via NIXL.
+    S3(S3ObjectConfig),
+    // Future backends can be added here:
+    // Gcs(GcsObjectConfig),
+    // Azure(AzureObjectConfig),
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// `Default` targets AWS with the stock bucket, region and concurrency limit.
+    #[test]
+    fn test_s3_config_default() {
+        let S3ObjectConfig {
+            endpoint_url,
+            bucket,
+            region,
+            force_path_style,
+            max_concurrent_requests,
+        } = S3ObjectConfig::default();
+        assert!(endpoint_url.is_none());
+        assert_eq!(bucket, "kvbm-blocks");
+        assert_eq!(region, "us-east-1");
+        assert!(!force_path_style);
+        assert_eq!(max_concurrent_requests, 16);
+    }
+    /// `aws` keeps the given bucket/region and leaves endpoint and path style at AWS values.
+    #[test]
+    fn test_s3_config_aws() {
+        let aws = S3ObjectConfig::aws("my-bucket".into(), "us-west-2".into());
+        assert!(aws.endpoint_url.is_none());
+        assert!(!aws.force_path_style);
+        assert_eq!(aws.bucket, "my-bucket");
+        assert_eq!(aws.region, "us-west-2");
+    }
+    /// `minio` records the endpoint and switches to path-style URLs.
+    #[test]
+    fn test_s3_config_minio() {
+        let minio = S3ObjectConfig::minio("http://localhost:9000".into(), "test".into());
+        assert!(minio.force_path_style);
+        assert_eq!(minio.bucket, "test");
+        assert_eq!(minio.endpoint_url, Some("http://localhost:9000".into()));
+    }
+    /// `"type": "s3"` selects the direct S3 client.
+    #[test]
+    fn test_object_config_serde_s3() {
+        let json = r#"{
+            "client": {
+                "type": "s3",
+                "bucket": "my-bucket",
+                "region": "us-west-2"
+            }
+        }"#;
+        let parsed: ObjectConfig = serde_json::from_str(json).unwrap();
+        let ObjectClientConfig::S3(s3) = parsed.client else {
+            panic!("Expected S3 config")
+        };
+        assert_eq!(s3.bucket, "my-bucket");
+        assert_eq!(s3.region, "us-west-2");
+    }
+    /// `"type": "nixl"` with `"backend": "s3"` selects the NIXL S3 backend.
+    #[test]
+    fn test_object_config_serde_nixl_s3() {
+        let json = r#"{
+            "client": {
+                "type": "nixl",
+                "backend": "s3",
+                "bucket": "nixl-bucket",
+                "endpoint_url": "http://minio:9000",
+                "force_path_style": true
+            }
+        }"#;
+        let parsed: ObjectConfig = serde_json::from_str(json).unwrap();
+        let ObjectClientConfig::Nixl(NixlObjectConfig::S3(s3)) = parsed.client else {
+            panic!("Expected Nixl S3 config")
+        };
+        assert_eq!(s3.bucket, "nixl-bucket");
+        assert_eq!(s3.endpoint_url, Some("http://minio:9000".into()));
+        assert!(s3.force_path_style);
+    }
+}
--- a/lib/kvbm-config/src/offload.rs
+++ b/lib/kvbm-config/src/offload.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Offload policy configuration for KVBM.
+//!
+//! Defines configuration for offload policies that control which blocks
+//! are transferred between storage tiers (G1→G2, G2→G3).
+//!
+//! # Policy Types
+//!
+//! - `pass_all`: No filtering, all blocks pass
+//! - `presence`: Skip blocks already present in destination tier
+//! - `presence_lfu`: Presence check + LFU count threshold
+//!
+//! # Configuration
+//!
+//! Policies are configured per tier transition. Multiple policies in the
+//! `policies` list are applied in order with implicit AND logic (all must pass).
+//!
+//! ## JSON Example
+//!
+//! ```json
+//! {
+//!   "offload": {
+//!     "g1_to_g2": {
+//!       "policies": ["presence"],
+//!       "presence": {}
+//!     },
+//!     "g2_to_g3": {
+//!       "policies": ["presence_lfu"],
+//!       "presence_lfu": { "min_lfu_count": 8 }
+//!     }
+//!   }
+//! }
+//! ```
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+/// Policy type enum for serialization.
+///
+/// Each variant corresponds to a policy implementation in the kvbm crate. Serialized in snake_case: `pass_all`, `presence`, `presence_lfu`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum PolicyType {
+    /// PassAllPolicy - no filtering, all blocks pass (`"pass_all"`)
+    PassAll,
+    /// PresenceFilter - skip blocks already in destination tier (`"presence"`)
+    Presence,
+    /// PresenceAndLFUFilter - presence check + LFU threshold (`"presence_lfu"`)
+    PresenceLfu,
+}
+/// Configuration for presence filter.
+///
+/// It has no fields today; the struct exists for future extensibility
+/// and to keep the configuration shape consistent with the other policies.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
+pub struct PresenceFilterConfig {}
+/// Fallback for `min_lfu_count` when the configuration omits it.
+fn default_min_lfu_count() -> u32 {
+    const DEFAULT_MIN_LFU_COUNT: u32 = 8;
+    DEFAULT_MIN_LFU_COUNT
+}
+/// Configuration for presence + LFU filter.
+///
+/// Combines presence checking with LFU (Least Frequently Used) count threshold.
+/// Only blocks with access count above the threshold are offloaded.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct PresenceLfuFilterConfig {
+    /// Minimum LFU count threshold for offload. Validation requires a value of at least 1.
+    ///
+    /// Blocks must have been accessed more than this many times to be
+    /// considered for offload. This prevents offloading rarely-used blocks.
+    /// NOTE(review): "minimum" vs "more than" - confirm against the filter whether the comparison is inclusive.
+    /// Default: 8
+    #[serde(default = "default_min_lfu_count")]
+    #[validate(range(min = 1))]
+    pub min_lfu_count: u32,
+}
+impl Default for PresenceLfuFilterConfig {
+    /// Uses the same threshold serde applies when `min_lfu_count` is omitted.
+    fn default() -> Self {
+        let min_lfu_count = default_min_lfu_count();
+        Self { min_lfu_count }
+    }
+}
+/// Configuration for a tier transition (e.g., G1→G2, G2→G3).
+///
+/// Defines which policies to apply when offloading blocks between tiers.
+/// Policies are evaluated in order with implicit AND logic - a block must
+/// pass ALL policies to be transferred. Every field may be omitted in serialized form.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
+pub struct TierOffloadConfig {
+    /// Ordered list of policies to apply (implicit AND). Defaults to an empty list.
+    ///
+    /// If empty, tier-specific defaults are applied by the engine.
+    /// Policies are evaluated in order; a block must pass all to be transferred.
+    #[serde(default)]
+    pub policies: Vec<PolicyType>,
+    /// Presence filter configuration.
+    ///
+    /// Used when "presence" is in the policies list.
+    #[serde(default)]
+    #[validate(nested)]
+    pub presence: PresenceFilterConfig,
+    /// Presence + LFU filter configuration.
+    ///
+    /// Used when "presence_lfu" is in the policies list; its rules are validated whether or not that policy is listed.
+    #[serde(default)]
+    #[validate(nested)]
+    pub presence_lfu: PresenceLfuFilterConfig,
+}
+/// Top-level offload configuration.
+///
+/// Groups policy configurations for each tier transition. A section that is omitted falls back to `TierOffloadConfig::default()`.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
+pub struct OffloadConfig {
+    /// G1 (GPU) → G2 (Host) offload policies; validated recursively.
+    #[serde(default)]
+    #[validate(nested)]
+    pub g1_to_g2: TierOffloadConfig,
+    /// G2 (Host) → G3 (Disk) offload policies; validated recursively.
+    #[serde(default)]
+    #[validate(nested)]
+    pub g2_to_g3: TierOffloadConfig,
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Defaults: no explicit policies on either transition, LFU threshold 8.
+    #[test]
+    fn test_default_config() {
+        let config = OffloadConfig::default();
+        // Empty policies - engine applies tier-specific defaults
+        assert!(config.g1_to_g2.policies.is_empty());
+        assert!(config.g2_to_g3.policies.is_empty());
+        assert_eq!(config.g2_to_g3.presence_lfu.min_lfu_count, 8);
+    }
+    /// Policy names use snake_case and survive a roundtrip.
+    #[test]
+    fn test_policy_type_serde() {
+        let json = r#"["pass_all", "presence", "presence_lfu"]"#;
+        let policies: Vec<PolicyType> = serde_json::from_str(json).unwrap();
+        assert_eq!(policies.len(), 3);
+        assert_eq!(policies[0], PolicyType::PassAll);
+        assert_eq!(policies[1], PolicyType::Presence);
+        assert_eq!(policies[2], PolicyType::PresenceLfu);
+        // Roundtrip (serde_json doesn't add spaces after commas)
+        let serialized = serde_json::to_string(&policies).unwrap();
+        let roundtrip: Vec<PolicyType> = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(policies, roundtrip);
+    }
+    /// A single tier section parses its policy list and LFU override.
+    #[test]
+    fn test_tier_config_serde() {
+        let json = r#"{
+            "policies": ["presence_lfu"],
+            "presence_lfu": { "min_lfu_count": 16 }
+        }"#;
+        let config: TierOffloadConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(config.policies.len(), 1);
+        assert_eq!(config.policies[0], PolicyType::PresenceLfu);
+        assert_eq!(config.presence_lfu.min_lfu_count, 16);
+    }
+    /// Both tier sections parse independently.
+    #[test]
+    fn test_offload_config_serde() {
+        let json = r#"{
+            "g1_to_g2": {
+                "policies": ["presence"]
+            },
+            "g2_to_g3": {
+                "policies": ["presence_lfu"],
+                "presence_lfu": { "min_lfu_count": 4 }
+            }
+        }"#;
+        let config: OffloadConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(config.g1_to_g2.policies, vec![PolicyType::Presence]);
+        assert_eq!(config.g2_to_g3.policies, vec![PolicyType::PresenceLfu]);
+        assert_eq!(config.g2_to_g3.presence_lfu.min_lfu_count, 4);
+    }
+    /// Omitting the LFU section falls back to the default threshold.
+    #[test]
+    fn test_default_lfu_threshold() {
+        let json = r#"{"policies": ["presence_lfu"]}"#;
+        let config: TierOffloadConfig = serde_json::from_str(json).unwrap();
+        // Should use default of 8
+        assert_eq!(config.presence_lfu.min_lfu_count, 8);
+    }
+    /// Validation accepts the defaults and the smallest allowed threshold,
+    /// and rejects a threshold of zero through the nested validators.
+    #[test]
+    fn test_validation() {
+        let config = OffloadConfig::default();
+        assert!(config.validate().is_ok());
+        let config_with_lfu = OffloadConfig {
+            g2_to_g3: TierOffloadConfig {
+                policies: vec![PolicyType::PresenceLfu],
+                presence_lfu: PresenceLfuFilterConfig { min_lfu_count: 1 },
+                ..Default::default()
+            },
+            ..Default::default()
+        };
+        assert!(config_with_lfu.validate().is_ok());
+        // Rejection path: `min_lfu_count` carries `range(min = 1)`, and the
+        // enclosing structs validate it as a nested field.
+        let config_with_zero_lfu = OffloadConfig {
+            g2_to_g3: TierOffloadConfig {
+                policies: vec![PolicyType::PresenceLfu],
+                presence_lfu: PresenceLfuFilterConfig { min_lfu_count: 0 },
+                ..Default::default()
+            },
+            ..Default::default()
+        };
+        assert!(config_with_zero_lfu.validate().is_err());
+    }
+}
--- a/lib/kvbm-config/src/onboard.rs
+++ b/lib/kvbm-config/src/onboard.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Onboard configuration for KV cache loading strategies.
+//!
+//! This module defines the configuration for how external KV cache blocks
+//! are loaded (onboarded) from G2 (host memory) to G1 (GPU memory).
+use serde::{Deserialize, Serialize};
+/// Configuration for KV cache onboarding strategy.
+///
+/// Onboarding is the process of loading external KV cache blocks from
+/// G2 (host memory) into G1 (GPU memory) for use during inference.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct OnboardConfig {
+    /// The onboarding mode to use; `OnboardMode::Inter` when the key is omitted.
+    ///
+    /// - `inter`: Async out-of-band loading via Nova messages (default)
+    /// - `intra`: Synchronous layer-wise loading during forward pass
+    #[serde(default)]
+    pub mode: OnboardMode,
+}
+/// Onboarding mode for loading external KV cache blocks.
+///
+/// This determines when and how G2→G1 transfers occur during inference. Serialized in snake_case: `"inter"` / `"intra"`.
+#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum OnboardMode {
+    /// Inter-pass onboarding (default).
+    ///
+    /// Blocks are loaded asynchronously between scheduler passes via Nova
+    /// active messages to workers. The `get_num_new_matched_tokens` returns
+    /// `(Some(n), true)` to indicate async loading is in progress.
+    ///
+    /// Pros: Overlaps transfer with computation
+    /// Cons: Adds latency before first token if transfer not complete
+    #[default]
+    Inter,
+    /// Intra-pass onboarding.
+    ///
+    /// Blocks are loaded synchronously during the forward pass, layer by layer.
+    /// The `get_num_new_matched_tokens` returns `(Some(n), false)` and the
+    /// G2/G1 block pairs are passed to workers via `KvConnectorMetadata`.
+    ///
+    /// Pros: Guaranteed data availability before each layer
+    /// Cons: Serializes transfer with computation per layer
+    Intra,
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// The default configuration selects inter-pass onboarding.
+    #[test]
+    fn test_default_mode_is_inter() {
+        let config = OnboardConfig::default();
+        assert_eq!(config.mode, OnboardMode::Inter);
+    }
+    /// Each mode parses from its snake_case name and survives a
+    /// serialize -> deserialize roundtrip.
+    #[test]
+    fn test_mode_serde_roundtrip() {
+        // Test inter mode
+        let json = r#"{"mode": "inter"}"#;
+        let config: OnboardConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(config.mode, OnboardMode::Inter);
+        let serialized = serde_json::to_string(&config).unwrap();
+        let roundtrip: OnboardConfig = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(roundtrip.mode, OnboardMode::Inter);
+        // Test intra mode
+        let json = r#"{"mode": "intra"}"#;
+        let config: OnboardConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(config.mode, OnboardMode::Intra);
+        let serialized = serde_json::to_string(&config).unwrap();
+        let roundtrip: OnboardConfig = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(roundtrip.mode, OnboardMode::Intra);
+    }
+    /// An empty object falls back to the default mode.
+    #[test]
+    fn test_empty_json_uses_default() {
+        let json = r#"{}"#;
+        let config: OnboardConfig = serde_json::from_str(json).unwrap();
+        assert_eq!(config.mode, OnboardMode::Inter);
+    }
+}
--- a/lib/kvbm-config/src/rayon.rs
+++ b/lib/kvbm-config/src/rayon.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Rayon thread pool configuration.
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+/// Rayon thread pool configuration.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
+pub struct RayonConfig {
+    /// Number of threads in the Rayon thread pool; validation rejects 0 when a value is given.
+    /// If None, no thread count is passed to the builder (per the original note: the number of logical CPUs).
+    #[validate(range(min = 1))]
+    pub num_threads: Option<usize>,
+}
+#[cfg(feature = "rayon")]
+impl RayonConfig {
+    /// Construct a Rayon thread pool that honours this configuration.
+    ///
+    /// An explicit `num_threads` is forwarded to the builder; when it is `None`
+    /// the builder is left untouched so Rayon picks its own thread count.
+    ///
+    /// # Errors
+    /// Propagates the `ThreadPoolBuildError` reported by Rayon's builder.
+    pub fn build_pool(&self) -> Result<::rayon::ThreadPool, ::rayon::ThreadPoolBuildError> {
+        let base = ::rayon::ThreadPoolBuilder::new();
+        match self.num_threads {
+            Some(count) => base.num_threads(count).build(),
+            None => base.build(),
+        }
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// The derived default leaves the thread count unspecified.
+    #[test]
+    fn test_default_config() {
+        let RayonConfig { num_threads } = RayonConfig::default();
+        assert_eq!(num_threads, None);
+    }
+    /// A pool built with an explicit thread count reports that same count.
+    #[cfg(feature = "rayon")]
+    #[test]
+    fn test_build_pool() {
+        let wanted = 2;
+        let pool = RayonConfig {
+            num_threads: Some(wanted),
+        }
+        .build_pool()
+        .expect("Failed to build pool");
+        assert_eq!(pool.current_num_threads(), wanted);
+    }
+}
--- a/lib/kvbm-config/src/tokio.rs
+++ b/lib/kvbm-config/src/tokio.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! Tokio runtime configuration.
+use std::sync::atomic::{AtomicUsize, Ordering};
+use serde::{Deserialize, Serialize};
+use validator::Validate;
+/// Atomic counter for assigning unique thread ranks.
+///
+/// Process-wide: it is incremented once per runtime thread start in
+/// `TokioConfig::build_runtime`, so ranks keep counting across every runtime
+/// built from this module rather than restarting at 0 for each one.
+static THREAD_RANK: AtomicUsize = AtomicUsize::new(0);
+/// Tokio runtime configuration.
+///
+/// Note that the `Default` implementation sets `worker_threads` to `Some(1)`
+/// rather than `None`.
+#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
+pub struct TokioConfig {
+    /// Number of async worker threads.
+    /// If None, uses the number of logical CPUs.
+    ///
+    /// Validation requires an explicit value to be at least 1 and at most
+    /// `default_max_cpus()`.
+    #[validate(range(min = 1, max = default_max_cpus()))]
+    pub worker_threads: Option<usize>,
+    /// Maximum number of blocking threads.
+    /// If None, uses Tokio's default (512).
+    ///
+    /// Validation rejects an explicit value below 1.
+    #[validate(range(min = 1))]
+    pub max_blocking_threads: Option<usize>,
+}
+impl TokioConfig {
+    /// Construct a multi-threaded Tokio runtime from these settings.
+    ///
+    /// Thread counts left as `None` are not forwarded to the builder, so
+    /// Tokio's own defaults apply. Every thread the runtime starts takes the
+    /// next value of the process-wide `THREAD_RANK` counter; with the `nvtx`
+    /// feature enabled that rank is used to name the thread `kvbm-tokio:<rank>`.
+    ///
+    /// # Errors
+    /// Returns the `std::io::Error` reported by the Tokio builder.
+    pub fn build_runtime(&self) -> std::io::Result<::tokio::runtime::Runtime> {
+        let mut rt = ::tokio::runtime::Builder::new_multi_thread();
+        rt.enable_all();
+        rt.on_thread_start(|| {
+            let slot = THREAD_RANK.fetch_add(1, Ordering::Relaxed);
+            #[cfg(feature = "nvtx")]
+            nvtx::name_thread!("kvbm-tokio:{}", slot);
+            // Without nvtx the rank is intentionally unused.
+            #[cfg(not(feature = "nvtx"))]
+            let _ = slot;
+        });
+        if let Some(limit) = self.max_blocking_threads {
+            rt.max_blocking_threads(limit);
+        }
+        if let Some(workers) = self.worker_threads {
+            rt.worker_threads(workers);
+        }
+        rt.build()
+    }
+}
+impl Default for TokioConfig {
+    /// One worker thread, with the blocking-thread limit left to Tokio.
+    fn default() -> Self {
+        let worker_threads = Some(1);
+        let max_blocking_threads = None;
+        Self {
+            worker_threads,
+            max_blocking_threads,
+        }
+    }
+}
+/// Upper bound accepted for `TokioConfig::worker_threads` during validation.
+///
+/// Reports the parallelism available to this process, or 4 when the standard
+/// library cannot determine it.
+fn default_max_cpus() -> usize {
+    const FALLBACK_CPUS: usize = 4;
+    match std::thread::available_parallelism() {
+        Ok(count) => count.get(),
+        Err(_) => FALLBACK_CPUS,
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// The hand-written default pins one worker and leaves blocking threads unset.
+    #[test]
+    fn test_default_config() {
+        // Default uses 1 worker thread to minimize resource usage
+        let TokioConfig {
+            worker_threads,
+            max_blocking_threads,
+        } = TokioConfig::default();
+        assert_eq!(worker_threads, Some(1));
+        assert_eq!(max_blocking_threads, None);
+    }
+    /// The default configuration yields a runtime that builds and shuts down.
+    #[test]
+    fn test_build_runtime_with_defaults() {
+        let built = TokioConfig::default().build_runtime();
+        drop(built.expect("Failed to build runtime"));
+    }
+    /// Explicit worker and blocking thread counts are accepted by the builder.
+    #[test]
+    fn test_build_runtime_with_custom_threads() {
+        let built = TokioConfig {
+            worker_threads: Some(2),
+            max_blocking_threads: Some(4),
+        }
+        .build_runtime();
+        drop(built.expect("Failed to build runtime"));
+    }
+}
--- a/lib/kvbm-engine/CLAUDE.md
+++ b/lib/kvbm-engine/CLAUDE.md
+# CLAUDE.md
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+## Build & Test
+This is a Rust crate (`kvbm-engine`) in the dynamo workspace. Rust edition 2024, requires rustc 1.93.1+.
+```bash
+# Build
+cargo build -p kvbm-engine
+cargo build -p kvbm-engine --features s3,testing,nats
+# Test (most tests require the `testing` feature)
+cargo test -p kvbm-engine --features testing
+cargo test -p kvbm-engine --features testing -- test_name  # single test
+# Lint
+cargo clippy -p kvbm-engine --all-features
+cargo fmt
+cargo machete
+```
+## Feature Flags
+| Flag | Purpose |
+|------|---------|
+| `s3` (default) | S3/MinIO object storage (G4 tier) |
+| `testing` | Test utilities, mock infrastructure, fixtures |
+| `nats` | NATS-based pub/sub transport |
+| `collectives` | NIXL + NCCL multi-GPU collectives |
+| `nccl` | NCCL via cudarc |
+| `nvtx` | NVIDIA Tools Extension profiling markers |
+## Architecture
+kvbm-engine implements distributed coordination for KV cache block management across a tiered storage hierarchy:
+- **G1** (GPU HBM) → **G2** (Pinned DRAM) → **G3** (NVMe/SSD) → **G4** (S3/MinIO)
+Leaders own block metadata and make placement decisions. Workers execute data transfers (RDMA, NVMe, object storage). Sessions coordinate multi-instance block transfers between leaders and workers.
+### Key Modules
+- **`leader/`** — `InstanceLeader` coordinates block lookups (`find_matches`), holds blocks via RAII `BlockHolder`, and manages distributed sessions. The `Leader` trait is the core coordination interface.
+- **`leader/session/`** — Distributed session protocol: `InitiatorSession` (requester), `ResponderSession` (provider), `ServerSession` (server-side block exposure with optional G3→G2 staging). Sessions track onboarding state: Searching → Holding → Staging → Ready → Complete.
+- **`worker/`** — `PhysicalWorker` owns a `TransferManager` and layout handles for actual transfers. `CoordinatedWorker` wraps any `Worker` with the leader's coordination state. The `Worker` and `WorkerTransfers` traits define the execution contract.
+- **`worker/group/`** — `SpmdParallelWorkers` broadcasts operations to all workers in parallel (SPMD model) with event aggregation.
+- **`worker/velo/`** — RPC layer (`VeloWorkerService`/`VeloWorkerClient`) for remote worker execution via Velo.
+- **`offload/`** — Multi-stage async pipeline for tier demotion: PolicyEvaluator → PreconditionAwaiter → Batcher → TransferExecutor. Supports per-container cancellation tokens. **See `src/offload/AGENTS.md` for governance rules before modifying this module.**
+- **`object/`** — `ObjectBlockOps` trait for G4 storage. S3 implementation with concurrent uploads/downloads. `ObjectLockManager` for distributed locking via conditional S3 PUTs.
+- **`runtime/`** — `KvbmRuntime` bundles tokio, Velo messenger, NixlAgent (RDMA), and EventManager. Built via `KvbmRuntimeBuilder` or quick constructors (`from_env_leader`, `from_env_worker`).
+- **`pubsub/`** — Publisher/Subscriber traits with NATS and in-memory stub implementations.
+- **`collectives/`** — `CollectiveOps` trait for multi-GPU sync. NCCL implementation and stub for testing. MLA pattern: only rank 0 needs G2/G3; others receive via broadcast.
+- **`testing/`** — Feature-gated test utilities: `TestManagerBuilder`, `MessengerPair`, `TestSession`, `EventsPipelineFixture`, `MultiInstancePopulator`, `TestAgent`.
+### Documentation
+Module docs live in `docs/` and are included via `#[doc = include_str!("../docs/...")]`. When modifying a module, update the corresponding doc file.
+### Key Patterns
+- **Trait-based abstraction**: `Leader`, `Worker`, `WorkerTransfers`, `ObjectBlockOps`, `CollectiveOps`, `KeyFormatter` — implementations are swappable (real vs. test stubs).
+- **RAII resource management**: `BlockHolder` holds blocks during sessions with automatic release on drop. `TransferHandle` tracks offload operations.
+- **Builder pattern**: `InstanceLeaderBuilder`, `PhysicalWorkerBuilder`, `KvbmRuntimeBuilder`, `OffloadEngineBuilder`.
+- **Execution vs. coordination state**: `PhysicalWorker` owns execution state; `CoordinatedWorker` adds the leader's coordination view. Same API regardless of worker locality.
+### Workspace Dependencies
+Internal crates: `kvbm-common`, `kvbm-config`, `kvbm-kernels`, `kvbm-logical`, `kvbm-physical`, `velo`, `dynamo-tokens`, `dynamo-memory`.
+## Offload Module Governance
+The offload module (`src/offload/`) has explicit policies (P1–P6) documented in its README. Before modifying offload code, read `src/offload/AGENTS.md` and the offload docs (`docs/offload.md`, `docs/offload-developer.md`). Off-policy changes require user approval before implementation.
--- a/lib/kvbm-engine/Cargo.toml
+++ b/lib/kvbm-engine/Cargo.toml
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+[package]
+name = "kvbm-engine"
+version = "1.0.0"
+# The following keys are inherited from the workspace root manifest.
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+description = "Distributed coordination primitives for KVBM"
+[dependencies]
+# KVBM crates and Velo, resolved through the workspace dependency table
+kvbm-common = { workspace = true }
+kvbm-config = { workspace = true }
+kvbm-logical = { workspace = true }
+kvbm-physical = { workspace = true }
+velo = { workspace = true }
+# Workspace deps
+dynamo-memory = { workspace = true }
+anyhow = { workspace = true }
+dashmap = { workspace = true }
+derive_builder = { workspace = true }
+futures = { workspace = true }
+parking_lot = { workspace = true }
+rmp-serde = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+tokio = { workspace = true }
+tokio-stream = { workspace = true }
+tracing = { workspace = true }
+uuid = { workspace = true }
+# Non-workspace (versions pinned here rather than in the workspace manifest)
+bytes = "1.10"
+crossbeam-queue = "0.3"
+oneshot = "0.1.11"
+# Optional: compiled only when an entry in [features] enables them via `dep:`
+cudarc = { workspace = true, optional = true }
+aws-sdk-s3 = { version = "1.120.0", optional = true }
+aws-config = { version = "1.8.11", optional = true }
+rayon = { version = "1", optional = true }
+tokio-rayon = { version = "2", optional = true }
+chrono = { version = "0.4", optional = true }
+async-nats = { workspace = true, optional = true }
+flume = { version = "0.11", optional = true }
+clap = { version = "4", features = ["derive"], optional = true }
+figment = { version = "0.10", features = ["env", "toml"], optional = true }
+libc = { version = "0.2", optional = true }
+tracing-subscriber = { workspace = true, optional = true }
+nvtx = { version = "1.3", optional = true }
+[features]
+# S3 support is on unless the crate is built with default features disabled.
+default = ["s3"]
+# S3/MinIO object storage (G4 tier).
+s3 = ["dep:aws-sdk-s3", "dep:aws-config", "dep:rayon", "dep:tokio-rayon", "dep:chrono"]
+# Multi-GPU collectives; at the manifest level this only turns on `nccl`.
+collectives = ["nccl"]
+# NCCL via cudarc.
+nccl = ["dep:cudarc"]
+# NOTE(review): unlike `testing-s3`, this does not enable `testing` — confirm that is intended.
+testing-nccl = ["collectives"]
+# NATS-based pub/sub transport.
+nats = ["dep:async-nats", "dep:flume"]
+# Test utilities; also switches on the `testing` features of kvbm-logical and kvbm-physical.
+testing = ["kvbm-logical/testing", "kvbm-physical/testing"]
+testing-s3 = ["s3", "testing"]
+# NVTX profiling markers; forwarded to kvbm-config as well.
+nvtx = ["kvbm-config/nvtx", "dep:nvtx"]
+# Everything the `bench_engine` binary needs (see its `required-features`).
+bench = ["dep:clap", "dep:figment", "dep:libc", "dep:tracing-subscriber", "dep:chrono", "testing"]
+# Benchmark binary; Cargo skips it unless the `bench` feature is enabled.
+[[bin]]
+name = "bench_engine"
+path = "bin/bench_engine.rs"
+required-features = ["bench"]
+# Exempts `rayon` from cargo-machete's unused-dependency report.
+# NOTE(review): presumably rayon is not referenced directly in the source — confirm.
+[package.metadata.cargo-machete]
+ignored = ["rayon"]
--- a/lib/kvbm-engine/README.md
+++ b/lib/kvbm-engine/README.md
+# kvbm-engine
+Distributed coordination primitives for KV cache block management (KVBM).
+This crate implements the leader/worker architecture for managing KV cache blocks across a tiered storage hierarchy:
+**G1** (GPU HBM) → **G2** (Pinned DRAM) → **G3** (NVMe/SSD) → **G4** (S3/MinIO)
+Leaders own block metadata and make placement decisions. Workers execute data transfers (RDMA, NVMe, object storage). Sessions coordinate multi-instance block transfers.
+## Feature Flags
+| Flag           | Purpose                                  |
+| -------------- | ---------------------------------------- |
+| `s3` (default) | S3/MinIO object storage (G4 tier)        |
+| `testing`      | Test utilities and mock infrastructure   |
+| `nats`         | NATS-based pub/sub transport             |
+| `collectives`  | NIXL + NCCL multi-GPU collectives        |
+| `nccl`         | NCCL via cudarc                          |
+| `nvtx`         | NVIDIA Tools Extension profiling markers |
+## Documentation
+Detailed module documentation lives in [`docs/`](docs/):
+- [Architecture](docs/architecture.md) — Overall system design
+- [Leader](docs/leader.md) — Block coordination and metadata management
+- [Session](docs/session.md) — Distributed onboarding protocol
+- [Worker](docs/worker.md) — Transfer execution
+- [Worker Group](docs/worker-group.md) — SPMD parallel workers
+- [Offload](docs/offload.md) — Async tier-demotion pipeline
+- [Offload Developer Guide](docs/offload-developer.md) — Contributing to the offload module
+- [Object Storage](docs/object.md) — S3/MinIO integration
+- [Runtime](docs/runtime.md) — Runtime bundle (tokio, Velo, NIXL)
+- [Testing](docs/testing.md) — Test utilities and fixtures