"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "bf9c93c7e0cd644433237aae8f72a46e99608b61"
Unverified Commit 008683d6 authored by Ryan Olson, committed by GitHub
Browse files

feat: adding kvbm-engine (#6773)


Signed-off-by: Ryan Olson <rolson@nvidia.com>
parent cf79c4fc
This diff is collapsed.
...@@ -11,6 +11,8 @@ members = [ ...@@ -11,6 +11,8 @@ members = [
"lib/kv-router", "lib/kv-router",
"lib/memory", "lib/memory",
"lib/kvbm-common", "lib/kvbm-common",
"lib/kvbm-config",
"lib/kvbm-engine",
"lib/kvbm-kernels", "lib/kvbm-kernels",
"lib/kvbm-logical", "lib/kvbm-logical",
"lib/kvbm-physical", "lib/kvbm-physical",
...@@ -19,9 +21,6 @@ members = [ ...@@ -19,9 +21,6 @@ members = [
"lib/bench", "lib/bench",
"lib/bindings/c", "lib/bindings/c",
"lib/bindings/python/codegen", "lib/bindings/python/codegen",
"lib/velo-common",
"lib/velo-transports",
"lib/velo-events",
] ]
resolver = "3" resolver = "3"
...@@ -48,16 +47,17 @@ dynamo-protocols = { path = "lib/protocols", version = "1.0.0" } ...@@ -48,16 +47,17 @@ dynamo-protocols = { path = "lib/protocols", version = "1.0.0" }
dynamo-parsers = { path = "lib/parsers", version = "1.0.0" } dynamo-parsers = { path = "lib/parsers", version = "1.0.0" }
fastokens = { version = "0.1.0" } fastokens = { version = "0.1.0" }
# kvbm # kvbm
kvbm-common = { path = "lib/kvbm-common", version = "0.1.0" } kvbm-common = { path = "lib/kvbm-common", version = "1.0.0" }
kvbm-kernels = { path = "lib/kvbm-kernels", version = "0.1.0" } kvbm-config = { path = "lib/kvbm-config", version = "1.0.0" }
kvbm-logical = { path = "lib/kvbm-logical", version = "0.1.0" } kvbm-engine = { path = "lib/kvbm-engine", version = "1.0.0" }
kvbm-physical = { path = "lib/kvbm-physical", version = "0.1.0" } kvbm-kernels = { path = "lib/kvbm-kernels", version = "1.0.0" }
kvbm-logical = { path = "lib/kvbm-logical", version = "1.0.0" }
kvbm-physical = { path = "lib/kvbm-physical", version = "1.0.0" }
# velo # velo
velo-common = { path = "lib/velo-common", version = "0.1.0" } velo = { version = "0.1.0" }
velo-transports = { path = "lib/velo-transports", version = "0.1.0" }
velo-events = { path = "lib/velo-events", version = "0.1.0" }
# External dependencies # External dependencies
anyhow = { version = "1" } anyhow = { version = "1" }
......
...@@ -1249,19 +1249,6 @@ dependencies = [ ...@@ -1249,19 +1249,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "dashmap"
version = "5.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
dependencies = [
"cfg-if",
"hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core",
]
[[package]] [[package]]
name = "dashmap" name = "dashmap"
version = "6.1.0" version = "6.1.0"
...@@ -1523,7 +1510,7 @@ version = "1.0.0" ...@@ -1523,7 +1510,7 @@ version = "1.0.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-runtime", "dynamo-runtime",
...@@ -1566,10 +1553,11 @@ dependencies = [ ...@@ -1566,10 +1553,11 @@ dependencies = [
"bytes", "bytes",
"chrono", "chrono",
"cudarc", "cudarc",
"dashmap 5.5.3", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dialoguer", "dialoguer",
"dynamo-config",
"dynamo-kv-router", "dynamo-kv-router",
"dynamo-memory", "dynamo-memory",
"dynamo-mocker", "dynamo-mocker",
...@@ -1636,6 +1624,7 @@ dependencies = [ ...@@ -1636,6 +1624,7 @@ dependencies = [
"anyhow", "anyhow",
"cudarc", "cudarc",
"libc", "libc",
"libloading 0.8.9",
"nix 0.30.1", "nix 0.30.1",
"nixl-sys", "nixl-sys",
"offset-allocator", "offset-allocator",
...@@ -1649,7 +1638,7 @@ name = "dynamo-mocker" ...@@ -1649,7 +1638,7 @@ name = "dynamo-mocker"
version = "1.0.0" version = "1.0.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-kv-router", "dynamo-kv-router",
...@@ -1718,7 +1707,7 @@ dependencies = [ ...@@ -1718,7 +1707,7 @@ dependencies = [
"blake3", "blake3",
"bytes", "bytes",
"chrono", "chrono",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-config", "dynamo-config",
...@@ -1775,7 +1764,7 @@ version = "1.0.0" ...@@ -1775,7 +1764,7 @@ version = "1.0.0"
dependencies = [ dependencies = [
"bs58", "bs58",
"bytemuck", "bytemuck",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"serde", "serde",
"thiserror 2.0.18", "thiserror 2.0.18",
...@@ -2014,9 +2003,9 @@ dependencies = [ ...@@ -2014,9 +2003,9 @@ dependencies = [
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.4.0" version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f" checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
dependencies = [ dependencies = [
"getrandom 0.3.4", "getrandom 0.3.4",
] ]
......
...@@ -1267,19 +1267,6 @@ dependencies = [ ...@@ -1267,19 +1267,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "dashmap"
version = "5.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856"
dependencies = [
"cfg-if",
"hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core",
]
[[package]] [[package]]
name = "dashmap" name = "dashmap"
version = "6.1.0" version = "6.1.0"
...@@ -1532,7 +1519,7 @@ dependencies = [ ...@@ -1532,7 +1519,7 @@ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
"axum", "axum",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-runtime", "dynamo-runtime",
...@@ -1578,10 +1565,11 @@ dependencies = [ ...@@ -1578,10 +1565,11 @@ dependencies = [
"bytes", "bytes",
"chrono", "chrono",
"cudarc", "cudarc",
"dashmap 5.5.3", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dialoguer", "dialoguer",
"dynamo-config",
"dynamo-kv-router", "dynamo-kv-router",
"dynamo-memory", "dynamo-memory",
"dynamo-mocker", "dynamo-mocker",
...@@ -1651,6 +1639,7 @@ dependencies = [ ...@@ -1651,6 +1639,7 @@ dependencies = [
"anyhow", "anyhow",
"cudarc", "cudarc",
"libc", "libc",
"libloading 0.8.9",
"nix 0.30.1", "nix 0.30.1",
"nixl-sys", "nixl-sys",
"offset-allocator", "offset-allocator",
...@@ -1664,7 +1653,7 @@ name = "dynamo-mocker" ...@@ -1664,7 +1653,7 @@ name = "dynamo-mocker"
version = "1.0.0" version = "1.0.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-kv-router", "dynamo-kv-router",
...@@ -1725,7 +1714,7 @@ dependencies = [ ...@@ -1725,7 +1714,7 @@ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
"clap", "clap",
"dashmap 6.1.0", "dashmap",
"dynamo-kv-router", "dynamo-kv-router",
"dynamo-llm", "dynamo-llm",
"dynamo-mocker", "dynamo-mocker",
...@@ -1765,7 +1754,7 @@ dependencies = [ ...@@ -1765,7 +1754,7 @@ dependencies = [
"bytes", "bytes",
"chrono", "chrono",
"cudarc", "cudarc",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"derive_builder", "derive_builder",
"dynamo-config", "dynamo-config",
...@@ -1822,7 +1811,7 @@ version = "1.0.0" ...@@ -1822,7 +1811,7 @@ version = "1.0.0"
dependencies = [ dependencies = [
"bs58", "bs58",
"bytemuck", "bytemuck",
"dashmap 6.1.0", "dashmap",
"derive-getters", "derive-getters",
"serde", "serde",
"thiserror 2.0.18", "thiserror 2.0.18",
...@@ -2061,9 +2050,9 @@ dependencies = [ ...@@ -2061,9 +2050,9 @@ dependencies = [
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "2.4.0" version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f" checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
dependencies = [ dependencies = [
"getrandom 0.3.4", "getrandom 0.3.4",
] ]
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
[package] [package]
name = "kvbm-common" name = "kvbm-common"
version = "0.1.0" version = "1.0.0"
edition.workspace = true edition.workspace = true
description.workspace = true description.workspace = true
authors.workspace = true authors.workspace = true
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
[package]
name = "kvbm-config"
version.workspace = true
authors.workspace = true
edition.workspace = true
homepage.workspace = true
license.workspace = true
repository.workspace = true
description = "KVBM Configuration Library for Tokio, Rayon, and Messenger runtimes"

[features]
default = []
# Each feature only enables the optional dependency of the same name.
nvtx = ["dep:nvtx"]
rayon = ["dep:rayon"]

[dependencies]
anyhow = { workspace = true }
# Memory dependencies (for NixL)
dynamo-memory = { workspace = true }
figment = { version = "0.10", features = ["env", "toml", "json"] }
nix = { version = "0.30.1", features = ["net"] }
# Optional; enabled through the `nvtx` feature
nvtx = { version = "1.3", optional = true }
# Optional; enabled through the `rayon` feature
rayon = { version = "1.10", optional = true }
serde = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
validator = { workspace = true }
# Velo dependencies
velo = { workspace = true }

[dev-dependencies]
serde_json = { workspace = true }
temp-env = { version = "0.3.6" }
tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Cache tier configuration for KVBM.
//!
//! Defines configuration for G2 (host/pinned memory) and G3 (disk) cache tiers,
//! as well as the parallelism mode for distributed workers.
//!
//! The leader uses this configuration to coordinate cache tier creation on workers.
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use validator::Validate;
/// Parallelism strategy for KV cache across workers.
///
/// This determines how KV blocks are distributed and transferred across
/// multiple workers in a distributed inference setup.
///
/// Variants are (de)serialized in `snake_case`, i.e. `tensor_parallel` and
/// `replicated_data`. `TensorParallel` is the `Default` value.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ParallelismMode {
/// Tensor parallel: each worker has a shard of each KV block.
///
/// This is the standard approach for tensor-parallel inference where
/// attention heads are split across workers. Each worker stores and
/// transfers only its portion of each KV block.
///
/// All workers have G1, G2, and G3 tiers. Operations execute on all
/// workers simultaneously (SPMD).
///
/// Serialized as `tensor_parallel`.
#[default]
TensorParallel,
/// Replicated data: all workers have full KV blocks (MLA scenario).
///
/// In MLA (Multi-head Latent Attention) architectures, KV blocks are
/// replicated rather than sharded. Only rank 0 has G2/G3 storage;
/// data is broadcast to other ranks after loading to G1.
///
/// This reduces storage requirements on non-rank-0 workers and is
/// suitable when the model's KV representation is the same across
/// all attention heads.
///
/// Serialized as `replicated_data`.
ReplicatedData,
}
/// Host cache configuration (G2 tier - pinned CPU memory).
///
/// The host cache provides a staging area for KV blocks between GPU and disk.
/// Memory is allocated as pinned (page-locked) for efficient DMA transfers.
///
/// Both fields are optional. When neither is set, `is_enabled` returns `false`
/// and `compute_num_blocks` returns `None`.
///
/// `Validate` is derived, but no field-level validation rules are declared on
/// this struct.
#[derive(Debug, Clone, Serialize, Deserialize, Validate, Default)]
pub struct HostCacheConfig {
/// Cache size in gigabytes.
/// Used to compute num_blocks if not explicitly set.
/// `compute_num_blocks` treats one gigabyte as 1_000_000_000 bytes.
pub cache_size_gb: Option<f64>,
/// Explicit number of blocks for the host cache.
/// Takes priority over cache_size_gb if set.
pub num_blocks: Option<usize>,
}
impl HostCacheConfig {
    /// Resolve how many blocks the host cache should hold.
    ///
    /// An explicit `num_blocks` wins; otherwise the count is derived from
    /// `cache_size_gb`, treating one gigabyte as 1_000_000_000 bytes and
    /// truncating the quotient to an integer.
    ///
    /// # Arguments
    /// * `bytes_per_block` - Size of a single block in bytes
    ///
    /// # Returns
    /// `None` when `bytes_per_block` is zero or when neither `num_blocks`
    /// nor `cache_size_gb` is configured; otherwise the block count.
    pub fn compute_num_blocks(&self, bytes_per_block: usize) -> Option<usize> {
        match (bytes_per_block, self.num_blocks, self.cache_size_gb) {
            // A zero-sized block makes the division meaningless.
            (0, _, _) => None,
            // Explicit block count takes priority over the size-based estimate.
            (_, Some(explicit), _) => Some(explicit),
            (_, None, Some(gigabytes)) => {
                let capacity_bytes = gigabytes * 1_000_000_000.0;
                Some((capacity_bytes / bytes_per_block as f64) as usize)
            }
            (_, None, None) => None,
        }
    }

    /// Report whether the host cache has been configured at all, i.e. at
    /// least one of `num_blocks` / `cache_size_gb` is set.
    pub fn is_enabled(&self) -> bool {
        !(self.num_blocks.is_none() && self.cache_size_gb.is_none())
    }
}
/// Disk cache configuration (G3 tier - persistent storage).
///
/// The disk cache provides extended capacity for KV blocks beyond GPU and host memory.
/// Can use either GPU Direct Storage (GDS) for direct GPU-disk transfers or POSIX
/// for regular file I/O.
///
/// `is_enabled` only looks at `num_blocks` / `cache_size_gb`; setting just
/// `use_gds` or `storage_path` does not enable the tier.
///
/// `Validate` is derived, but no field-level validation rules are declared on
/// this struct.
#[derive(Debug, Clone, Serialize, Deserialize, Validate, Default)]
pub struct DiskCacheConfig {
/// Cache size in gigabytes.
/// Used to compute num_blocks if not explicitly set.
/// `compute_num_blocks` treats one gigabyte as 1_000_000_000 bytes.
pub cache_size_gb: Option<f64>,
/// Explicit number of blocks for the disk cache.
/// Takes priority over cache_size_gb if set.
pub num_blocks: Option<usize>,
/// Use GPU Direct Storage (GDS) if available.
/// When true, enables GDS_MT backend for direct GPU-disk transfers.
/// When false or GDS unavailable, falls back to POSIX backend.
/// Defaults to `false` when omitted from the input.
#[serde(default)]
pub use_gds: bool,
/// Storage path for disk cache files.
/// If None, a default path will be used.
pub storage_path: Option<PathBuf>,
}
impl DiskCacheConfig {
    /// Resolve how many blocks the disk cache should hold.
    ///
    /// An explicit `num_blocks` wins; otherwise the count is derived from
    /// `cache_size_gb`, treating one gigabyte as 1_000_000_000 bytes and
    /// truncating the quotient to an integer.
    ///
    /// # Arguments
    /// * `bytes_per_block` - Size of a single block in bytes
    ///
    /// # Returns
    /// `None` when `bytes_per_block` is zero or when neither `num_blocks`
    /// nor `cache_size_gb` is configured; otherwise the block count.
    pub fn compute_num_blocks(&self, bytes_per_block: usize) -> Option<usize> {
        // Guard first: a zero-sized block makes the division meaningless.
        if bytes_per_block == 0 {
            return None;
        }

        // Explicit block count takes priority over the size-based estimate.
        if let Some(explicit) = self.num_blocks {
            return Some(explicit);
        }

        let gigabytes = self.cache_size_gb?;
        let capacity_bytes = gigabytes * 1_000_000_000.0;
        Some((capacity_bytes / bytes_per_block as f64) as usize)
    }

    /// Report whether the disk cache has been configured at all, i.e. at
    /// least one of `num_blocks` / `cache_size_gb` is set.
    pub fn is_enabled(&self) -> bool {
        !(self.num_blocks.is_none() && self.cache_size_gb.is_none())
    }
}
/// Top-level cache configuration.
///
/// Groups host (G2) and disk (G3) cache configurations together,
/// plus the parallelism mode for distributed workers.
///
/// Use Figment profiles to configure different cache settings for leader vs worker.
///
/// The derived `Default` yields an unconfigured host cache, no disk cache
/// (`None`) and `ParallelismMode::TensorParallel`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
pub struct CacheConfig {
/// Host cache (G2 tier) - pinned CPU memory.
/// Falls back to `HostCacheConfig::default()` when omitted from the input.
#[serde(default)]
#[validate(nested)]
pub host: HostCacheConfig,
/// Disk cache (G3 tier) - persistent storage.
/// Optional - only configure if disk caching is needed.
#[validate(nested)]
pub disk: Option<DiskCacheConfig>,
/// Parallelism mode for distributed workers.
///
/// - `TensorParallel` (default): Each worker has a shard of each KV block
/// - `ReplicatedData`: Only rank 0 has G2/G3; data is broadcast on load
///
/// Can be set via env var: `KVBM_CACHE_PARALLELISM=tensor_parallel|replicated_data`
#[serde(default)]
pub parallelism: ParallelismMode,
}
#[cfg(test)]
mod tests {
    //! Unit tests for the cache tier configuration types.
    //!
    //! Besides the defaults and serde names, these cover the block-count
    //! resolution rules for both tiers: explicit `num_blocks` wins, the
    //! size-based estimate uses decimal gigabytes, and a zero block size or
    //! an unconfigured tier yields `None`.
    use super::*;

    #[test]
    fn test_host_cache_default() {
        let config = HostCacheConfig::default();
        assert!(config.cache_size_gb.is_none());
        assert!(config.num_blocks.is_none());
        assert!(!config.is_enabled());
    }

    #[test]
    fn test_host_cache_explicit_blocks() {
        let config = HostCacheConfig {
            num_blocks: Some(1000),
            cache_size_gb: Some(10.0), // Should be ignored
        };
        // With 1MB blocks, explicit num_blocks takes priority
        let bytes_per_block = 1_000_000;
        assert_eq!(config.compute_num_blocks(bytes_per_block), Some(1000));
        assert!(config.is_enabled());
    }

    #[test]
    fn test_host_cache_from_size_gb() {
        let config = HostCacheConfig {
            num_blocks: None,
            cache_size_gb: Some(10.0), // 10 GB
        };
        // With 1MB blocks: 10GB / 1MB = 10,000 blocks
        let bytes_per_block = 1_000_000;
        assert_eq!(config.compute_num_blocks(bytes_per_block), Some(10_000));
        assert!(config.is_enabled());
    }

    #[test]
    fn test_host_cache_zero_block_size() {
        // The zero guard applies even when an explicit block count is set.
        let config = HostCacheConfig {
            num_blocks: Some(1000),
            cache_size_gb: Some(10.0),
        };
        assert_eq!(config.compute_num_blocks(0), None);
    }

    #[test]
    fn test_host_cache_unconfigured_has_no_blocks() {
        let config = HostCacheConfig::default();
        assert_eq!(config.compute_num_blocks(1_000_000), None);
    }

    #[test]
    fn test_disk_cache_default() {
        let config = DiskCacheConfig::default();
        assert!(config.cache_size_gb.is_none());
        assert!(config.num_blocks.is_none());
        assert!(!config.use_gds);
        assert!(config.storage_path.is_none());
        assert!(!config.is_enabled());
    }

    #[test]
    fn test_disk_cache_with_gds() {
        let config = DiskCacheConfig {
            num_blocks: Some(5000),
            cache_size_gb: None,
            use_gds: true,
            storage_path: Some(PathBuf::from("/mnt/nvme/kv_cache")),
        };
        assert!(config.use_gds);
        assert_eq!(
            config.storage_path,
            Some(PathBuf::from("/mnt/nvme/kv_cache"))
        );
        assert!(config.is_enabled());
    }

    #[test]
    fn test_disk_cache_compute_num_blocks() {
        let bytes_per_block = 1_000_000;

        // Explicit block count takes priority over the size-based estimate.
        let explicit = DiskCacheConfig {
            num_blocks: Some(5000),
            cache_size_gb: Some(2.0),
            ..Default::default()
        };
        assert_eq!(explicit.compute_num_blocks(bytes_per_block), Some(5000));

        // With 1MB blocks: 2GB / 1MB = 2,000 blocks
        let from_size = DiskCacheConfig {
            cache_size_gb: Some(2.0),
            ..Default::default()
        };
        assert_eq!(from_size.compute_num_blocks(bytes_per_block), Some(2_000));

        // Zero block size and an unconfigured tier both yield None.
        assert_eq!(from_size.compute_num_blocks(0), None);
        assert_eq!(
            DiskCacheConfig::default().compute_num_blocks(bytes_per_block),
            None
        );
    }

    #[test]
    fn test_parallelism_mode_default() {
        let mode = ParallelismMode::default();
        assert_eq!(mode, ParallelismMode::TensorParallel);
    }

    #[test]
    fn test_parallelism_mode_serde() {
        // Test serialization
        let tp = ParallelismMode::TensorParallel;
        let json = serde_json::to_string(&tp).unwrap();
        assert_eq!(json, "\"tensor_parallel\"");
        let rd = ParallelismMode::ReplicatedData;
        let json = serde_json::to_string(&rd).unwrap();
        assert_eq!(json, "\"replicated_data\"");

        // Test deserialization
        let mode: ParallelismMode = serde_json::from_str("\"tensor_parallel\"").unwrap();
        assert_eq!(mode, ParallelismMode::TensorParallel);
        let mode: ParallelismMode = serde_json::from_str("\"replicated_data\"").unwrap();
        assert_eq!(mode, ParallelismMode::ReplicatedData);
    }

    #[test]
    fn test_cache_config_with_parallelism() {
        let config = CacheConfig {
            host: HostCacheConfig::default(),
            disk: None,
            parallelism: ParallelismMode::ReplicatedData,
        };
        assert_eq!(config.parallelism, ParallelismMode::ReplicatedData);
    }

    #[test]
    fn test_cache_config_default_parallelism() {
        let config = CacheConfig::default();
        assert_eq!(config.parallelism, ParallelismMode::TensorParallel);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Discovery configuration for Nova peer discovery.
//!
//! Supports three discovery backends:
//! - **Etcd**: Centralized discovery using etcd key-value store
//! - **P2P**: Decentralized discovery using libp2p DHT
//! - **Filesystem**: File-based discovery for development/testing
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use validator::Validate;
/// Discovery configuration - only one type can be active at a time.
///
/// Serialized as an internally tagged enum: the `type` field selects the
/// variant (`etcd`, `p2p` or `filesystem`) and the variant's own fields sit
/// next to it in the same object.
///
/// NOTE(review): this enum does not derive `Validate`, so the range rules
/// declared on the variant structs only run when `validate()` is called on
/// the inner struct directly — confirm callers do so.
///
/// # JSON Configuration Examples
///
/// ## Etcd Discovery
/// ```json
/// {
///   "type": "etcd",
///   "cluster_id": "my-cluster",
///   "endpoints": ["http://etcd1:2379", "http://etcd2:2379"],
///   "ttl_secs": 60
/// }
/// ```
///
/// ## P2P Discovery
/// ```json
/// {
///   "type": "p2p",
///   "cluster_id": "my-cluster",
///   "listen_port": 0,
///   "bootstrap_peers": ["192.168.1.10:4001"],
///   "enable_mdns": true
/// }
/// ```
///
/// ## Filesystem Discovery
/// ```json
/// {
///   "type": "filesystem",
///   "path": "/tmp/discovery.json"
/// }
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum DiscoveryConfig {
/// Etcd-based discovery (centralized). Tagged as `"type": "etcd"`.
Etcd(EtcdDiscoveryConfig),
/// P2P discovery using libp2p DHT (decentralized). Tagged as `"type": "p2p"`.
P2p(P2pDiscoveryConfig),
/// Filesystem-based discovery (for dev/testing). Tagged as `"type": "filesystem"`.
Filesystem(FilesystemDiscoveryConfig),
}
/// Etcd discovery configuration.
///
/// Every field except `cluster_id` has a serde default, so only `cluster_id`
/// must be present when deserializing. `validate()` checks the ranges on
/// `ttl_secs` and `max_retries`; the other fields carry no validation rules.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct EtcdDiscoveryConfig {
/// Cluster ID / key prefix for discovery (required).
/// Note that the `Default` impl fills this with an empty string.
pub cluster_id: String,
/// Etcd endpoints (default: ["http://localhost:2379"]).
#[serde(default = "default_etcd_endpoints")]
pub endpoints: Vec<String>,
/// Lease TTL in seconds (default: 60, range: 10-600).
#[serde(default = "default_etcd_ttl")]
#[validate(range(min = 10, max = 600))]
pub ttl_secs: u64,
/// Operation timeout in seconds (default: 30).
/// No range validation is declared for this field.
#[serde(default = "default_operation_timeout")]
pub operation_timeout_secs: u64,
/// Max retries for operations (default: 3).
/// Validated to be at most 10; the `min = 0` bound is always satisfied
/// because the field is unsigned.
#[serde(default = "default_max_retries")]
#[validate(range(min = 0, max = 10))]
pub max_retries: u32,
}
/// P2P discovery configuration.
///
/// Every field except `cluster_id` has a serde default, so only `cluster_id`
/// must be present when deserializing. `Validate` is derived, but no
/// field-level validation rules are declared on this struct.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct P2pDiscoveryConfig {
/// Cluster ID / swarm key (required).
/// Note that the `Default` impl fills this with an empty string.
pub cluster_id: String,
/// Listen port (default: 0 = OS-assigned).
#[serde(default)]
pub listen_port: u16,
/// Bootstrap peer addresses.
/// Defaults to an empty list.
#[serde(default)]
pub bootstrap_peers: Vec<String>,
/// DHT replication factor (default: 3).
#[serde(default = "default_replication_factor")]
pub replication_factor: usize,
/// Enable mDNS for local network discovery (default: false).
#[serde(default)]
pub enable_mdns: bool,
/// Record TTL in seconds (default: 600).
#[serde(default = "default_record_ttl")]
pub record_ttl_secs: u64,
}
/// Filesystem discovery configuration.
///
/// `path` is the only field and has no serde default, so it must be present
/// when deserializing. This struct derives neither `Default` nor `Validate`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FilesystemDiscoveryConfig {
/// Path to the discovery JSON file.
pub path: PathBuf,
}
/// Serde default for `EtcdDiscoveryConfig::endpoints`: a single local endpoint.
fn default_etcd_endpoints() -> Vec<String> {
    let local_endpoint = String::from("http://localhost:2379");
    vec![local_endpoint]
}
/// Serde default for `EtcdDiscoveryConfig::ttl_secs`.
fn default_etcd_ttl() -> u64 {
    const DEFAULT_TTL_SECS: u64 = 60;
    DEFAULT_TTL_SECS
}
/// Serde default for `EtcdDiscoveryConfig::operation_timeout_secs`.
fn default_operation_timeout() -> u64 {
    const DEFAULT_TIMEOUT_SECS: u64 = 30;
    DEFAULT_TIMEOUT_SECS
}
/// Serde default for `EtcdDiscoveryConfig::max_retries`.
fn default_max_retries() -> u32 {
    const DEFAULT_RETRIES: u32 = 3;
    DEFAULT_RETRIES
}
/// Serde default for `P2pDiscoveryConfig::replication_factor`.
fn default_replication_factor() -> usize {
    const DEFAULT_REPLICAS: usize = 3;
    DEFAULT_REPLICAS
}
/// Serde default for `P2pDiscoveryConfig::record_ttl_secs`.
fn default_record_ttl() -> u64 {
    const DEFAULT_RECORD_TTL_SECS: u64 = 600;
    DEFAULT_RECORD_TTL_SECS
}
impl Default for EtcdDiscoveryConfig {
    /// Build a config with an empty `cluster_id` and every other field taken
    /// from the same helper functions that serde uses for missing fields.
    fn default() -> Self {
        let endpoints = default_etcd_endpoints();
        let ttl_secs = default_etcd_ttl();
        let operation_timeout_secs = default_operation_timeout();
        let max_retries = default_max_retries();

        Self {
            cluster_id: String::new(),
            endpoints,
            ttl_secs,
            operation_timeout_secs,
            max_retries,
        }
    }
}
impl Default for P2pDiscoveryConfig {
    /// Build a config with an empty `cluster_id`, port 0, no bootstrap peers,
    /// mDNS off, and the replication factor / record TTL taken from the same
    /// helper functions that serde uses for missing fields.
    fn default() -> Self {
        let replication_factor = default_replication_factor();
        let record_ttl_secs = default_record_ttl();

        Self {
            cluster_id: String::new(),
            listen_port: 0,
            bootstrap_peers: Vec::new(),
            replication_factor,
            enable_mdns: false,
            record_ttl_secs,
        }
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the discovery configuration types.
    //!
    //! Covers the tagged-enum JSON shape, serde defaults, the `Default`
    //! impls, the rejection paths (unknown `type`, missing `cluster_id`),
    //! and the `validate(range)` rules declared on `EtcdDiscoveryConfig`.
    use super::*;

    #[test]
    fn test_deserialize_etcd_config() {
        let json = r#"{
            "type": "etcd",
            "cluster_id": "test-cluster",
            "endpoints": ["http://etcd1:2379"],
            "ttl_secs": 120
        }"#;
        let config: DiscoveryConfig = serde_json::from_str(json).unwrap();
        match config {
            DiscoveryConfig::Etcd(etcd) => {
                assert_eq!(etcd.cluster_id, "test-cluster");
                assert_eq!(etcd.endpoints, vec!["http://etcd1:2379"]);
                assert_eq!(etcd.ttl_secs, 120);
                assert_eq!(etcd.operation_timeout_secs, 30); // default
                assert_eq!(etcd.max_retries, 3); // default
            }
            _ => panic!("Expected Etcd config"),
        }
    }

    #[test]
    fn test_deserialize_p2p_config() {
        let json = r#"{
            "type": "p2p",
            "cluster_id": "test-cluster",
            "listen_port": 4001,
            "bootstrap_peers": ["192.168.1.10:4001"],
            "enable_mdns": true
        }"#;
        let config: DiscoveryConfig = serde_json::from_str(json).unwrap();
        match config {
            DiscoveryConfig::P2p(p2p) => {
                assert_eq!(p2p.cluster_id, "test-cluster");
                assert_eq!(p2p.listen_port, 4001);
                assert_eq!(p2p.bootstrap_peers, vec!["192.168.1.10:4001"]);
                assert!(p2p.enable_mdns);
                assert_eq!(p2p.replication_factor, 3); // default
                assert_eq!(p2p.record_ttl_secs, 600); // default
            }
            _ => panic!("Expected P2p config"),
        }
    }

    #[test]
    fn test_deserialize_filesystem_config() {
        let json = r#"{
            "type": "filesystem",
            "path": "/tmp/discovery.json"
        }"#;
        let config: DiscoveryConfig = serde_json::from_str(json).unwrap();
        match config {
            DiscoveryConfig::Filesystem(fs) => {
                assert_eq!(fs.path, PathBuf::from("/tmp/discovery.json"));
            }
            _ => panic!("Expected Filesystem config"),
        }
    }

    #[test]
    fn test_deserialize_rejects_bad_input() {
        // An unknown tag does not match any variant.
        let unknown = r#"{"type": "zookeeper", "cluster_id": "c"}"#;
        assert!(serde_json::from_str::<DiscoveryConfig>(unknown).is_err());

        // `cluster_id` has no serde default, so it must be present.
        let missing_cluster = r#"{"type": "etcd"}"#;
        assert!(serde_json::from_str::<DiscoveryConfig>(missing_cluster).is_err());
    }

    #[test]
    fn test_serialize_etcd_config() {
        let config = DiscoveryConfig::Etcd(EtcdDiscoveryConfig {
            cluster_id: "my-cluster".to_string(),
            endpoints: vec!["http://localhost:2379".to_string()],
            ttl_secs: 60,
            operation_timeout_secs: 30,
            max_retries: 3,
        });
        let json = serde_json::to_string(&config).unwrap();
        assert!(json.contains(r#""type":"etcd""#));
        assert!(json.contains(r#""cluster_id":"my-cluster""#));
    }

    #[test]
    fn test_etcd_default() {
        let config = EtcdDiscoveryConfig::default();
        assert!(config.cluster_id.is_empty());
        assert_eq!(config.endpoints, vec!["http://localhost:2379"]);
        assert_eq!(config.ttl_secs, 60);
        assert_eq!(config.operation_timeout_secs, 30);
        assert_eq!(config.max_retries, 3);
    }

    #[test]
    fn test_etcd_validation_ranges() {
        // Defaults sit inside every declared range.
        assert!(EtcdDiscoveryConfig::default().validate().is_ok());

        // ttl_secs must lie in 10..=600.
        let ttl_too_short = EtcdDiscoveryConfig {
            ttl_secs: 5,
            ..Default::default()
        };
        assert!(ttl_too_short.validate().is_err());

        let ttl_too_long = EtcdDiscoveryConfig {
            ttl_secs: 601,
            ..Default::default()
        };
        assert!(ttl_too_long.validate().is_err());

        // max_retries must not exceed 10.
        let too_many_retries = EtcdDiscoveryConfig {
            max_retries: 11,
            ..Default::default()
        };
        assert!(too_many_retries.validate().is_err());
    }

    #[test]
    fn test_p2p_default() {
        let config = P2pDiscoveryConfig::default();
        assert!(config.cluster_id.is_empty());
        assert_eq!(config.listen_port, 0);
        assert!(config.bootstrap_peers.is_empty());
        assert_eq!(config.replication_factor, 3);
        assert!(!config.enable_mdns);
        assert_eq!(config.record_ttl_secs, 600);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Event publishing configuration for KV cache coordination.
//!
//! This module defines the configuration for the event publishing pipeline
//! that broadcasts block registration/removal events to distributed consumers
//! (e.g., KvbmHub for radix tree maintenance).
use serde::{Deserialize, Serialize};
use validator::Validate;
/// Configuration for event publishing.
///
/// Events are broadcast when blocks are registered or removed from the cache.
/// The pipeline batches events for efficient wire transmission.
///
/// Every field carries a serde default, so an empty input deserializes to the
/// same values as `EventsConfig::default()`. `validate()` checks
/// `channel_capacity` and, through `#[validate(nested)]`, the `batching`
/// settings; `enabled`, `subject` and `policy` carry no validation rules.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct EventsConfig {
/// Whether event publishing is enabled.
///
/// When disabled, no events are emitted and no publisher is started.
/// Default: false
#[serde(default)]
pub enabled: bool,
/// Batching configuration for the event pipeline.
#[serde(default)]
#[validate(nested)]
pub batching: BatchingConfig,
/// Broadcast channel capacity for the EventsManager.
///
/// This determines how many events can be buffered before slow
/// subscribers start lagging. Default: 1024
///
/// Validated to lie between 16 and 65536.
#[serde(default = "default_channel_capacity")]
#[validate(range(min = 16, max = 65536))]
pub channel_capacity: usize,
/// Subject/topic pattern for publishing events.
///
/// This is the NATS/messaging subject where events are published.
/// Default: "kvbm.events"
#[serde(default = "default_subject")]
pub subject: String,
/// Event emission policy.
///
/// Determines which blocks trigger events:
/// - `power_of_two`: Only emit for blocks at power-of-2 positions (default)
/// - `all`: Emit for all blocks (testing/debugging)
#[serde(default)]
pub policy: EventPolicyConfig,
}
impl Default for EventsConfig {
    /// Publishing disabled, default batching and policy, and the channel
    /// capacity / subject taken from the same helper functions that serde
    /// uses for missing fields.
    fn default() -> Self {
        let channel_capacity = default_channel_capacity();
        let subject = default_subject();

        Self {
            enabled: false,
            batching: BatchingConfig::default(),
            channel_capacity,
            subject,
            policy: EventPolicyConfig::default(),
        }
    }
}
/// Serde default for `EventsConfig::channel_capacity`.
fn default_channel_capacity() -> usize {
    const DEFAULT_CAPACITY: usize = 1024;
    DEFAULT_CAPACITY
}
/// Serde default for `EventsConfig::subject`.
fn default_subject() -> String {
    String::from("kvbm.events")
}
/// Batching configuration for the event pipeline.
///
/// Events are batched before publishing to reduce wire traffic.
/// Batches are flushed when:
/// - The window duration expires
/// - The max batch size is reached
/// - The event type switches (Create -> Remove or vice versa)
///
/// Both fields have serde defaults and are range-checked by `validate()`.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct BatchingConfig {
/// Maximum time to wait before flushing a batch (in milliseconds).
///
/// Default: 10ms
///
/// Validated to lie between 1 and 10000.
#[serde(default = "default_window_duration_ms")]
#[validate(range(min = 1, max = 10000))]
pub window_duration_ms: u64,
/// Maximum number of events in a batch before flushing.
///
/// Default: 1024
///
/// Validated to lie between 1 and 65536.
#[serde(default = "default_max_batch_size")]
#[validate(range(min = 1, max = 65536))]
pub max_batch_size: usize,
}
impl Default for BatchingConfig {
    /// Both fields come from the same helper functions that serde uses for
    /// missing fields.
    fn default() -> Self {
        let window_duration_ms = default_window_duration_ms();
        let max_batch_size = default_max_batch_size();

        Self {
            window_duration_ms,
            max_batch_size,
        }
    }
}
/// Serde default for `BatchingConfig::window_duration_ms`.
fn default_window_duration_ms() -> u64 {
    const DEFAULT_WINDOW_MS: u64 = 10;
    DEFAULT_WINDOW_MS
}
/// Serde default for `BatchingConfig::max_batch_size`.
fn default_max_batch_size() -> usize {
    const DEFAULT_BATCH_SIZE: usize = 1024;
    DEFAULT_BATCH_SIZE
}
/// Event emission policy configuration.
///
/// Variants are (de)serialized in `snake_case`, i.e. `power_of_two` and
/// `all`. `PowerOfTwo` is the `Default` value.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum EventPolicyConfig {
/// Emit events only for blocks at power-of-2 positions (default).
///
/// This creates sparse sampling at positions 16, 32, 64, ..., 65536
/// for efficient radix tree construction without tracking every block.
///
/// Serialized as `power_of_two`.
#[default]
PowerOfTwo,
/// Emit events for all blocks.
///
/// Useful for testing or when complete block tracking is needed.
///
/// Serialized as `all`.
All,
}
#[cfg(test)]
mod tests {
    //! Unit tests for the event publishing configuration.
    //!
    //! Covers defaults, serde round-tripping, partial input, the wire names
    //! of the emission policy, and both the accepting and rejecting paths of
    //! the declared validation ranges (including the nested batching rules).
    use super::*;

    #[test]
    fn test_default_config() {
        let config = EventsConfig::default();
        assert!(!config.enabled);
        assert_eq!(config.batching.window_duration_ms, 10);
        assert_eq!(config.batching.max_batch_size, 1024);
        assert_eq!(config.channel_capacity, 1024);
        assert_eq!(config.subject, "kvbm.events");
        assert_eq!(config.policy, EventPolicyConfig::PowerOfTwo);
    }

    #[test]
    fn test_serde_roundtrip() {
        let json = r#"{
            "enabled": true,
            "batching": {
                "window_duration_ms": 50,
                "max_batch_size": 512
            },
            "channel_capacity": 2048,
            "subject": "my.events",
            "policy": "all"
        }"#;
        let config: EventsConfig = serde_json::from_str(json).unwrap();
        assert!(config.enabled);
        assert_eq!(config.batching.window_duration_ms, 50);
        assert_eq!(config.batching.max_batch_size, 512);
        assert_eq!(config.channel_capacity, 2048);
        assert_eq!(config.subject, "my.events");
        assert_eq!(config.policy, EventPolicyConfig::All);

        // Roundtrip
        let serialized = serde_json::to_string(&config).unwrap();
        let deserialized: EventsConfig = serde_json::from_str(&serialized).unwrap();
        assert_eq!(deserialized.enabled, config.enabled);
        assert_eq!(deserialized.policy, config.policy);
    }

    #[test]
    fn test_policy_serde_names() {
        let power_of_two = serde_json::to_string(&EventPolicyConfig::PowerOfTwo).unwrap();
        assert_eq!(power_of_two, "\"power_of_two\"");
        let all = serde_json::to_string(&EventPolicyConfig::All).unwrap();
        assert_eq!(all, "\"all\"");
    }

    #[test]
    fn test_empty_json_uses_defaults() {
        let json = r#"{}"#;
        let config: EventsConfig = serde_json::from_str(json).unwrap();
        assert!(!config.enabled);
        assert_eq!(config.batching.window_duration_ms, 10);
    }

    #[test]
    fn test_partial_config() {
        // Only override enabled, everything else uses defaults
        let json = r#"{"enabled": true}"#;
        let config: EventsConfig = serde_json::from_str(json).unwrap();
        assert!(config.enabled);
        assert_eq!(config.batching.window_duration_ms, 10);
        assert_eq!(config.channel_capacity, 1024);
    }

    #[test]
    fn test_validation() {
        let config = EventsConfig {
            enabled: true,
            batching: BatchingConfig {
                window_duration_ms: 10,
                max_batch_size: 1024,
            },
            channel_capacity: 1024,
            subject: "test".to_string(),
            policy: EventPolicyConfig::PowerOfTwo,
        };
        assert!(config.validate().is_ok());
    }

    #[test]
    fn test_default_config_is_valid() {
        assert!(EventsConfig::default().validate().is_ok());
    }

    #[test]
    fn test_validation_rejects_out_of_range() {
        // channel_capacity below the declared minimum of 16.
        let small_channel = EventsConfig {
            channel_capacity: 8,
            ..Default::default()
        };
        assert!(small_channel.validate().is_err());

        // Nested batching rules are enforced through the parent.
        let zero_window = EventsConfig {
            batching: BatchingConfig {
                window_duration_ms: 0,
                ..Default::default()
            },
            ..Default::default()
        };
        assert!(zero_window.validate().is_err());

        let empty_batch = EventsConfig {
            batching: BatchingConfig {
                max_batch_size: 0,
                ..Default::default()
            },
            ..Default::default()
        };
        assert!(empty_batch.validate().is_err());
    }
}
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Messenger transport and discovery configuration.
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use anyhow::{Context, Result, bail};
use serde::{Deserialize, Serialize};
use validator::Validate;
use crate::discovery::DiscoveryConfig;
/// Messenger configuration combining backend and discovery settings.
///
/// `validate()` descends into `backend` only; `discovery` carries no
/// `#[validate(nested)]` attribute, so its contents are not checked here.
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
pub struct MessengerConfig {
/// Transport (TCP bind) settings.
/// This field has no `#[serde(default)]`, so plain serde deserialization
/// requires it to be present.
#[validate(nested)]
pub backend: MessengerBackendConfig,
/// Discovery configuration. None = discovery disabled.
#[serde(default)]
pub discovery: Option<DiscoveryConfig>,
}
impl MessengerConfig {
    /// Build a Messenger instance from this configuration.
    ///
    /// Steps, in order:
    /// 1. Reject discovery backends that cannot be built yet (etcd, P2P)
    ///    before any socket is opened.
    /// 2. Bind a TCP listener to the configured address and wrap it in a
    ///    TCP transport.
    /// 3. Attach filesystem discovery, if configured.
    /// 4. Build the Messenger from the transport and optional discovery.
    ///
    /// # Errors
    /// Returns an error when the discovery type is etcd or P2P, when the bind
    /// address cannot be resolved, when binding or inspecting the listener
    /// fails, when the transport or the filesystem discovery cannot be
    /// constructed, or when the Messenger builder fails.
    pub async fn build_messenger(&self) -> Result<std::sync::Arc<velo::Messenger>> {
        use std::net::TcpListener;
        use std::sync::Arc;
        use velo::Messenger;
        use velo::backend::tcp::TcpTransportBuilder;

        // 1. Fail fast on unsupported discovery backends, so a configuration
        // that can never succeed does not bind a port or build a transport.
        match &self.discovery {
            Some(DiscoveryConfig::Etcd(_)) => {
                bail!("Etcd discovery not yet supported in velo")
            }
            Some(DiscoveryConfig::P2p(_)) => {
                bail!("P2P discovery not yet supported in velo")
            }
            Some(DiscoveryConfig::Filesystem(_)) | None => {}
        }

        // 2. Build TCP transport
        // Pre-bind listener to get OS-assigned port (if port is 0)
        let bind_addr = self.backend.resolve_bind_addr()?;
        let listener = TcpListener::bind(bind_addr)
            .with_context(|| format!("Failed to bind TCP listener to {}", bind_addr))?;

        // Extract actual bound address (with real port if OS-assigned)
        let actual_addr = listener
            .local_addr()
            .context("Failed to get local address from listener")?;

        // Build transport using from_listener to use the actual port
        let tcp_transport = TcpTransportBuilder::new()
            .from_listener(listener)?
            .build()
            .context("Failed to build TCP transport")?;
        let tcp_transport = Arc::new(tcp_transport);

        // Logged only once the transport exists, so the message is accurate.
        tracing::info!("Built TCP transport bound to {}", actual_addr);

        // 3. Attach the discovery backend. Only the filesystem backend can
        // reach this point; the others were rejected in step 1.
        let mut builder = Messenger::builder().add_transport(tcp_transport);
        if let Some(DiscoveryConfig::Filesystem(cfg)) = &self.discovery {
            use velo::discovery::FilesystemPeerDiscovery;

            let peer_discovery = FilesystemPeerDiscovery::new(&cfg.path)
                .context("Failed to build filesystem discovery")?;
            builder = builder.discovery(Arc::new(peer_discovery));
            tracing::info!("Built filesystem discovery from: {:?}", cfg.path);
        }

        // 4. Build Messenger
        let messenger = builder.build().await.context("Failed to build Messenger")?;
        Ok(messenger)
    }
}
/// Messenger backend (transport) configuration.
///
/// At most one of `tcp_addr` and `tcp_interface` may be set. The check is
/// performed by `resolve_bind_addr`, not by the derived `Validate` impl
/// (no field here carries a `#[validate(...)]` attribute). When neither is
/// set, `resolve_bind_addr` uses the unspecified IPv4 address (`0.0.0.0`).
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
pub struct MessengerBackendConfig {
/// IP address to bind (mutually exclusive with tcp_interface).
/// e.g., "0.0.0.0" or "192.168.1.100"
///
/// Must parse as an `IpAddr`; a host name is rejected by `resolve_bind_addr`.
pub tcp_addr: Option<String>,
/// Network interface to bind (mutually exclusive with tcp_addr).
/// e.g., "eth0", "ens192"
pub tcp_interface: Option<String>,
/// TCP port to bind. 0 means OS-assigned (ephemeral port).
///
/// Defaults to 0 when omitted from the serialized input.
#[serde(default)]
pub tcp_port: u16,
}
impl MessengerBackendConfig {
    /// Compute the socket address the TCP transport should bind to.
    ///
    /// The IP comes from `tcp_addr` (parsed as an `IpAddr`), from the address
    /// of `tcp_interface`, or is the unspecified IPv4 address when neither is
    /// set. The port is always `tcp_port`.
    ///
    /// # Errors
    ///
    /// Fails when both `tcp_addr` and `tcp_interface` are set, when `tcp_addr`
    /// is not a valid IP address, or when no address can be found for
    /// `tcp_interface`.
    pub fn resolve_bind_addr(&self) -> Result<SocketAddr> {
        // Reject the ambiguous combination before doing any resolution work.
        if self.tcp_addr.is_some() && self.tcp_interface.is_some() {
            bail!("tcp_addr and tcp_interface are mutually exclusive")
        }

        let host = if let Some(literal) = &self.tcp_addr {
            literal
                .parse::<IpAddr>()
                .with_context(|| format!("Invalid IP address: {}", literal))?
        } else if let Some(nic) = &self.tcp_interface {
            get_interface_ip(nic)
                .with_context(|| format!("Failed to get IP for interface: {}", nic))?
        } else {
            IpAddr::V4(Ipv4Addr::UNSPECIFIED)
        };

        Ok(SocketAddr::new(host, self.tcp_port))
    }
}
/// Get the IP address for a network interface.
///
/// Every address entry reported for `interface_name` is examined. An IPv4
/// address is returned as soon as one is seen; the first IPv6 address is
/// remembered and returned only when the interface has no IPv4 address at
/// all. Entries without an address, or with an address of another family,
/// are skipped.
///
/// # Errors
///
/// Fails when the interface addresses cannot be enumerated, or when the
/// named interface has neither an IPv4 nor an IPv6 address.
fn get_interface_ip(interface_name: &str) -> Result<IpAddr> {
    use nix::ifaddrs::getifaddrs;

    let addrs = getifaddrs().context("Failed to get interface addresses")?;

    // An interface is reported once per address, in no guaranteed family
    // order, so the IPv6 candidate is held back until the scan is complete.
    let mut ipv6_fallback: Option<IpAddr> = None;

    for ifaddr in addrs {
        if ifaddr.interface_name != interface_name {
            continue;
        }
        let Some(addr) = ifaddr.address else {
            continue;
        };

        // Prefer IPv4 addresses
        if let Some(sockaddr) = addr.as_sockaddr_in() {
            return Ok(IpAddr::V4(sockaddr.ip()));
        }

        // Remember only the first IPv6 address as the fallback
        if ipv6_fallback.is_none()
            && let Some(sockaddr) = addr.as_sockaddr_in6()
        {
            ipv6_fallback = Some(IpAddr::V6(sockaddr.ip()));
        }
    }

    match ipv6_fallback {
        Some(ip) => Ok(ip),
        None => bail!("No IP address found for interface: {}", interface_name),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The derived default leaves both address selectors unset and the port at 0.
    #[test]
    fn test_default_backend_config() {
        let backend = MessengerBackendConfig::default();
        assert_eq!(backend.tcp_addr, None);
        assert_eq!(backend.tcp_interface, None);
        assert_eq!(backend.tcp_port, 0);
    }

    /// With nothing configured the bind address is `0.0.0.0:0`.
    #[test]
    fn test_resolve_bind_addr_default() {
        let resolved = MessengerBackendConfig::default()
            .resolve_bind_addr()
            .unwrap();
        assert_eq!(resolved.ip(), IpAddr::V4(Ipv4Addr::UNSPECIFIED));
        assert_eq!(resolved.port(), 0);
    }

    /// An explicit IP literal and port are carried through unchanged.
    #[test]
    fn test_resolve_bind_addr_explicit() {
        let backend = MessengerBackendConfig {
            tcp_addr: Some(String::from("192.168.1.100")),
            tcp_interface: None,
            tcp_port: 8080,
        };
        let resolved = backend.resolve_bind_addr().unwrap();
        assert_eq!(resolved.ip(), IpAddr::V4(Ipv4Addr::new(192, 168, 1, 100)));
        assert_eq!(resolved.port(), 8080);
    }

    /// Setting both selectors is an error that names the conflict.
    #[test]
    fn test_resolve_bind_addr_mutual_exclusivity() {
        let backend = MessengerBackendConfig {
            tcp_addr: Some(String::from("0.0.0.0")),
            tcp_interface: Some(String::from("eth0")),
            tcp_port: 0,
        };
        let outcome = backend.resolve_bind_addr();
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("mutually exclusive"));
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! NixL backend configuration.
//!
//! Configures which NixL backends (UCX, GDS, etc.) are enabled for RDMA transfers,
//! along with optional parameters for each backend.
use dynamo_memory::nixl::NixlBackendConfig;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use validator::Validate;
/// NixL backend configuration.
///
/// Controls which NixL backends are enabled for RDMA memory transfers
/// and their optional parameters.
///
/// # Backends
///
/// Common backends include:
/// - `UCX` - Unified Communication X (enabled by default)
/// - `POSIX` - enabled by default
/// - `GDS` - GPUDirect Storage
/// - `GDS_MT` - GPUDirect Storage (multi-threaded)
///
/// Backend names are expected in uppercase. `with_backend` and
/// `with_backend_params` uppercase the name on insert, and `has_backend` /
/// `backend_params` uppercase the name before lookup. Keys read through
/// deserialization are stored exactly as written, so spell them in
/// uppercase in configuration files.
///
/// # Configuration
///
/// Each backend can have optional parameters as key-value pairs.
/// If a backend has no parameters, use an empty map.
/// Parameter values are strings, so numbers must be quoted.
///
/// ## TOML Example
///
/// ```toml
/// [nixl.backends.UCX]
/// # UCX with default params (empty map)
///
/// [nixl.backends.GDS]
/// threads = "4"
/// buffer_size = "1048576"
/// ```
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct NixlConfig {
/// Map of backend name (uppercase) -> optional parameters.
///
/// If a backend is present in the map, it's enabled.
/// The inner HashMap contains optional override parameters.
/// An empty inner map means use default parameters.
///
/// When the key is absent from the serialized input, the map from
/// `default_backends` (UCX and POSIX, no parameters) is used.
#[serde(default = "default_backends")]
pub backends: HashMap<String, HashMap<String, String>>,
}
/// Backend map used when none is configured: `UCX` and `POSIX`, each with no parameters.
fn default_backends() -> HashMap<String, HashMap<String, String>> {
    ["UCX", "POSIX"]
        .into_iter()
        .map(|name| (name.to_string(), HashMap::new()))
        .collect()
}
impl Default for NixlConfig {
    /// Same backend set serde falls back to when `backends` is omitted.
    fn default() -> Self {
        let backends = default_backends();
        Self { backends }
    }
}
impl NixlConfig {
    /// Create a configuration from an explicit backend map.
    ///
    /// Backend names are normalized to uppercase so that `has_backend` and
    /// `backend_params`, which uppercase their argument, can find every
    /// entry. If two names differ only by case they collapse into one
    /// entry; which parameter map is kept is unspecified.
    pub fn new(backends: HashMap<String, HashMap<String, String>>) -> Self {
        let backends = backends
            .into_iter()
            .map(|(name, params)| (name.to_uppercase(), params))
            .collect();
        Self { backends }
    }

    /// Create a configuration with no backends enabled.
    pub fn empty() -> Self {
        Self {
            backends: HashMap::new(),
        }
    }

    /// Build a configuration from a `NixlBackendConfig`, copying each
    /// backend's parameters.
    ///
    /// Backend names are normalized to uppercase, matching the lookups.
    pub fn from_nixl_backend_config(config: NixlBackendConfig) -> Self {
        let backends: HashMap<String, HashMap<String, String>> = config
            .iter()
            .map(|(backend, params)| (backend.to_string().to_uppercase(), params.clone()))
            .collect();
        Self { backends }
    }

    /// Add a backend with default parameters.
    /// Backend name is normalized to uppercase.
    ///
    /// Replaces any parameters previously stored for the same backend.
    pub fn with_backend(mut self, name: impl Into<String>) -> Self {
        self.backends
            .insert(name.into().to_uppercase(), HashMap::new());
        self
    }

    /// Add a backend with custom parameters.
    /// Backend name is normalized to uppercase.
    ///
    /// Replaces any parameters previously stored for the same backend.
    pub fn with_backend_params(
        mut self,
        name: impl Into<String>,
        params: HashMap<String, String>,
    ) -> Self {
        self.backends.insert(name.into().to_uppercase(), params);
        self
    }

    /// Get the list of enabled backend names (uppercase).
    ///
    /// The order follows the map's iteration order and is not stable.
    pub fn enabled_backends(&self) -> Vec<&String> {
        self.backends.keys().collect()
    }

    /// Check if a specific backend is enabled.
    /// Backend name is normalized to uppercase for lookup.
    pub fn has_backend(&self, backend: &str) -> bool {
        self.backends.contains_key(&backend.to_uppercase())
    }

    /// Get parameters for a specific backend.
    /// Backend name is normalized to uppercase for lookup.
    ///
    /// Returns None if the backend is not enabled.
    pub fn backend_params(&self, backend: &str) -> Option<&HashMap<String, String>> {
        self.backends.get(&backend.to_uppercase())
    }

    /// Iterate over all enabled backends and their parameters.
    pub fn iter(&self) -> impl Iterator<Item = (&String, &HashMap<String, String>)> {
        self.backends.iter()
    }
}
impl From<NixlConfig> for NixlBackendConfig {
fn from(config: NixlConfig) -> Self {
NixlBackendConfig::new(config.backends)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration enables UCX and leaves GDS disabled.
    #[test]
    fn test_default_config() {
        let defaults = NixlConfig::default();
        assert!(defaults.has_backend("UCX"));
        assert!(!defaults.has_backend("GDS"));
    }

    /// Both default backends are present and reported as enabled.
    #[test]
    fn test_new_default() {
        let defaults = NixlConfig::default();
        for name in ["UCX", "POSIX"] {
            assert!(defaults.has_backend(name));
        }
        assert!(!defaults.enabled_backends().is_empty());
    }

    /// Names given in lowercase are stored under their uppercase form.
    #[test]
    fn test_with_backend() {
        let built = NixlConfig::empty().with_backend("ucx").with_backend("gds");
        assert!(built.has_backend("UCX"));
        assert!(built.has_backend("GDS"));
        assert!(!built.has_backend("POSIX"));
        // Keys are stored uppercase
        for key in ["UCX", "GDS"] {
            assert!(built.backends.contains_key(key));
        }
    }

    /// Parameters are kept per backend; a plain `with_backend` stores none.
    #[test]
    fn test_with_backend_params() {
        let gds_overrides = HashMap::from([
            ("threads".to_string(), "4".to_string()),
            ("buffer_size".to_string(), "1048576".to_string()),
        ]);
        let built = NixlConfig::empty()
            .with_backend("UCX")
            .with_backend_params("GDS", gds_overrides);

        // UCX should have empty params
        assert!(built.backend_params("UCX").unwrap().is_empty());

        // GDS should have custom params
        let stored = built.backend_params("GDS").unwrap();
        assert_eq!(stored.get("threads"), Some(&"4".to_string()));
        assert_eq!(stored.get("buffer_size"), Some(&"1048576".to_string()));
    }

    /// Lookups accept any casing of the backend name.
    #[test]
    fn test_lookup_normalizes_to_uppercase() {
        let built = NixlConfig::empty().with_backend("ucx");
        // All lookups normalize to uppercase
        for spelling in ["ucx", "UCX", "Ucx"] {
            assert!(built.has_backend(spelling));
        }
        for spelling in ["ucx", "UCX"] {
            assert!(built.backend_params(spelling).is_some());
        }
    }

    /// `enabled_backends` lists exactly the inserted names, uppercased.
    #[test]
    fn test_enabled_backends() {
        let built = NixlConfig::empty().with_backend("ucx").with_backend("gds");
        let names = built.enabled_backends();
        assert_eq!(names.len(), 2);
        assert!(names.iter().any(|name| name.as_str() == "UCX"));
        assert!(names.iter().any(|name| name.as_str() == "GDS"));
    }

    /// `iter` yields one item per enabled backend.
    #[test]
    fn test_iter() {
        let overrides = HashMap::from([("key".to_string(), "value".to_string())]);
        let built = NixlConfig::empty()
            .with_backend("UCX")
            .with_backend_params("GDS", overrides);
        let pairs: Vec<_> = built.iter().collect();
        assert_eq!(pairs.len(), 2);
    }

    /// Backends and their parameters survive a JSON round trip.
    #[test]
    fn test_serde_roundtrip() {
        let overrides = HashMap::from([("threads".to_string(), "4".to_string())]);
        let original = NixlConfig::empty()
            .with_backend("UCX")
            .with_backend_params("GDS", overrides);

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: NixlConfig = serde_json::from_str(&encoded).unwrap();

        assert!(decoded.has_backend("UCX"));
        assert!(decoded.has_backend("GDS"));
        let gds = decoded.backend_params("GDS").unwrap();
        assert_eq!(gds.get("threads"), Some(&"4".to_string()));
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Object storage configuration for KVBM.
//!
//! Defines configuration for object storage backends (S3, NIXL) used for
//! the G4 tier (object storage) in the cache hierarchy.
use serde::{Deserialize, Serialize};
use validator::Validate;
/// Top-level object storage configuration.
///
/// When present, enables object storage operations on workers.
///
/// NOTE(review): `client` carries no `#[validate(...)]` attribute, so the
/// derived `Validate` impl performs no checks on it — confirm this is intended.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct ObjectConfig {
/// Which object client implementation to use.
///
/// Required: there is no serde default for this field.
pub client: ObjectClientConfig,
}
/// Object client implementation selector.
///
/// Determines whether to use direct S3 client or NIXL agent for object storage.
///
/// Serialized as an internally tagged enum: the `type` key holds the
/// lowercase variant name (`"s3"` or `"nixl"`) and sits alongside the
/// variant's own fields in the same map.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum ObjectClientConfig {
/// Direct S3/MinIO client using AWS SDK.
S3(S3ObjectConfig),
/// NIXL agent with object storage backend.
///
/// The nested enum adds its own `backend` tag to the same map.
Nixl(NixlObjectConfig),
}
/// S3-compatible object storage configuration.
///
/// Used for both direct S3 access and as a backend for NIXL.
/// Compatible with AWS S3 and S3-compatible services like MinIO.
///
/// Only `bucket` is required when deserializing; every other field has a
/// serde default.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct S3ObjectConfig {
/// Custom endpoint URL for S3-compatible services (e.g., MinIO).
/// If None, uses the default AWS S3 endpoint.
#[serde(default)]
pub endpoint_url: Option<String>,
/// S3 bucket name for storing blocks.
///
/// Required in serialized input (no serde default).
pub bucket: String,
/// AWS region.
///
/// Defaults to `us-east-1` when omitted.
#[serde(default = "default_region")]
pub region: String,
/// Use path-style URLs instead of virtual-hosted-style.
/// Required for MinIO and some S3-compatible services.
///
/// Defaults to `false` when omitted.
#[serde(default)]
pub force_path_style: bool,
/// Maximum number of concurrent S3 requests.
///
/// Defaults to 16 when omitted.
#[serde(default = "default_max_concurrent")]
pub max_concurrent_requests: usize,
}
/// Region used when the configuration does not name one.
fn default_region() -> String {
    String::from("us-east-1")
}
/// Concurrent-request limit used when the configuration does not set one.
fn default_max_concurrent() -> usize {
    const DEFAULT_MAX_CONCURRENT_REQUESTS: usize = 16;
    DEFAULT_MAX_CONCURRENT_REQUESTS
}
impl Default for S3ObjectConfig {
    /// AWS-style settings for the `kvbm-blocks` bucket in the default region.
    fn default() -> Self {
        // `aws` fills in: no custom endpoint, virtual-hosted-style URLs and
        // the default concurrency limit.
        Self::aws("kvbm-blocks".to_string(), default_region())
    }
}
impl S3ObjectConfig {
    /// Create configuration for AWS S3.
    ///
    /// Uses the default AWS endpoint, virtual-hosted-style URLs and the
    /// default concurrency limit.
    pub fn aws(bucket: String, region: String) -> Self {
        let max_concurrent_requests = default_max_concurrent();
        Self {
            endpoint_url: None,
            bucket,
            region,
            force_path_style: false,
            max_concurrent_requests,
        }
    }

    /// Create configuration for MinIO or other S3-compatible services.
    ///
    /// Points at `endpoint_url`, forces path-style URLs and uses the default
    /// region and concurrency limit.
    pub fn minio(endpoint_url: String, bucket: String) -> Self {
        let region = default_region();
        let max_concurrent_requests = default_max_concurrent();
        Self {
            endpoint_url: Some(endpoint_url),
            bucket,
            region,
            force_path_style: true,
            max_concurrent_requests,
        }
    }
}
/// NIXL object storage backend configuration.
///
/// NIXL can use various object storage backends. Each variant
/// specifies the backend type and its configuration.
///
/// Serialized as an internally tagged enum: the `backend` key holds the
/// lowercase variant name (currently only `"s3"`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "backend", rename_all = "lowercase")]
pub enum NixlObjectConfig {
/// S3-compatible backend via NIXL.
S3(S3ObjectConfig),
// Future backends can be added here:
// Gcs(GcsObjectConfig),
// Azure(AzureObjectConfig),
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` targets AWS with the stock bucket, region and limits.
    #[test]
    fn test_s3_config_default() {
        let defaults = S3ObjectConfig::default();
        assert_eq!(defaults.endpoint_url, None);
        assert_eq!(defaults.bucket, "kvbm-blocks");
        assert_eq!(defaults.region, "us-east-1");
        assert!(!defaults.force_path_style);
        assert_eq!(defaults.max_concurrent_requests, 16);
    }

    /// `aws` keeps the caller's bucket and region and sets no endpoint.
    #[test]
    fn test_s3_config_aws() {
        let aws = S3ObjectConfig::aws("my-bucket".into(), "us-west-2".into());
        assert_eq!(aws.endpoint_url, None);
        assert_eq!(aws.bucket, "my-bucket");
        assert_eq!(aws.region, "us-west-2");
        assert!(!aws.force_path_style);
    }

    /// `minio` records the endpoint and switches to path-style URLs.
    #[test]
    fn test_s3_config_minio() {
        let minio = S3ObjectConfig::minio("http://localhost:9000".into(), "test".into());
        assert_eq!(minio.endpoint_url.as_deref(), Some("http://localhost:9000"));
        assert_eq!(minio.bucket, "test");
        assert!(minio.force_path_style);
    }

    /// `"type": "s3"` selects the direct S3 client.
    #[test]
    fn test_object_config_serde_s3() {
        let json = r#"{
            "client": {
                "type": "s3",
                "bucket": "my-bucket",
                "region": "us-west-2"
            }
        }"#;
        let parsed: ObjectConfig = serde_json::from_str(json).unwrap();
        let ObjectClientConfig::S3(s3) = parsed.client else {
            panic!("Expected S3 config");
        };
        assert_eq!(s3.bucket, "my-bucket");
        assert_eq!(s3.region, "us-west-2");
    }

    /// `"type": "nixl"` with `"backend": "s3"` selects the NIXL S3 backend.
    #[test]
    fn test_object_config_serde_nixl_s3() {
        let json = r#"{
            "client": {
                "type": "nixl",
                "backend": "s3",
                "bucket": "nixl-bucket",
                "endpoint_url": "http://minio:9000",
                "force_path_style": true
            }
        }"#;
        let parsed: ObjectConfig = serde_json::from_str(json).unwrap();
        let ObjectClientConfig::Nixl(NixlObjectConfig::S3(s3)) = parsed.client else {
            panic!("Expected Nixl S3 config");
        };
        assert_eq!(s3.bucket, "nixl-bucket");
        assert_eq!(s3.endpoint_url.as_deref(), Some("http://minio:9000"));
        assert!(s3.force_path_style);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Offload policy configuration for KVBM.
//!
//! Defines configuration for offload policies that control which blocks
//! are transferred between storage tiers (G1→G2, G2→G3).
//!
//! # Policy Types
//!
//! - `pass_all`: No filtering, all blocks pass
//! - `presence`: Skip blocks already present in destination tier
//! - `presence_lfu`: Presence check + LFU count threshold
//!
//! # Configuration
//!
//! Policies are configured per tier transition. Multiple policies in the
//! `policies` list are applied in order with implicit AND logic (all must pass).
//!
//! ## JSON Example
//!
//! ```json
//! {
//! "offload": {
//! "g1_to_g2": {
//! "policies": ["presence"],
//! "presence": {}
//! },
//! "g2_to_g3": {
//! "policies": ["presence_lfu"],
//! "presence_lfu": { "min_lfu_count": 8 }
//! }
//! }
//! }
//! ```
use serde::{Deserialize, Serialize};
use validator::Validate;
/// Policy type enum for serialization.
///
/// Each variant corresponds to a policy implementation in the kvbm crate.
///
/// Variants serialize as snake_case strings: `"pass_all"`, `"presence"`
/// and `"presence_lfu"`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum PolicyType {
/// PassAllPolicy - no filtering, all blocks pass
PassAll,
/// PresenceFilter - skip blocks already in destination tier
Presence,
/// PresenceAndLFUFilter - presence check + LFU threshold
PresenceLfu,
}
/// Configuration for presence filter.
///
/// Currently has no parameters, but the struct exists for future extensibility
/// and to maintain consistent configuration patterns.
///
/// With no fields it serializes as an empty map (`{}`).
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
pub struct PresenceFilterConfig {}
/// LFU count threshold applied when `min_lfu_count` is not configured.
fn default_min_lfu_count() -> u32 {
    const DEFAULT_MIN_LFU_COUNT: u32 = 8;
    DEFAULT_MIN_LFU_COUNT
}
/// Configuration for presence + LFU filter.
///
/// Combines presence checking with LFU (Least Frequently Used) count threshold.
/// Only blocks with access count above the threshold are offloaded.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct PresenceLfuFilterConfig {
/// Minimum LFU count threshold for offload.
///
/// Blocks must have been accessed more than this many times to be
/// considered for offload. This prevents offloading rarely-used blocks.
///
/// NOTE(review): "minimum" suggests `>=` while "more than" suggests `>`;
/// the comparison lives in the policy implementation — confirm which applies.
///
/// Validation rejects 0 (`range(min = 1)`).
///
/// Default: 8
#[serde(default = "default_min_lfu_count")]
#[validate(range(min = 1))]
pub min_lfu_count: u32,
}
impl Default for PresenceLfuFilterConfig {
    /// Same threshold serde falls back to when `min_lfu_count` is omitted.
    fn default() -> Self {
        let min_lfu_count = default_min_lfu_count();
        Self { min_lfu_count }
    }
}
/// Configuration for a tier transition (e.g., G1→G2, G2→G3).
///
/// Defines which policies to apply when offloading blocks between tiers.
/// Policies are evaluated in order with implicit AND logic - a block must
/// pass ALL policies to be transferred.
///
/// Every field has a serde default, so an empty map deserializes to
/// `TierOffloadConfig::default()`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
pub struct TierOffloadConfig {
/// Ordered list of policies to apply (implicit AND).
///
/// If empty, tier-specific defaults are applied by the engine.
/// Policies are evaluated in order; a block must pass all to be transferred.
#[serde(default)]
pub policies: Vec<PolicyType>,
/// Presence filter configuration.
///
/// Used when "presence" is in the policies list.
#[serde(default)]
#[validate(nested)]
pub presence: PresenceFilterConfig,
/// Presence + LFU filter configuration.
///
/// Used when "presence_lfu" is in the policies list.
/// Validated even when "presence_lfu" is not listed in `policies`,
/// because `#[validate(nested)]` is unconditional.
#[serde(default)]
#[validate(nested)]
pub presence_lfu: PresenceLfuFilterConfig,
}
/// Top-level offload configuration.
///
/// Groups policy configurations for each tier transition.
/// Both transitions default to an empty `TierOffloadConfig` when omitted.
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
pub struct OffloadConfig {
/// G1 (GPU) → G2 (Host) offload policies.
#[serde(default)]
#[validate(nested)]
pub g1_to_g2: TierOffloadConfig,
/// G2 (Host) → G3 (Disk) offload policies.
#[serde(default)]
#[validate(nested)]
pub g2_to_g3: TierOffloadConfig,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Defaults carry no policies and the stock LFU threshold.
    #[test]
    fn test_default_config() {
        let defaults = OffloadConfig::default();
        // Empty policies - engine applies tier-specific defaults
        for tier in [&defaults.g1_to_g2, &defaults.g2_to_g3] {
            assert!(tier.policies.is_empty());
        }
        assert_eq!(defaults.g2_to_g3.presence_lfu.min_lfu_count, 8);
    }

    /// Policy names map to variants and survive a JSON round trip.
    #[test]
    fn test_policy_type_serde() {
        let parsed: Vec<PolicyType> =
            serde_json::from_str(r#"["pass_all", "presence", "presence_lfu"]"#).unwrap();
        assert_eq!(parsed.len(), 3);
        assert_eq!(
            parsed,
            vec![
                PolicyType::PassAll,
                PolicyType::Presence,
                PolicyType::PresenceLfu
            ]
        );

        // Serialize and parse again; the list must come back unchanged
        let encoded = serde_json::to_string(&parsed).unwrap();
        let reparsed: Vec<PolicyType> = serde_json::from_str(&encoded).unwrap();
        assert_eq!(parsed, reparsed);
    }

    /// A tier entry picks up both its policy list and its LFU override.
    #[test]
    fn test_tier_config_serde() {
        let json = r#"{
            "policies": ["presence_lfu"],
            "presence_lfu": { "min_lfu_count": 16 }
        }"#;
        let tier: TierOffloadConfig = serde_json::from_str(json).unwrap();
        assert_eq!(tier.policies.len(), 1);
        assert_eq!(tier.policies[0], PolicyType::PresenceLfu);
        assert_eq!(tier.presence_lfu.min_lfu_count, 16);
    }

    /// Both tier transitions are parsed independently.
    #[test]
    fn test_offload_config_serde() {
        let json = r#"{
            "g1_to_g2": {
                "policies": ["presence"]
            },
            "g2_to_g3": {
                "policies": ["presence_lfu"],
                "presence_lfu": { "min_lfu_count": 4 }
            }
        }"#;
        let parsed: OffloadConfig = serde_json::from_str(json).unwrap();
        let OffloadConfig { g1_to_g2, g2_to_g3 } = parsed;
        assert_eq!(g1_to_g2.policies, vec![PolicyType::Presence]);
        assert_eq!(g2_to_g3.policies, vec![PolicyType::PresenceLfu]);
        assert_eq!(g2_to_g3.presence_lfu.min_lfu_count, 4);
    }

    /// Omitting `presence_lfu` falls back to the default threshold.
    #[test]
    fn test_default_lfu_threshold() {
        let tier: TierOffloadConfig =
            serde_json::from_str(r#"{"policies": ["presence_lfu"]}"#).unwrap();
        // Should use default of 8
        assert_eq!(tier.presence_lfu.min_lfu_count, 8);
    }

    /// Defaults validate, and so does the smallest permitted threshold.
    #[test]
    fn test_validation() {
        assert!(OffloadConfig::default().validate().is_ok());

        let lfu_tier = TierOffloadConfig {
            policies: vec![PolicyType::PresenceLfu],
            presence_lfu: PresenceLfuFilterConfig { min_lfu_count: 1 },
            ..Default::default()
        };
        let config_with_lfu = OffloadConfig {
            g2_to_g3: lfu_tier,
            ..Default::default()
        };
        assert!(config_with_lfu.validate().is_ok());
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Onboard configuration for KV cache loading strategies.
//!
//! This module defines the configuration for how external KV cache blocks
//! are loaded (onboarded) from G2 (host memory) to G1 (GPU memory).
use serde::{Deserialize, Serialize};
/// Configuration for KV cache onboarding strategy.
///
/// Onboarding is the process of loading external KV cache blocks from
/// G2 (host memory) into G1 (GPU memory) for use during inference.
///
/// An empty map deserializes to the default configuration (`mode = inter`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OnboardConfig {
/// The onboarding mode to use.
///
/// - `inter`: Async out-of-band loading via Nova messages (default)
/// - `intra`: Synchronous layer-wise loading during forward pass
#[serde(default)]
pub mode: OnboardMode,
}
/// Onboarding mode for loading external KV cache blocks.
///
/// This determines when and how G2→G1 transfers occur during inference.
///
/// Variants serialize as snake_case strings: `"inter"` and `"intra"`.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OnboardMode {
/// Inter-pass onboarding (default).
///
/// Blocks are loaded asynchronously between scheduler passes via Nova
/// active messages to workers. The `get_num_new_matched_tokens` returns
/// `(Some(n), true)` to indicate async loading is in progress.
///
/// Pros: Overlaps transfer with computation
/// Cons: Adds latency before first token if transfer not complete
#[default]
Inter,
/// Intra-pass onboarding.
///
/// Blocks are loaded synchronously during the forward pass, layer by layer.
/// The `get_num_new_matched_tokens` returns `(Some(n), false)` and the
/// G2/G1 block pairs are passed to workers via `KvConnectorMetadata`.
///
/// Pros: Guaranteed data availability before each layer
/// Cons: Serializes transfer with computation per layer
Intra,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` selects inter-pass onboarding.
    #[test]
    fn test_default_mode_is_inter() {
        assert_eq!(OnboardConfig::default().mode, OnboardMode::Inter);
    }

    /// Each mode name deserializes to its variant.
    #[test]
    fn test_mode_serde_roundtrip() {
        let cases = [
            (r#"{"mode": "inter"}"#, OnboardMode::Inter),
            (r#"{"mode": "intra"}"#, OnboardMode::Intra),
        ];
        for (json, expected) in cases {
            let parsed: OnboardConfig = serde_json::from_str(json).unwrap();
            assert_eq!(parsed.mode, expected);
        }
    }

    /// A missing `mode` key falls back to the default variant.
    #[test]
    fn test_empty_json_uses_default() {
        let parsed: OnboardConfig = serde_json::from_str(r#"{}"#).unwrap();
        assert_eq!(parsed.mode, OnboardMode::Inter);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Rayon thread pool configuration.
use serde::{Deserialize, Serialize};
use validator::Validate;
/// Rayon thread pool configuration.
#[derive(Debug, Clone, Default, Serialize, Deserialize, Validate)]
pub struct RayonConfig {
/// Number of threads in the Rayon thread pool.
/// If None, uses the number of logical CPUs.
///
/// Validation rejects `Some(0)` (`range(min = 1)`).
#[validate(range(min = 1))]
pub num_threads: Option<usize>,
}
#[cfg(feature = "rayon")]
impl RayonConfig {
    /// Create a dedicated Rayon thread pool.
    ///
    /// `num_threads` is passed to the builder when set; otherwise the
    /// builder's own default thread count is used.
    pub fn build_pool(&self) -> Result<::rayon::ThreadPool, ::rayon::ThreadPoolBuildError> {
        let pool_builder = ::rayon::ThreadPoolBuilder::new();
        match self.num_threads {
            Some(count) => pool_builder.num_threads(count).build(),
            None => pool_builder.build(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The derived default leaves the thread count unset.
    #[test]
    fn test_default_config() {
        let RayonConfig { num_threads } = RayonConfig::default();
        assert!(num_threads.is_none());
    }

    /// An explicit thread count is honored by the built pool.
    #[cfg(feature = "rayon")]
    #[test]
    fn test_build_pool() {
        let two_threads = RayonConfig {
            num_threads: Some(2),
        };
        let pool = two_threads.build_pool().expect("Failed to build pool");
        assert_eq!(pool.current_num_threads(), 2);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Tokio runtime configuration.
use std::sync::atomic::{AtomicUsize, Ordering};
use serde::{Deserialize, Serialize};
use validator::Validate;
/// Atomic counter for assigning unique thread ranks.
///
/// Being a `static`, the counter is shared by every runtime built through
/// `TokioConfig::build_runtime` in this process; within this file it is
/// only ever incremented, never reset.
static THREAD_RANK: AtomicUsize = AtomicUsize::new(0);
/// Tokio runtime configuration.
///
/// Note that `TokioConfig::default()` sets `worker_threads` to `Some(1)`
/// rather than `None`.
#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
pub struct TokioConfig {
/// Number of async worker threads.
/// If None, uses the number of logical CPUs.
///
/// Validation requires a value between 1 and `default_max_cpus()`,
/// which is evaluated on the machine performing the validation.
#[validate(range(min = 1, max = default_max_cpus()))]
pub worker_threads: Option<usize>,
/// Maximum number of blocking threads.
/// If None, uses Tokio's default (512).
///
/// Validation rejects `Some(0)` (`range(min = 1)`).
#[validate(range(min = 1))]
pub max_blocking_threads: Option<usize>,
}
impl TokioConfig {
    /// Construct a multi-threaded Tokio runtime with all drivers enabled.
    ///
    /// `worker_threads` and `max_blocking_threads` are applied only when set.
    /// Every thread started by the runtime takes the next value of
    /// `THREAD_RANK`; with the `nvtx` feature that rank is used to name the
    /// thread, otherwise it is discarded.
    pub fn build_runtime(&self) -> std::io::Result<::tokio::runtime::Runtime> {
        let mut rt_builder = ::tokio::runtime::Builder::new_multi_thread();
        rt_builder.enable_all();

        if let Some(workers) = self.worker_threads {
            rt_builder.worker_threads(workers);
        }
        if let Some(blocking_limit) = self.max_blocking_threads {
            rt_builder.max_blocking_threads(blocking_limit);
        }

        rt_builder.on_thread_start(|| {
            let rank = THREAD_RANK.fetch_add(1, Ordering::Relaxed);
            #[cfg(feature = "nvtx")]
            nvtx::name_thread!("kvbm-tokio:{}", rank);
            #[cfg(not(feature = "nvtx"))]
            let _ = rank;
        });

        rt_builder.build()
    }
}
impl Default for TokioConfig {
    /// One worker thread and no explicit blocking-thread limit.
    fn default() -> Self {
        let worker_threads = Some(1);
        let max_blocking_threads = None;
        Self {
            worker_threads,
            max_blocking_threads,
        }
    }
}
/// Upper bound for `worker_threads`: the parallelism reported by the
/// standard library, or 4 when it cannot be determined.
fn default_max_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get(),
        Err(_) => 4,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` pins one worker thread and sets no blocking-thread limit.
    #[test]
    fn test_default_config() {
        let TokioConfig {
            worker_threads,
            max_blocking_threads,
        } = TokioConfig::default();
        // Default uses 1 worker thread to minimize resource usage
        assert_eq!(worker_threads, Some(1));
        assert!(max_blocking_threads.is_none());
    }

    /// The default configuration builds a runtime that can be dropped.
    #[test]
    fn test_build_runtime_with_defaults() {
        let built = TokioConfig::default()
            .build_runtime()
            .expect("Failed to build runtime");
        drop(built);
    }

    /// Explicit thread counts also produce a working runtime.
    #[test]
    fn test_build_runtime_with_custom_threads() {
        let custom = TokioConfig {
            worker_threads: Some(2),
            max_blocking_threads: Some(4),
        };
        let built = custom.build_runtime().expect("Failed to build runtime");
        drop(built);
    }
}
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Build & Test
This is a Rust crate (`kvbm-engine`) in the dynamo workspace. Rust edition 2024, requires rustc 1.93.1+.
```bash
# Build
cargo build -p kvbm-engine
cargo build -p kvbm-engine --features s3,testing,nats
# Test (most tests require the `testing` feature)
cargo test -p kvbm-engine --features testing
cargo test -p kvbm-engine --features testing -- test_name # single test
# Lint
cargo clippy -p kvbm-engine --all-features
cargo fmt
cargo machete
```
## Feature Flags
| Flag | Purpose |
|------|---------|
| `s3` (default) | S3/MinIO object storage (G4 tier) |
| `testing` | Test utilities, mock infrastructure, fixtures |
| `nats` | NATS-based pub/sub transport |
| `collectives` | NIXL + NCCL multi-GPU collectives |
| `nccl` | NCCL via cudarc |
| `nvtx` | NVIDIA Tools Extension profiling markers |
## Architecture
kvbm-engine implements distributed coordination for KV cache block management across a tiered storage hierarchy:
- **G1** (GPU HBM) → **G2** (Pinned DRAM) → **G3** (NVMe/SSD) → **G4** (S3/MinIO)
Leaders own block metadata and make placement decisions. Workers execute data transfers (RDMA, NVMe, object storage). Sessions coordinate multi-instance block transfers between leaders and workers.
### Key Modules
- **`leader/`** — `InstanceLeader` coordinates block lookups (`find_matches`), holds blocks via RAII `BlockHolder`, and manages distributed sessions. The `Leader` trait is the core coordination interface.
- **`leader/session/`** — Distributed session protocol: `InitiatorSession` (requester), `ResponderSession` (provider), `ServerSession` (server-side block exposure with optional G3→G2 staging). Sessions track onboarding state: Searching → Holding → Staging → Ready → Complete.
- **`worker/`** — `PhysicalWorker` owns a `TransferManager` and layout handles for actual transfers. `CoordinatedWorker` wraps any `Worker` with the leader's coordination state. The `Worker` and `WorkerTransfers` traits define the execution contract.
- **`worker/group/`** — `SpmdParallelWorkers` broadcasts operations to all workers in parallel (SPMD model) with event aggregation.
- **`worker/velo/`** — RPC layer (`VeloWorkerService`/`VeloWorkerClient`) for remote worker execution via Velo.
- **`offload/`** — Multi-stage async pipeline for tier demotion: PolicyEvaluator → PreconditionAwaiter → Batcher → TransferExecutor. Supports per-container cancellation tokens. **See `src/offload/AGENTS.md` for governance rules before modifying this module.**
- **`object/`** — `ObjectBlockOps` trait for G4 storage. S3 implementation with concurrent uploads/downloads. `ObjectLockManager` for distributed locking via conditional S3 PUTs.
- **`runtime/`** — `KvbmRuntime` bundles tokio, Velo messenger, NixlAgent (RDMA), and EventManager. Built via `KvbmRuntimeBuilder` or quick constructors (`from_env_leader`, `from_env_worker`).
- **`pubsub/`** — Publisher/Subscriber traits with NATS and in-memory stub implementations.
- **`collectives/`** — `CollectiveOps` trait for multi-GPU sync. NCCL implementation and stub for testing. MLA pattern: only rank 0 needs G2/G3; others receive via broadcast.
- **`testing/`** — Feature-gated test utilities: `TestManagerBuilder`, `MessengerPair`, `TestSession`, `EventsPipelineFixture`, `MultiInstancePopulator`, `TestAgent`.
### Documentation
Module docs live in `docs/` and are included via `#[doc = include_str!("../docs/...")]`. When modifying a module, update the corresponding doc file.
### Key Patterns
- **Trait-based abstraction**: `Leader`, `Worker`, `WorkerTransfers`, `ObjectBlockOps`, `CollectiveOps`, `KeyFormatter` — implementations are swappable (real vs. test stubs).
- **RAII resource management**: `BlockHolder` holds blocks during sessions with automatic release on drop. `TransferHandle` tracks offload operations.
- **Builder pattern**: `InstanceLeaderBuilder`, `PhysicalWorkerBuilder`, `KvbmRuntimeBuilder`, `OffloadEngineBuilder`.
- **Execution vs. coordination state**: `PhysicalWorker` owns execution state; `CoordinatedWorker` adds the leader's coordination view. Same API regardless of worker locality.
### Workspace Dependencies
Internal crates: `kvbm-common`, `kvbm-config`, `kvbm-kernels`, `kvbm-logical`, `kvbm-physical`, `velo`, `dynamo-tokens`, `dynamo-memory`.
## Offload Module Governance
The offload module (`src/offload/`) has explicit policies (P1–P6) documented in its README. Before modifying offload code, read `src/offload/AGENTS.md` and the offload docs (`docs/offload.md`, `docs/offload-developer.md`). Off-policy changes require user approval before implementation.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Crate metadata. Name, version and description are set here; edition,
# authors, license and repository are inherited from the workspace root.
[package]
name = "kvbm-engine"
version = "1.0.0"
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
description = "Distributed coordination primitives for KVBM"
[dependencies]
# Sibling KVBM crates and Velo, resolved through the workspace dependency table
kvbm-common = { workspace = true }
kvbm-config = { workspace = true }
kvbm-logical = { workspace = true }
kvbm-physical = { workspace = true }
velo = { workspace = true }
# Workspace deps
dynamo-memory = { workspace = true }
anyhow = { workspace = true }
dashmap = { workspace = true }
derive_builder = { workspace = true }
futures = { workspace = true }
parking_lot = { workspace = true }
rmp-serde = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
tokio-stream = { workspace = true }
tracing = { workspace = true }
uuid = { workspace = true }
# Non-workspace
bytes = "1.10"
crossbeam-queue = "0.3"
oneshot = "0.1.11"
# Optional
# Each of these is pulled in only through a `dep:` entry in [features]:
#   nccl  -> cudarc
#   s3    -> aws-sdk-s3, aws-config, rayon, tokio-rayon, chrono
#   nats  -> async-nats, flume
#   bench -> clap, figment, libc, tracing-subscriber, chrono
#   nvtx  -> nvtx
cudarc = { workspace = true, optional = true }
aws-sdk-s3 = { version = "1.120.0", optional = true }
aws-config = { version = "1.8.11", optional = true }
rayon = { version = "1", optional = true }
tokio-rayon = { version = "2", optional = true }
chrono = { version = "0.4", optional = true }
async-nats = { workspace = true, optional = true }
flume = { version = "0.11", optional = true }
clap = { version = "4", features = ["derive"], optional = true }
figment = { version = "0.10", features = ["env", "toml"], optional = true }
libc = { version = "0.2", optional = true }
tracing-subscriber = { workspace = true, optional = true }
nvtx = { version = "1.3", optional = true }
[features]
# S3 support is on by default.
default = ["s3"]
# S3/MinIO object storage (G4 tier).
s3 = ["dep:aws-sdk-s3", "dep:aws-config", "dep:rayon", "dep:tokio-rayon", "dep:chrono"]
# Multi-GPU collectives; currently just turns on `nccl`.
collectives = ["nccl"]
# NCCL via cudarc.
nccl = ["dep:cudarc"]
# NOTE(review): unlike `testing-s3`, this does not enable `testing` — confirm
# whether that is intentional.
testing-nccl = ["collectives"]
# NATS-based pub/sub transport.
nats = ["dep:async-nats", "dep:flume"]
# Test utilities and mock infrastructure; forwards to the `testing` features
# of kvbm-logical and kvbm-physical.
testing = ["kvbm-logical/testing", "kvbm-physical/testing"]
testing-s3 = ["s3", "testing"]
# NVIDIA Tools Extension profiling markers; also enables kvbm-config's `nvtx`.
nvtx = ["kvbm-config/nvtx", "dep:nvtx"]
# Required by the `bench_engine` binary; implies `testing`.
bench = ["dep:clap", "dep:figment", "dep:libc", "dep:tracing-subscriber", "dep:chrono", "testing"]
# Benchmark binary; Cargo only builds it when the `bench` feature is enabled.
[[bin]]
name = "bench_engine"
path = "bin/bench_engine.rs"
required-features = ["bench"]
# Tell cargo-machete not to report `rayon` as an unused dependency.
# NOTE(review): presumably rayon is needed only indirectly by the `s3` feature
# (alongside tokio-rayon) — confirm before removing this entry.
[package.metadata.cargo-machete]
ignored = ["rayon"]
# kvbm-engine
Distributed coordination primitives for KV cache block management (KVBM).
This crate implements the leader/worker architecture for managing KV cache blocks across a tiered storage hierarchy:
**G1** (GPU HBM) → **G2** (Pinned DRAM) → **G3** (NVMe/SSD) → **G4** (S3/MinIO)
Leaders own block metadata and make placement decisions. Workers execute data transfers (RDMA, NVMe, object storage). Sessions coordinate multi-instance block transfers.
## Feature Flags
| Flag | Purpose |
| -------------- | ---------------------------------------- |
| `s3` (default) | S3/MinIO object storage (G4 tier) |
| `testing` | Test utilities and mock infrastructure |
| `nats` | NATS-based pub/sub transport |
| `collectives` | NIXL + NCCL multi-GPU collectives |
| `nccl` | NCCL via cudarc |
| `nvtx` | NVIDIA Tools Extension profiling markers |
## Documentation
Detailed module documentation lives in [`docs/`](docs/):
- [Architecture](docs/architecture.md) — Overall system design
- [Leader](docs/leader.md) — Block coordination and metadata management
- [Session](docs/session.md) — Distributed onboarding protocol
- [Worker](docs/worker.md) — Transfer execution
- [Worker Group](docs/worker-group.md) — SPMD parallel workers
- [Offload](docs/offload.md) — Async tier-demotion pipeline
- [Offload Developer Guide](docs/offload-developer.md) — Contributing to the offload module
- [Object Storage](docs/object.md) — S3/MinIO integration
- [Runtime](docs/runtime.md) — Runtime bundle (tokio, Velo, NIXL)
- [Testing](docs/testing.md) — Test utilities and fixtures
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment