Unverified commit 008683d6, authored by Ryan Olson, committed by GitHub
Browse files

feat: adding kvbm-engine (#6773)


Signed-off-by: Ryan Olson <rolson@nvidia.com>
parent cf79c4fc
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! KVBM transfer bandwidth benchmark with full Leader+Worker architecture.
//!
//! Uses production-fidelity InstanceLeader, VeloWorkerService/Client, SpmdParallelWorkers,
//! and optionally OffloadEngine pipelines. Each worker runs on a NUMA-pinned thread with
//! its own tokio runtime and NixlAgent.
//!
//! # Usage
//! ```bash
//! # Direct transfer benchmark:
//! cargo run -p kvbm-engine --features bench --bin bench_engine -- \
//! --devices 0 --page-sizes 32,64 --concurrency 1,2 --iterations 10 --skip-disk --skip-gds
//!
//! # With offload pipeline:
//! cargo run -p kvbm-engine --features bench --bin bench_engine -- \
//! --devices 0 --page-sizes 64 --concurrency 1 --iterations 10 --skip-disk --skip-gds \
//! --offload --offload-batch-sizes 8,32 --offload-concurrency 1,2
//!
//! # Multi-GPU:
//! cargo run -p kvbm-engine --features bench --bin bench_engine -- \
//! --devices 0,1 --page-sizes 128 --concurrency 1,2,4 --iterations 50
//! ```
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, Instant};
use anyhow::{Result, ensure};
use clap::Parser;
use figment::Figment;
use figment::providers::{Env, Format, Serialized, Toml};
use serde::{Deserialize, Serialize};
use kvbm_engine::{
BlockId, G1, G2, G3, LogicalLayoutHandle,
leader::InstanceLeader,
offload::{ExternalBlock, OffloadEngine, PipelineBuilder, PresenceFilter, SourceBlocks},
testing::{
TestManagerBuilder, TestRegistryBuilder, create_messenger_tcp,
managers::populate_manager_with_blocks, token_blocks,
},
worker::{DirectWorker, Worker, WorkerTransfers},
};
use kvbm_logical::blocks::BlockRegistry;
use kvbm_logical::manager::BlockManager;
use kvbm_physical::layout::{LayoutConfig, PhysicalLayout};
use kvbm_physical::transfer::{NixlAgent, TransferManager, TransferOptions};
// ─── CLI ───────────────────────────────────────────────────────────────────────
// Command-line arguments for the benchmark, parsed with clap's derive API.
//
// Reviewer notes in this struct are plain `//` comments on purpose: clap's derive
// turns `///` doc comments on fields into `--help` text, so adding `///` lines
// here would change the program's output.
#[derive(Parser)]
#[command(
name = "bench_engine",
about = "KVBM transfer bandwidth benchmark (leader+worker architecture)"
)]
struct Cli {
/// GPU device IDs (comma-separated)
#[arg(long, value_delimiter = ',', default_value = "0")]
devices: Vec<u32>,
/// Tokens-per-block values to sweep
#[arg(long, value_delimiter = ',', default_values_t = vec![32, 64, 128, 256])]
page_sizes: Vec<usize>,
/// Concurrency levels to sweep
#[arg(long, value_delimiter = ',', default_values_t = vec![1, 2, 4, 8])]
concurrency: Vec<usize>,
/// Blocks per transfer batch
#[arg(long, default_value_t = 8)]
blocks_per_batch: usize,
// NOTE(review): `validate_config` also adds the largest `bounce_blocks` value to
// this bound, and uses a factor of 1 instead of 2 when `--isolated-only` is set.
/// Total blocks per pool (must be >= max_concurrency * blocks_per_batch * 2)
#[arg(long, default_value_t = 128)]
num_blocks: usize,
/// Number of KV-cache layers
#[arg(long, default_value_t = 24)]
num_layers: usize,
/// Inner dimension (hidden_dim / tp_size)
#[arg(long, default_value_t = 4096)]
inner_dim: usize,
// NOTE(review): within the code reviewed here this value is only read by
// `validate_config`; confirm that a bounce-buffer sweep actually consumes it.
/// Bounce buffer block counts to sweep (tail blocks of G2 used as bounce for staged G1↔G3)
#[arg(long, value_delimiter = ',', default_values_t = vec![2, 4, 8])]
bounce_blocks: Vec<usize>,
/// Warmup iterations
#[arg(long, default_value_t = 5)]
warmup: usize,
/// Measurement iterations per test
#[arg(long, default_value_t = 50)]
iterations: usize,
/// Disk path for G3 layouts (default: tempdir)
#[arg(long)]
disk_path: Option<PathBuf>,
/// Skip G3/disk tests
#[arg(long)]
skip_disk: bool,
/// Skip GDS tests
#[arg(long)]
skip_gds: bool,
/// Run only isolated (phase 1) tests
#[arg(long)]
isolated_only: bool,
/// Run only bidirectional (phase 2) tests
#[arg(long)]
bidir_only: bool,
/// Enable offload pipeline benchmarks (phase 3)
#[arg(long)]
offload: bool,
/// Offload pipeline batch sizes to sweep
#[arg(long, value_delimiter = ',', default_values_t = vec![8, 16, 32, 64])]
offload_batch_sizes: Vec<usize>,
/// Max concurrent transfers for offload pipeline
#[arg(long, value_delimiter = ',', default_values_t = vec![1, 2, 4])]
offload_concurrency: Vec<usize>,
/// Base directory for output (default: current directory)
#[arg(long, short)]
output: Option<PathBuf>,
// NOTE(review): `BenchConfig` has no matching field, so this path is not carried
// through `From<Cli>`; whoever needs it must read it before that conversion.
/// Optional TOML config file (overridden by CLI args)
#[arg(long)]
config: Option<PathBuf>,
}
// ─── Config ────────────────────────────────────────────────────────────────────
/// Fully resolved benchmark parameters.
///
/// Carries the same fields as `Cli` except `config`. The serde derives let the
/// struct act both as a figment layer and as the extraction target in
/// `build_config`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct BenchConfig {
// GPU device IDs; one worker thread is spawned per entry.
devices: Vec<u32>,
// Tokens-per-block values; the whole worker stack is rebuilt for each one.
page_sizes: Vec<usize>,
// Concurrency levels swept by the direct-transfer phases.
concurrency: Vec<usize>,
// Number of blocks moved per unit of concurrency in one transfer call.
blocks_per_batch: usize,
// Blocks allocated in every layout / block manager.
num_blocks: usize,
// Layout dimensions used for allocation and for the bytes-per-block estimate.
num_layers: usize,
inner_dim: usize,
// Only checked in `validate_config` in the code reviewed here.
bounce_blocks: Vec<usize>,
// Untimed iterations run before each measurement.
warmup: usize,
// Timed iterations per test case.
iterations: usize,
// Location handed to the disk (G3) layout allocator; `None` leaves the choice to it.
disk_path: Option<PathBuf>,
// Phase / backend toggles.
skip_disk: bool,
skip_gds: bool,
isolated_only: bool,
bidir_only: bool,
offload: bool,
// Sweep axes of the offload-pipeline phase.
offload_batch_sizes: Vec<usize>,
offload_concurrency: Vec<usize>,
// Base directory under which the timestamped results directory is created.
output: Option<PathBuf>,
}
impl From<Cli> for BenchConfig {
fn from(cli: Cli) -> Self {
Self {
devices: cli.devices,
page_sizes: cli.page_sizes,
concurrency: cli.concurrency,
blocks_per_batch: cli.blocks_per_batch,
num_blocks: cli.num_blocks,
num_layers: cli.num_layers,
inner_dim: cli.inner_dim,
bounce_blocks: cli.bounce_blocks,
warmup: cli.warmup,
iterations: cli.iterations,
disk_path: cli.disk_path,
skip_disk: cli.skip_disk,
skip_gds: cli.skip_gds,
isolated_only: cli.isolated_only,
bidir_only: cli.bidir_only,
offload: cli.offload,
offload_batch_sizes: cli.offload_batch_sizes,
offload_concurrency: cli.offload_concurrency,
output: cli.output,
}
}
}
/// Builds the effective [`BenchConfig`] by layering configuration sources with figment.
///
/// Layers are merged in this order (later layers override earlier ones):
/// 1. the CLI values, including clap defaults, as the base;
/// 2. an optional TOML file, taken from `--config` and falling back to the
///    `KVBM_BENCH_CONFIG` environment variable;
/// 3. environment variables prefixed with `KVBM_BENCH_`;
/// 4. the CLI values once more.
///
/// # Errors
/// Returns an error when the merged sources cannot be extracted into a `BenchConfig`.
fn build_config(cli: Cli) -> Result<BenchConfig> {
    // `From<Cli>` drops the `config` field, so capture the path before converting.
    // Previously the `--config` flag was parsed but never consulted.
    let cli_config_path = cli.config.clone();
    let cli_config = BenchConfig::from(cli);

    // An explicit `--config` wins over the environment variable.
    let config_path: Option<PathBuf> = cli_config_path
        .or_else(|| std::env::var("KVBM_BENCH_CONFIG").ok().map(PathBuf::from));

    let mut figment = Figment::new().merge(Serialized::defaults(&cli_config));
    if let Some(path) = config_path {
        figment = figment.merge(Toml::file(path));
    }

    // NOTE(review): `cli_config` serializes every field, clap defaults included, so
    // this final merge appears to override whatever the file or environment set.
    // Confirm whether only explicitly passed flags were meant to win.
    figment = figment
        .merge(Env::prefixed("KVBM_BENCH_"))
        .merge(Serialized::defaults(&cli_config)); // CLI wins
    Ok(figment.extract()?)
}
// ─── Results ───────────────────────────────────────────────────────────────────
/// Latency summary of one benchmark case; every field is in microseconds.
#[derive(Debug, Clone, Serialize)]
struct LatencyStats {
// Fastest and slowest observed iteration.
min_us: f64,
max_us: f64,
// Arithmetic mean over all iterations; also the basis for the bandwidth figures.
mean_us: f64,
// Percentiles picked by index from the sorted samples.
p50_us: f64,
p95_us: f64,
p99_us: f64,
}
impl LatencyStats {
    /// Summarises a set of per-iteration latencies in microseconds.
    ///
    /// Samples are sorted ascending; each percentile is the sample at index
    /// `floor(len * fraction)` and the median is the sample at `len / 2`.
    ///
    /// # Panics
    /// Panics if `durations` is empty.
    fn from_durations(mut durations: Vec<Duration>) -> Self {
        durations.sort();
        let count = durations.len();
        let total: Duration = durations.iter().sum();

        // Sample at a sorted position, converted from seconds to microseconds.
        let micros_at = |idx: usize| durations[idx].as_secs_f64() * 1e6;
        // The truncating cast selects the lower of two neighbouring samples.
        let percentile_idx = |fraction: f64| (count as f64 * fraction) as usize;

        Self {
            min_us: micros_at(0),
            max_us: micros_at(count - 1),
            mean_us: total.as_secs_f64() * 1e6 / count as f64,
            p50_us: micros_at(count / 2),
            p95_us: micros_at(percentile_idx(0.95)),
            p99_us: micros_at(percentile_idx(0.99)),
        }
    }
}
/// One serialized record of the benchmark output: a single test case at a single
/// sweep point.
#[derive(Debug, Clone, Serialize)]
struct BenchResult {
// Test case name, e.g. "g1_to_g2".
test: String,
// Device the result is reported under (callers pass the first configured device).
device_id: u32,
page_size: usize,
blocks_per_batch: usize,
concurrency: usize,
// Left out of the serialized record entirely when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
bounce_blocks: Option<usize>,
// Estimated payload of a single timed iteration.
bytes_per_iter: usize,
iterations: usize,
latency_us: LatencyStats,
// `bytes_per_iter` divided by the mean latency.
bandwidth_gbs: f64,
// `bandwidth_gbs` multiplied by the number of configured devices.
aggregate_bandwidth_gbs: f64,
}
/// Estimated size in bytes of one KV-cache block for the given tokens-per-block value.
///
/// The two constant factors mirror the layout configured in `spawn_worker_thread`:
/// an outer dimension of 2 (K + V) and a 2-byte element width (fp16).
fn compute_bytes_per_block(config: &BenchConfig, page_size: usize) -> usize {
    const KV_PLANES: usize = 2;
    const BYTES_PER_ELEMENT: usize = 2;

    config.num_layers * KV_PLANES * page_size * config.inner_dim * BYTES_PER_ELEMENT
}
/// Turns the raw latencies of one direct-transfer test into a [`BenchResult`].
///
/// One iteration is assumed to move `blocks_per_batch * concurrency` blocks.
/// Bandwidth is derived from the mean latency, and the aggregate figure scales
/// it by the number of configured devices.
fn make_result(
    test: &str,
    device_id: u32,
    page_size: usize,
    concurrency: usize,
    bounce_blocks: Option<usize>,
    config: &BenchConfig,
    latencies: Vec<Duration>,
) -> BenchResult {
    let blocks_per_batch = config.blocks_per_batch;
    let bytes_per_iter =
        compute_bytes_per_block(config, page_size) * blocks_per_batch * concurrency;
    let latency_us = LatencyStats::from_durations(latencies);

    // mean_us * 1e3 is the mean in nanoseconds, and bytes per nanosecond equals GB/s.
    let bandwidth_gbs = bytes_per_iter as f64 / (latency_us.mean_us * 1e3);
    let device_count = config.devices.len();

    BenchResult {
        test: test.to_string(),
        device_id,
        page_size,
        blocks_per_batch,
        concurrency,
        bounce_blocks,
        bytes_per_iter,
        iterations: config.iterations,
        latency_us,
        bandwidth_gbs,
        aggregate_bandwidth_gbs: bandwidth_gbs * device_count as f64,
    }
}
/// Writes a one-line, human-readable summary of `r` to stderr.
///
/// The bounce-buffer size is shown only when the result carries one.
fn print_result_stderr(r: &BenchResult) {
    let bounce_suffix = match r.bounce_blocks {
        Some(b) => format!(" bounce={b}"),
        None => String::new(),
    };
    let latency = &r.latency_us;

    eprintln!(
        "[GPU {}] {} | page={} conc={}{} | {:.1} GB/s (per-dev) {:.1} GB/s (agg) | p50={:.0}us p99={:.0}us",
        r.device_id,
        r.test,
        r.page_size,
        r.concurrency,
        bounce_suffix,
        r.bandwidth_gbs,
        r.aggregate_bandwidth_gbs,
        latency.p50_us,
        latency.p99_us,
    );
}
// ─── Worker Thread Infrastructure ──────────────────────────────────────────────
/// Everything the leader side keeps for one per-device worker thread.
struct WorkerHandle {
// Worker built on the worker thread; cloned into the leader's worker list.
worker: Arc<DirectWorker>,
// Joined in `BenchInstance::shutdown` after the shutdown signal is sent.
join_handle: std::thread::JoinHandle<()>,
// Signals the worker thread to stop waiting and let its runtime wind down.
shutdown_tx: tokio::sync::oneshot::Sender<()>,
}
/// Spawn a long-lived worker thread for a single GPU device.
///
/// The worker thread:
/// 1. Pins to the device's NUMA node
/// 2. Creates its own tokio runtime (2 worker threads)
/// 3. Creates NixlAgent, TransferManager
/// 4. Creates G1/G2/G3 PhysicalLayouts and registers them (NUMA-local allocations)
/// 5. Builds DirectWorker and sends Arc back to leader
/// 6. Waits on shutdown signal (keeps runtime alive for transfers)
///
/// The calling thread blocks until the worker reports readiness or failure over a
/// std mpsc channel.
///
/// # Errors
/// Returns the setup error forwarded by the worker thread, or a "died before
/// sending ready signal" error when the thread exits without reporting.
///
/// # Panics
/// Panics if the OS thread cannot be spawned. A failure to build the tokio runtime
/// panics inside the worker thread, which the caller then observes as the
/// "died before sending ready signal" error.
fn spawn_worker_thread(
device_id: u32,
page_size: usize,
config: &BenchConfig,
) -> Result<WorkerHandle> {
// Readiness is reported over a blocking std channel because the caller waits
// synchronously; shutdown uses a oneshot that the worker awaits inside its runtime.
let (ready_tx, ready_rx) = std::sync::mpsc::channel();
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
// Copy the needed settings out of `config` so the closure can be moved to the thread.
let num_blocks = config.num_blocks;
let num_layers = config.num_layers;
let inner_dim = config.inner_dim;
let skip_disk = config.skip_disk;
let skip_gds = config.skip_gds;
let disk_path = config.disk_path.clone();
let join_handle = std::thread::Builder::new()
.name(format!("bench-gpu-{device_id}"))
.spawn(move || {
// Pin to device's NUMA node
if let Some(cpus) = dynamo_memory::numa::get_device_cpu_set(device_id) {
eprintln!(
"[GPU {device_id}] Worker pinned to CPUs: {}",
format_cpu_set(&cpus)
);
pin_thread_to_cpus(&cpus);
} else {
if let Some(node) = dynamo_memory::numa::get_device_numa_node(device_id) {
// NOTE(review): the message is printed before the pin attempt and the
// result of the attempt is discarded, so it can be logged even on failure.
eprintln!("[GPU {device_id}] Worker pinned to NUMA node {node}");
let _ = dynamo_memory::numa::pin_thread_to_numa_node(node);
} else {
eprintln!("[GPU {device_id}] No NUMA pinning (node unknown)");
}
}
// Build tokio runtime on this NUMA-pinned thread
let rt = tokio::runtime::Builder::new_multi_thread()
.worker_threads(2)
.enable_all()
.thread_name(format!("bench-gpu-{device_id}-tokio"))
.build()
.expect("failed to build tokio runtime");
let result = rt.block_on(async {
// Create a local EventManager for this worker's transfer notifications
let event_system = Arc::new(velo::EventManager::local());
// Create NixlAgent with available backends
let agent_name = format!("bench-gpu-{device_id}");
let mut agent = NixlAgent::new(&agent_name)?;
// Backend registration is best-effort: a missing backend is only logged.
if !skip_disk && agent.add_backend("POSIX").is_err() {
eprintln!("[GPU {device_id}] POSIX backend unavailable");
}
if !skip_gds && !skip_disk && agent.add_backend("GDS_MT").is_err() {
eprintln!("[GPU {device_id}] GDS_MT backend unavailable");
}
// Create TransferManager
let manager = TransferManager::builder()
.event_system(event_system)
.nixl_agent(agent.clone())
.cuda_device_id(device_id as usize)
.build()?;
// Build layout config
// The same config is reused for G1, G2 and G3 so block IDs line up across tiers.
let layout_config = LayoutConfig::builder()
.num_blocks(num_blocks)
.num_layers(num_layers)
.outer_dim(2) // K + V
.page_size(page_size)
.inner_dim(inner_dim)
.dtype_width_bytes(2) // fp16
.build()?;
// Allocate G1 (GPU device memory) — NUMA-local allocation
let g1 = PhysicalLayout::builder(agent.clone())
.with_config(layout_config.clone())
.fully_contiguous()
.allocate_device(device_id)
.build()?;
let g1_handle = manager.register_layout(g1)?;
// Allocate G2 (pinned host memory) — NUMA-local allocation
let g2 = PhysicalLayout::builder(agent.clone())
.with_config(layout_config.clone())
.fully_contiguous()
.allocate_pinned(Some(device_id))
.build()?;
let g2_handle = manager.register_layout(g2)?;
// Allocate G3 (disk) if enabled
let g3_handle = if !skip_disk {
let g3 = PhysicalLayout::builder(agent.clone())
.with_config(layout_config)
.fully_contiguous()
.allocate_disk(disk_path)
.build()?;
Some(manager.register_layout(g3)?)
} else {
None
};
// Build DirectWorker (PhysicalWorker)
let mut worker_builder = DirectWorker::builder()
.manager(manager)
.g1_handle(g1_handle)
.g2_handle(g2_handle);
if let Some(g3) = g3_handle {
worker_builder = worker_builder.g3_handle(g3);
}
let worker = Arc::new(worker_builder.build()?);
// Hand the finished worker to the spawning thread; a dropped receiver is ignored.
ready_tx.send(Ok(worker.clone())).ok();
// Keep runtime alive so TransferManager notification threads stay running
// The result is ignored on purpose: the wait ends either way once a signal
// arrives or the sender goes away.
let _ = shutdown_rx.await;
Ok::<(), anyhow::Error>(())
});
// Every fallible step above precedes the ready signal, so an error here means
// setup failed and the caller is still waiting to hear about it.
if let Err(e) = result {
ready_tx.send(Err(e)).ok();
}
})
.expect("failed to spawn worker thread");
// Wait for worker to be ready
// The outer `?` covers a thread that exited without reporting, the inner `?` a
// setup error reported by the thread.
let worker = ready_rx
.recv()
.map_err(|_| anyhow::anyhow!("Worker thread died before sending ready signal"))??;
Ok(WorkerHandle {
worker,
join_handle,
shutdown_tx,
})
}
// ─── BenchInstance: Full Leader+Worker Setup ───────────────────────────────────
/// A complete leader+worker stack built for one page size.
struct BenchInstance {
// Leader through which the direct-transfer phases obtain the parallel worker.
leader: InstanceLeader,
// Kept alongside the managers but never read directly after construction.
#[allow(dead_code)]
registry: BlockRegistry,
// Host-tier block manager; also the block source for the G2→G3 offload benchmark.
g2_manager: Arc<BlockManager<G2>>,
// Disk-tier block manager; present only when disk tests are enabled.
#[allow(dead_code)]
g3_manager: Option<Arc<BlockManager<G3>>>,
// Present only when `config.offload` is set.
offload_engine: Option<OffloadEngine>,
// One entry per configured device, in `config.devices` order.
worker_handles: Vec<WorkerHandle>,
config: BenchConfig,
// Tokens per block that this instance was built for.
page_size: usize,
}
impl BenchInstance {
/// Create a full leader+worker bench instance for a given page_size.
///
/// One leader with SpmdParallelWorkers, N DirectWorkers (one per GPU device),
/// each on a NUMA-pinned thread with its own tokio runtime and TransferManager.
///
/// Workers are spawned one after another, each call blocking until that worker
/// reports ready. The offload engine is only built when `config.offload` is set.
///
/// # Errors
/// Propagates failures from worker setup, messenger creation and the leader and
/// offload-engine builders.
async fn new(config: BenchConfig, page_size: usize) -> Result<Self> {
let num_devices = config.devices.len();
eprintln!(
"Setting up BenchInstance: page_size={page_size}, {} device(s)",
num_devices
);
// Spawn worker threads (one per device)
let mut worker_handles = Vec::with_capacity(num_devices);
for &device_id in &config.devices {
let handle = spawn_worker_thread(device_id, page_size, &config)?;
eprintln!("[GPU {device_id}] Worker ready");
worker_handles.push(handle);
}
// Collect DirectWorker references for the leader
let worker_refs: Vec<Arc<dyn Worker>> = worker_handles
.iter()
.map(|wh| wh.worker.clone() as Arc<dyn Worker>)
.collect();
// Create leader Messenger (needed by InstanceLeader for event system)
let leader_messenger = create_messenger_tcp().await?;
// Build BlockRegistry and BlockManagers
// Both managers share one registry and are sized like the physical layouts.
let registry = TestRegistryBuilder::new().build();
let g2_manager = Arc::new(
TestManagerBuilder::<G2>::new()
.block_count(config.num_blocks)
.block_size(page_size)
.registry(registry.clone())
.build(),
);
let g3_manager = if !config.skip_disk {
Some(Arc::new(
TestManagerBuilder::<G3>::new()
.block_count(config.num_blocks)
.block_size(page_size)
.registry(registry.clone())
.build(),
))
} else {
None
};
// Build InstanceLeader with direct worker references
let mut leader_builder = InstanceLeader::builder()
.messenger(leader_messenger)
.registry(registry.clone())
.g2_manager(g2_manager.clone())
.workers(worker_refs);
if let Some(ref g3m) = g3_manager {
leader_builder = leader_builder.g3_manager(g3m.clone());
}
let leader = leader_builder.build()?;
// Build OffloadEngine if requested
let offload_engine = if config.offload {
// The engine runs on the runtime this constructor is being awaited on.
let mut engine_builder = OffloadEngine::builder(Arc::new(leader.clone()))
.with_registry(Arc::new(registry.clone()))
.with_g2_manager(g2_manager.clone())
.with_runtime(tokio::runtime::Handle::current());
if let Some(ref g3m) = g3_manager {
engine_builder = engine_builder.with_g3_manager(g3m.clone());
}
// Configure G1→G2 pipeline with a pass-through presence filter
// NOTE(review): batch size and transfer concurrency are fixed at 64 and 4 here;
// `config.offload_batch_sizes` and `config.offload_concurrency` are not used
// to configure either pipeline.
let g1_to_g2_config = PipelineBuilder::<G1, G2>::new()
.policy(Arc::new(PresenceFilter::<G1, G2>::new(Arc::new(
registry.clone(),
))))
.batch_size(64)
.max_concurrent_transfers(4)
.build();
engine_builder = engine_builder.with_g1_to_g2_pipeline(g1_to_g2_config);
// Configure G2→G3 pipeline if disk enabled
if g3_manager.is_some() {
let g2_to_g3_config = PipelineBuilder::<G2, G3>::new()
.policy(Arc::new(PresenceFilter::<G2, G3>::new(Arc::new(
registry.clone(),
))))
.batch_size(64)
.max_concurrent_transfers(4)
.build();
engine_builder = engine_builder.with_g2_to_g3_pipeline(g2_to_g3_config);
}
Some(engine_builder.build()?)
} else {
None
};
Ok(Self {
leader,
registry,
g2_manager,
g3_manager,
offload_engine,
worker_handles,
config,
page_size,
})
}
/// Runs every enabled benchmark phase in order and returns the combined results.
///
/// Phase 1 is skipped by `bidir_only`, phase 2 by `isolated_only`; phase 3 runs
/// only when offload is enabled and an offload engine was built.
async fn run_benchmarks(&self) -> Result<Vec<BenchResult>> {
    let page_size = self.page_size;
    let mut collected = Vec::new();

    let run_isolated = !self.config.bidir_only;
    if run_isolated {
        eprintln!(
            "=== Phase 1: Isolated Transfers (page_size={}) ===",
            page_size
        );
        collected.extend(self.bench_isolated_transfers().await?);
    }

    let run_bidir = !self.config.isolated_only;
    if run_bidir {
        eprintln!(
            "=== Phase 2: Bidirectional Contention (page_size={}) ===",
            page_size
        );
        collected.extend(self.bench_bidir_transfers().await?);
    }

    let run_offload = self.config.offload && self.offload_engine.is_some();
    if run_offload {
        eprintln!(
            "=== Phase 3: Offload Pipeline (page_size={}) ===",
            page_size
        );
        collected.extend(self.bench_offload_pipeline().await?);
    }

    Ok(collected)
}
// ─── Phase 1: Isolated Transfers ───────────────────────────────────────
/// Phase 1: measures every transfer direction on its own, one direction at a time.
///
/// For each concurrency level the first `conc * blocks_per_batch` block IDs are
/// transferred G1→G2 and G2→G1, followed by G2→G3 and G3→G2 when disk tests are
/// enabled. After that sweep, direct G1↔G3 (GDS) transfers are attempted when
/// both disk and GDS are enabled. A failing GDS transfer is logged and skipped
/// instead of aborting the phase.
///
/// All results are reported under the first configured device.
///
/// # Errors
/// Fails if the leader has no parallel worker or any non-GDS transfer fails.
async fn bench_isolated_transfers(&self) -> Result<Vec<BenchResult>> {
    let device_id = self.config.devices[0]; // Report results under first device
    let parallel_worker = self
        .leader
        .parallel_worker()
        .ok_or_else(|| anyhow::anyhow!("No parallel worker available"))?;
    let disk_enabled = !self.config.skip_disk;
    let bpb = self.config.blocks_per_batch;

    // The same leading block range is used as source and destination IDs.
    let leading_blocks = |conc: usize| -> Arc<[BlockId]> {
        Arc::from((0..conc * bpb).map(|i| i as BlockId).collect::<Vec<_>>())
    };

    // (test name, source tier, destination tier), in measurement order.
    let mut staged_directions = vec![
        ("g1_to_g2", LogicalLayoutHandle::G1, LogicalLayoutHandle::G2), // D2H offload
        ("g2_to_g1", LogicalLayoutHandle::G2, LogicalLayoutHandle::G1), // H2D onboard
    ];
    if disk_enabled {
        staged_directions.extend([
            ("g2_to_g3", LogicalLayoutHandle::G2, LogicalLayoutHandle::G3),
            ("g3_to_g2", LogicalLayoutHandle::G3, LogicalLayoutHandle::G2),
        ]);
    }

    let mut results = Vec::new();
    for &conc in &self.config.concurrency {
        let block_ids = leading_blocks(conc);
        for &(test, src, dst) in &staged_directions {
            let latencies = self
                .bench_transfer(
                    &*parallel_worker,
                    src,
                    dst,
                    block_ids.clone(),
                    block_ids.clone(),
                )
                .await?;
            let r = make_result(
                test,
                device_id,
                self.page_size,
                conc,
                None,
                &self.config,
                latencies,
            );
            print_result_stderr(&r);
            results.push(r);
        }
    }

    // G1↔G3 direct tests (GDS, no bounce buffer). These run after the full staged
    // sweep and tolerate failure because GDS may be unavailable on the host.
    if disk_enabled && !self.config.skip_gds {
        // (test name, direction label used in the failure message, source, destination)
        let gds_directions = [
            (
                "g1_to_g3_gds",
                "g1_to_g3",
                LogicalLayoutHandle::G1,
                LogicalLayoutHandle::G3,
            ),
            (
                "g3_to_g1_gds",
                "g3_to_g1",
                LogicalLayoutHandle::G3,
                LogicalLayoutHandle::G1,
            ),
        ];
        for &conc in &self.config.concurrency {
            let block_ids = leading_blocks(conc);
            for &(test, direction, src, dst) in &gds_directions {
                let outcome = self
                    .bench_transfer(
                        &*parallel_worker,
                        src,
                        dst,
                        block_ids.clone(),
                        block_ids.clone(),
                    )
                    .await;
                match outcome {
                    Ok(latencies) => {
                        let r = make_result(
                            test,
                            device_id,
                            self.page_size,
                            conc,
                            None,
                            &self.config,
                            latencies,
                        );
                        print_result_stderr(&r);
                        results.push(r);
                    }
                    Err(e) => {
                        eprintln!("GDS {direction} failed (GDS may not be available): {e}");
                    }
                }
            }
        }
    }
    Ok(results)
}
// ─── Phase 2: Bidirectional Contention ─────────────────────────────────
/// Phase 2: measures G1→G2 and G2→G1 transfers issued back to back so that both
/// directions are in flight at once.
///
/// Each direction gets its own, non-overlapping block range. Sweep points that
/// would need more than `num_blocks` blocks are skipped with a message. Results
/// are reported under the first configured device.
///
/// # Errors
/// Fails if the leader has no parallel worker or any transfer fails.
async fn bench_bidir_transfers(&self) -> Result<Vec<BenchResult>> {
let mut results = Vec::new();
let device_id = self.config.devices[0];
let parallel_worker = self
.leader
.parallel_worker()
.ok_or_else(|| anyhow::anyhow!("No parallel worker available"))?;
// Only concurrency levels up to 4 are exercised in this phase; larger
// configured values are dropped without a message.
let bidir_concurrencies: Vec<usize> = self
.config
.concurrency
.iter()
.copied()
.filter(|&c| c <= 4)
.collect();
for &conc in &bidir_concurrencies {
let bpb = self.config.blocks_per_batch;
let total_blocks_needed = 2 * conc * bpb;
if total_blocks_needed > self.config.num_blocks {
eprintln!(
"Skipping bidir page_size={} conc={conc}: need {total_blocks_needed} blocks but only have {}",
self.page_size, self.config.num_blocks
);
continue;
}
// D2H block range: [0..conc*bpb)
let d2h_ids: Arc<[BlockId]> =
Arc::from((0..conc * bpb).map(|i| i as BlockId).collect::<Vec<_>>());
// H2D block range: [conc*bpb..2*conc*bpb)
let h2d_ids: Arc<[BlockId]> = Arc::from(
(conc * bpb..2 * conc * bpb)
.map(|i| i as BlockId)
.collect::<Vec<_>>(),
);
// Warmup
// Same issue/await sequence as the measured loop, just without timing.
for _ in 0..self.config.warmup {
let d2h_notif = parallel_worker.execute_local_transfer(
LogicalLayoutHandle::G1,
LogicalLayoutHandle::G2,
d2h_ids.clone(),
d2h_ids.clone(),
TransferOptions::default(),
)?;
let h2d_notif = parallel_worker.execute_local_transfer(
LogicalLayoutHandle::G2,
LogicalLayoutHandle::G1,
h2d_ids.clone(),
h2d_ids.clone(),
TransferOptions::default(),
)?;
d2h_notif.await?;
h2d_notif.await?;
}
// Measure
let mut d2h_latencies = Vec::with_capacity(self.config.iterations);
let mut h2d_latencies = Vec::with_capacity(self.config.iterations);
for _ in 0..self.config.iterations {
// One shared start time: both latencies include the time to issue both transfers.
let start = Instant::now();
let d2h_notif = parallel_worker.execute_local_transfer(
LogicalLayoutHandle::G1,
LogicalLayoutHandle::G2,
d2h_ids.clone(),
d2h_ids.clone(),
TransferOptions::default(),
)?;
let h2d_notif = parallel_worker.execute_local_transfer(
LogicalLayoutHandle::G2,
LogicalLayoutHandle::G1,
h2d_ids.clone(),
h2d_ids.clone(),
TransferOptions::default(),
)?;
// NOTE(review): the H2D completion is only awaited after the D2H one, so the
// recorded H2D latency can never be smaller than the D2H latency of the same
// iteration, even if H2D finished first. Confirm this is the intended metric.
d2h_notif.await?;
let d2h_elapsed = start.elapsed();
h2d_notif.await?;
let h2d_elapsed = start.elapsed();
d2h_latencies.push(d2h_elapsed);
h2d_latencies.push(h2d_elapsed);
}
let r = make_result(
"bidir_g1_to_g2",
device_id,
self.page_size,
conc,
None,
&self.config,
d2h_latencies,
);
print_result_stderr(&r);
results.push(r);
let r = make_result(
"bidir_g2_to_g1",
device_id,
self.page_size,
conc,
None,
&self.config,
h2d_latencies,
);
print_result_stderr(&r);
results.push(r);
}
Ok(results)
}
// ─── Phase 3: Offload Pipeline ─────────────────────────────────────────
/// Phase 3: times end-to-end offload requests through the `OffloadEngine`.
///
/// Sweeps `offload_batch_sizes` × `offload_concurrency` for the G1→G2 pipeline
/// and, when disk is enabled and the engine has a G2→G3 pipeline, for that one
/// too. Each timed iteration enqueues one request of `batch_size` blocks and
/// waits for its handle to complete. Results are reported under the first
/// configured device.
///
/// # Errors
/// Fails if no offload engine is configured, populating the G2 manager fails,
/// or any enqueue/wait fails.
async fn bench_offload_pipeline(&self) -> Result<Vec<BenchResult>> {
let mut results = Vec::new();
let device_id = self.config.devices[0];
let engine = self
.offload_engine
.as_ref()
.ok_or_else(|| anyhow::anyhow!("OffloadEngine not configured"))?;
// Populate G2 manager with test blocks so the registry has entries
let token_seq =
token_blocks::create_token_sequence(self.config.num_blocks, self.page_size, 0);
let seq_hashes = populate_manager_with_blocks(&self.g2_manager, token_seq.blocks())?;
for &batch_size in &self.config.offload_batch_sizes {
// Batches index into `seq_hashes`, so they cannot exceed the pool size.
if batch_size > self.config.num_blocks {
eprintln!(
"Skipping offload batch_size={batch_size}: exceeds num_blocks={}",
self.config.num_blocks
);
continue;
}
// NOTE(review): `conc` is only used for the log line and the reported
// `concurrency` field in this function; nothing here passes it to the engine.
for &conc in &self.config.offload_concurrency {
eprintln!("Offload G1→G2 pipeline: batch_size={batch_size} concurrency={conc}");
// Warmup
for _ in 0..self.config.warmup {
let blocks: Vec<ExternalBlock<G1>> = (0..batch_size)
.map(|i| ExternalBlock::new(i as BlockId, seq_hashes[i]))
.collect();
let mut handle = engine.enqueue_g1_to_g2(SourceBlocks::External(blocks))?;
handle.wait().await?;
}
// Measure
let mut latencies = Vec::with_capacity(self.config.iterations);
for _ in 0..self.config.iterations {
// The block list is built before the clock starts, so only enqueue + wait is timed.
let blocks: Vec<ExternalBlock<G1>> = (0..batch_size)
.map(|i| ExternalBlock::new(i as BlockId, seq_hashes[i]))
.collect();
let start = Instant::now();
let mut handle = engine.enqueue_g1_to_g2(SourceBlocks::External(blocks))?;
handle.wait().await?;
latencies.push(start.elapsed());
}
// Unlike `make_result`, the payload here is `batch_size` blocks and does not
// scale with `conc`.
let bytes_per_block = compute_bytes_per_block(&self.config, self.page_size);
let bytes_per_iter = bytes_per_block * batch_size;
let stats = LatencyStats::from_durations(latencies);
let bandwidth_gbs = bytes_per_iter as f64 / (stats.mean_us * 1e3);
let num_devices = self.config.devices.len();
let r = BenchResult {
test: "offload_g1_to_g2_pipeline".to_string(),
device_id,
page_size: self.page_size,
blocks_per_batch: batch_size,
concurrency: conc,
bounce_blocks: None,
bytes_per_iter,
iterations: self.config.iterations,
latency_us: stats,
bandwidth_gbs,
aggregate_bandwidth_gbs: bandwidth_gbs * num_devices as f64,
};
print_result_stderr(&r);
results.push(r);
}
}
// G2→G3 pipeline if disk enabled
if !self.config.skip_disk && engine.has_g2_to_g3() {
for &batch_size in &self.config.offload_batch_sizes {
// Oversized batches are skipped silently here; the G1→G2 loop above
// already printed the message for them.
if batch_size > self.config.num_blocks {
continue;
}
for &conc in &self.config.offload_concurrency {
eprintln!("Offload G2→G3 pipeline: batch_size={batch_size} concurrency={conc}");
// Get immutable blocks from g2_manager for SourceBlocks::Strong
let matched = self.g2_manager.match_blocks(&seq_hashes[..batch_size]);
// Warmup
for _ in 0..self.config.warmup {
let mut handle =
engine.enqueue_g2_to_g3(SourceBlocks::Strong(matched.clone()))?;
handle.wait().await?;
}
// Measure
let mut latencies = Vec::with_capacity(self.config.iterations);
for _ in 0..self.config.iterations {
// NOTE(review): `matched.clone()` happens after the clock starts and is
// therefore part of the measured latency.
let start = Instant::now();
let mut handle =
engine.enqueue_g2_to_g3(SourceBlocks::Strong(matched.clone()))?;
handle.wait().await?;
latencies.push(start.elapsed());
}
let bytes_per_block = compute_bytes_per_block(&self.config, self.page_size);
let bytes_per_iter = bytes_per_block * batch_size;
let stats = LatencyStats::from_durations(latencies);
let bandwidth_gbs = bytes_per_iter as f64 / (stats.mean_us * 1e3);
let num_devices = self.config.devices.len();
let r = BenchResult {
test: "offload_g2_to_g3_pipeline".to_string(),
device_id,
page_size: self.page_size,
blocks_per_batch: batch_size,
concurrency: conc,
bounce_blocks: None,
bytes_per_iter,
iterations: self.config.iterations,
latency_us: stats,
bandwidth_gbs,
aggregate_bandwidth_gbs: bandwidth_gbs * num_devices as f64,
};
print_result_stderr(&r);
results.push(r);
}
}
}
Ok(results)
}
// ─── Transfer Helpers ──────────────────────────────────────────────────
/// Benchmarks one transfer direction through the parallel worker using default
/// [`TransferOptions`].
///
/// Thin wrapper around [`Self::bench_transfer_with_options`]; returns one latency
/// per measured iteration.
async fn bench_transfer(
    &self,
    parallel_worker: &dyn WorkerTransfers,
    src: LogicalLayoutHandle,
    dst: LogicalLayoutHandle,
    src_block_ids: Arc<[BlockId]>,
    dst_block_ids: Arc<[BlockId]>,
) -> Result<Vec<Duration>> {
    let default_options = TransferOptions::default();
    let measured = self.bench_transfer_with_options(
        parallel_worker,
        src,
        dst,
        src_block_ids,
        dst_block_ids,
        default_options,
    );
    measured.await
}
/// Benchmarks one transfer direction with caller-supplied [`TransferOptions`].
///
/// Runs `config.warmup` untimed transfers, then `config.iterations` timed ones.
/// Each timed sample spans issuing the transfer through to its completion
/// notification.
///
/// # Errors
/// Returns the first error from issuing a transfer or awaiting its completion.
async fn bench_transfer_with_options(
    &self,
    parallel_worker: &dyn WorkerTransfers,
    src: LogicalLayoutHandle,
    dst: LogicalLayoutHandle,
    src_block_ids: Arc<[BlockId]>,
    dst_block_ids: Arc<[BlockId]>,
    options: TransferOptions,
) -> Result<Vec<Duration>> {
    // Issues one transfer; every call hands fresh clones of the IDs and options over.
    let issue = || {
        parallel_worker.execute_local_transfer(
            src,
            dst,
            src_block_ids.clone(),
            dst_block_ids.clone(),
            options.clone(),
        )
    };

    // Warm-up rounds: identical work, not timed.
    let warmup_rounds = self.config.warmup;
    for _ in 0..warmup_rounds {
        issue()?.await?;
    }

    // Timed rounds.
    let rounds = self.config.iterations;
    let mut samples = Vec::with_capacity(rounds);
    for _ in 0..rounds {
        let started = Instant::now();
        issue()?.await?;
        samples.push(started.elapsed());
    }
    Ok(samples)
}
/// Stops every worker thread, one after another.
///
/// Each worker is signalled and then joined before the next one is handled.
/// Failures to signal or join are ignored.
fn shutdown(self) {
    self.worker_handles.into_iter().for_each(|handle| {
        let _ = handle.shutdown_tx.send(());
        let _ = handle.join_handle.join();
    });
}
}
// ─── NUMA Pinning Helpers ──────────────────────────────────────────────────────
/// Restricts the calling thread to the given logical CPUs via `sched_setaffinity(2)`.
///
/// Pinning is best-effort. CPU indices that do not fit into a `cpu_set_t` are
/// skipped with a warning, and a failing syscall is reported on stderr instead of
/// being silently ignored. In both cases the function returns normally and the
/// thread keeps whatever affinity it had.
fn pin_thread_to_cpus(cpus: &[usize]) {
    let capacity = libc::CPU_SETSIZE as usize;

    // SAFETY: `cpu_set_t` is a plain bitmask, for which all-zero bytes are a valid
    // (empty) value.
    let mut cpu_set: libc::cpu_set_t = unsafe { std::mem::zeroed() };
    for &cpu in cpus {
        if cpu >= capacity {
            eprintln!("CPU index {cpu} exceeds CPU_SETSIZE ({capacity}); skipping");
            continue;
        }
        // SAFETY: `cpu` was just checked to be within the capacity of `cpu_set`.
        unsafe { libc::CPU_SET(cpu, &mut cpu_set) };
    }

    // SAFETY: pid 0 addresses the calling thread, and the size and pointer describe
    // the live local `cpu_set`.
    let rc = unsafe {
        libc::sched_setaffinity(0, std::mem::size_of::<libc::cpu_set_t>(), &cpu_set)
    };
    if rc != 0 {
        eprintln!(
            "sched_setaffinity failed, thread affinity unchanged: {}",
            std::io::Error::last_os_error()
        );
    }
}
/// Renders a CPU list compactly, collapsing runs of consecutive IDs into ranges,
/// e.g. `[0,1,2,3,8,9,10]` becomes `"0-3,8-10"`.
///
/// The input is processed in the order given (it is not sorted), and an empty
/// slice yields an empty string.
fn format_cpu_set(cpus: &[usize]) -> String {
    let Some((&first, rest)) = cpus.split_first() else {
        return String::new();
    };

    // A run of length one is printed as a bare number, anything longer as "lo-hi".
    let render = |start: usize, end: usize| {
        if start == end {
            format!("{start}")
        } else {
            format!("{start}-{end}")
        }
    };

    let mut ranges = Vec::new();
    let (mut run_start, mut run_end) = (first, first);
    for &cpu in rest {
        if cpu == run_end + 1 {
            // Still consecutive: extend the current run.
            run_end = cpu;
            continue;
        }
        ranges.push(render(run_start, run_end));
        run_start = cpu;
        run_end = cpu;
    }
    ranges.push(render(run_start, run_end));
    ranges.join(",")
}
// ─── Validation ────────────────────────────────────────────────────────────────
/// Rejects configurations the benchmark cannot run with.
///
/// Checks, in order: the block pool is large enough for the largest sweep point
/// plus the largest bounce size; devices, page sizes and concurrency levels are
/// non-empty; `iterations` is positive; an explicitly given disk path (or its
/// parent) exists when disk tests are enabled; and the offload sweeps are
/// non-empty when offload is enabled.
///
/// # Errors
/// Returns the first failed check as an error with a descriptive message.
fn validate_config(config: &BenchConfig) -> Result<()> {
    let peak_concurrency = config.concurrency.iter().copied().max().unwrap_or(1);
    let peak_bounce = config.bounce_blocks.iter().copied().max().unwrap_or(0);

    // Bidirectional tests use a separate block range per direction, hence twice the blocks.
    let direction_factor = if config.isolated_only { 1 } else { 2 };
    let transfer_blocks = peak_concurrency * config.blocks_per_batch * direction_factor;

    // Bounce blocks are taken from the tail of G2 and must stay clear of the
    // transfer range [0..transfer_blocks).
    let required_blocks = transfer_blocks + peak_bounce;
    ensure!(
        config.num_blocks >= required_blocks,
        "num_blocks ({}) must be >= max_concurrency ({}) * blocks_per_batch ({}) * {} + max_bounce ({}) = {}",
        config.num_blocks,
        peak_concurrency,
        config.blocks_per_batch,
        direction_factor,
        peak_bounce,
        required_blocks,
    );

    ensure!(
        !config.devices.is_empty(),
        "must specify at least one device"
    );
    ensure!(
        !config.page_sizes.is_empty(),
        "must specify at least one page_size"
    );
    ensure!(
        !config.concurrency.is_empty(),
        "must specify at least one concurrency level"
    );
    ensure!(config.iterations > 0, "iterations must be > 0");

    // The disk path only matters when G3 tests will actually run.
    if !config.skip_disk {
        if let Some(path) = config.disk_path.as_ref() {
            ensure!(
                path.exists() || path.parent().is_some_and(|p| p.exists()),
                "disk path {} does not exist",
                path.display()
            );
        }
    }

    // The offload sweeps need at least one point each.
    if config.offload {
        ensure!(
            !config.offload_batch_sizes.is_empty(),
            "offload enabled but no batch sizes specified"
        );
        ensure!(
            !config.offload_concurrency.is_empty(),
            "offload enabled but no concurrency levels specified"
        );
    }

    Ok(())
}
// ─── Main ──────────────────────────────────────────────────────────────────────
/// Runs the KVBM engine benchmark end to end.
///
/// Parses the CLI, builds and validates the configuration, prints a summary of
/// the run to stderr, executes the benchmark suite once per configured page
/// size, and finally writes the collected results (JSON Lines) together with
/// the HTML viewer into a timestamped output directory.
///
/// # Errors
///
/// Returns an error if building or validating the configuration fails, if the
/// tokio runtime cannot be built, if a benchmark instance fails to start or
/// run, if a result cannot be serialized to JSON, or if the output directory
/// or its files cannot be written.
fn main() -> Result<()> {
    // Must run first: it may set `RUST_LOG`, which is only done while the
    // process is still single-threaded.
    tracing_subscriber_init();

    let cli = Cli::parse();
    let config = build_config(cli)?;
    validate_config(&config)?;

    // Echo the effective configuration so a run can be identified from its log.
    eprintln!("KVBM Engine Benchmark (Leader+Worker Architecture)");
    eprintln!(" Devices: {:?}", config.devices);
    eprintln!(" Page sizes: {:?}", config.page_sizes);
    eprintln!(" Concurrency: {:?}", config.concurrency);
    eprintln!(" Blocks per batch: {}", config.blocks_per_batch);
    eprintln!(" Total blocks per pool: {}", config.num_blocks);
    eprintln!(
        " Layers: {}, Inner dim: {}",
        config.num_layers, config.inner_dim
    );
    eprintln!(
        " Warmup: {}, Iterations: {}",
        config.warmup, config.iterations
    );
    let disk_state = if config.skip_disk {
        "disabled"
    } else {
        "enabled"
    };
    eprintln!(" Disk: {}", disk_state);
    let gds_state = if config.skip_gds {
        "disabled"
    } else {
        "enabled"
    };
    eprintln!(" GDS: {}", gds_state);
    if config.offload {
        eprintln!(" Offload: enabled");
        eprintln!(" Batch sizes: {:?}", config.offload_batch_sizes);
        eprintln!(" Concurrency: {:?}", config.offload_concurrency);
    }
    eprintln!();

    // Build a main-thread tokio runtime for the leader.
    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(4)
        .enable_all()
        .thread_name("bench-leader-tokio")
        .build()?;

    let all_results = rt.block_on(async {
        let mut all_results: Vec<BenchResult> = Vec::new();
        // Page-size sweep: rebuild the full worker stack per page_size
        // (mirrors production where model config determines page_size at startup).
        let separator = "=".repeat(72);
        for &page_size in &config.page_sizes {
            eprintln!("\n{}", separator);
            eprintln!("Page size: {page_size}");
            eprintln!("{}", separator);
            let instance = BenchInstance::new(config.clone(), page_size).await?;
            let results = instance.run_benchmarks().await?;
            all_results.extend(results);
            instance.shutdown();
        }
        Ok::<_, anyhow::Error>(all_results)
    })?;

    // Timestamped output directory named `YYMMDD-HH:MM:SS-bench-engine`,
    // created under `config.output` when set, otherwise relative to the
    // current working directory.
    let now = chrono::Local::now();
    let dir_name = now.format("%y%m%d-%H:%M:%S-bench-engine").to_string();
    let out_dir = match config.output {
        Some(ref base) => base.join(&dir_name),
        None => PathBuf::from(&dir_name),
    };
    std::fs::create_dir_all(&out_dir)?;

    // Write JSON Lines results. A serialization failure is reported as an
    // error instead of panicking, so the caller sees why no file was written.
    let json_output: String = all_results
        .iter()
        .map(|r| serde_json::to_string(r))
        .collect::<std::result::Result<Vec<_>, _>>()?
        .join("\n");
    let jsonl_path = out_dir.join(format!("{dir_name}.jsonl"));
    std::fs::write(&jsonl_path, &json_output)?;

    // Copy the viewer HTML (embedded at compile time) into the output directory.
    let viewer_html = include_str!("../scripts/bench_viewer.html");
    let viewer_path = out_dir.join(format!("{dir_name}.html"));
    std::fs::write(&viewer_path, viewer_html)?;

    eprintln!(
        "\nBenchmark complete. {} results collected.",
        all_results.len()
    );
    eprintln!("Results directory: {}", out_dir.display());
    eprintln!(" {}", jsonl_path.display());
    eprintln!(" {}", viewer_path.display());
    Ok(())
}
/// Installs the global `tracing` subscriber, writing log output to stderr.
///
/// The filter comes from the default environment filter. When `RUST_LOG`
/// cannot be read (unset or not valid Unicode) it is first set to `error`, so
/// only errors are logged unless the user asks for more.
fn tracing_subscriber_init() {
    const LOG_ENV: &str = "RUST_LOG";

    if std::env::var(LOG_ENV).is_err() {
        // SAFETY: Called at program start before any threads are spawned.
        unsafe { std::env::set_var(LOG_ENV, "error") };
    }

    let filter = tracing_subscriber::EnvFilter::from_default_env();
    tracing_subscriber::fmt()
        .with_env_filter(filter)
        .with_writer(std::io::stderr)
        .init();
}
# kvbm-engine
`kvbm-engine` provides distributed coordination primitives for KV Block Management (KVBM).
It implements a tiered storage model where KV cache blocks flow between GPU memory, host
DRAM, local disk, and object storage. The crate coordinates leaders (which own block
metadata and make placement decisions) with workers (which execute data transfers via
RDMA, NVMe, or object storage APIs).
## Storage Tier Model
| Tier | Medium | Latency | Capacity | Description |
|------|--------|---------|----------|-------------|
| G1 | GPU HBM | ~ns | Smallest | Active KV cache used by attention kernels |
| G2 | Pinned DRAM | ~us | Medium | Staging area for RDMA transfers and tier promotion |
| G3 | NVMe/SSD | ~ms | Large | Persistent warm-block storage |
| G4 | S3/MinIO | ~100ms | Unlimited | Cold/archival object storage |
## Architecture
```text
              +-----------------+
              | InstanceLeader  |
              | (find_matches,  |
              |  BlockAccessor) |
              +--------+--------+
                       |
         +-------------+-------------+
         |                           |
+--------v---------+        +--------v---------+
| CoordinatedWorker|        | CoordinatedWorker|
|     (rank 0)     |        |     (rank 1)     |
+--------+---------+        +--------+---------+
         |                           |
+--------v---------+        +--------v---------+
|  PhysicalWorker  |        |  PhysicalWorker  |
| (TransferManager)|        | (TransferManager)|
+------------------+        +------------------+
```
The leader drives workers through the `ParallelWorkers` trait (`SpmdParallelWorkers`
for SPMD execution). For onboarding, the leader creates sessions that progress through
stages: search, hold, prepare (G3->G2), and pull (remote G2->local G2 via RDMA).
## Modules
| Module | Purpose |
|--------|---------|
| `leader` | Block coordination: matching, onboarding sessions, policy-based scanning |
| `worker` | Transfer execution: local, RDMA, and object storage data movement |
| `object` | G4 storage: S3/MinIO client for cold-tier block persistence |
| `offload` | Tier demotion pipeline: batched G2->G3 and G2->G4 offloading |
| `runtime` | Shared infrastructure: `KvbmRuntime`, tokio handle, NIXL agent |
| `pubsub` | Event pub/sub: block-level notifications for cross-instance coordination |
| `collectives` | NCCL collectives for multi-GPU synchronization (feature-gated) |
| `testing` | Test utilities: mock workers, in-memory block managers (feature-gated) |
## Feature Flags
| Flag | Dependencies | Description |
|------|-------------|-------------|
| `default` | `["s3"]` | Default features |
| `s3` | `aws-sdk-s3`, `aws-config`, `rayon`, `tokio-rayon`, `chrono` | S3/MinIO object storage support |
| `collectives` | `nixl-sys`, `nccl` | NIXL + NCCL multi-GPU collectives |
| `nccl` | `cudarc` | NCCL support via cudarc |
| `testing-nccl` | `collectives` | Enable collectives for tests |
| `nats` | `async-nats`, `flume` | NATS-based pub/sub transport |
| `testing` | `kvbm-logical/testing`, `kvbm-physical/testing` | Test utilities and mock infrastructure |
| `nvtx` | `kvbm-config/nvtx` | NVIDIA Tools Extension profiling markers |
## Quick Start
```rust,ignore
use kvbm_engine::{KvbmRuntime, leader::InstanceLeader};
// Build runtime from environment
let runtime = KvbmRuntime::from_env_leader().await?;
// Create a leader instance
let leader = InstanceLeader::new(/* ... */);
// Search for cached blocks
let result = leader.find_matches(&sequence_hashes)?;
```
# Leader Module
The leader module implements block coordination for a single KVBM instance. It owns
block metadata (via `BlockManager<G2>` and `BlockManager<G3>`), resolves cache lookups,
and orchestrates multi-stage onboarding sessions that move blocks between storage tiers
and across instances.
## Leader Trait
The `Leader` trait defines the core coordination interface:
```rust,ignore
pub trait Leader: Send + Sync {
fn find_matches(&self, sequence_hashes: &[SequenceHash]) -> Result<FindMatchesResult>;
fn find_matches_with_options(
&self, sequence_hashes: &[SequenceHash], options: FindMatchesOptions,
) -> Result<FindMatchesResult>;
}
```
`find_matches` searches for blocks matching the given sequence hashes and returns
either an immediate result or an async session depending on the staging mode and
search scope.
## InstanceLeader
`InstanceLeader` is the primary implementation of `Leader`. It holds:
- `BlockManager<G2>` and optional `BlockManager<G3>` for local block registries
- A `ParallelWorkers` instance for driving transfer execution
- Session state for active onboarding operations
- Remote leader connections for cross-instance coordination
## FindMatchesResult
The result of `find_matches` is one of two variants:
- **`Ready`** -- Returned when `search_remote == false` AND `staging_mode == Hold`.
Blocks are held in place via RAII without creating a session. The `ReadyResult`
directly owns `Vec<ImmutableBlock<G2>>`.
- **`AsyncSession`** -- Returned when remote search or staging is required. Contains
a `SessionId`, a `watch::Receiver<OnboardingStatus>` for progress tracking, and
an optional `SessionHandle` for deferred control.
## StagingMode
Controls how matched blocks are staged and when the session completes:
| Mode | Behavior | Session Lifetime |
|------|----------|-----------------|
| `Hold` | Blocks remain in their current tiers (G2/G3) on original instances | Stays alive for deferred operations |
| `Prepare` | G3->G2 staging on all instances; no RDMA pulls | Stays alive after staging completes |
| `Full` | G3->G2 everywhere, then RDMA pull remote G2->local G2 | Completes when all blocks are in local G2 |
The progression `Hold -> Prepare -> Full` can be driven incrementally via
`SessionHandle::prepare()` and `SessionHandle::pull()`.
## OnboardingStatus State Machine
```text
Searching
|
+---> Holding { local_g2, local_g3, remote_g2, remote_g3, pending_g4, ... }
| |
| +---> (prepare) ---> Preparing { matched, staging_local, staging_remote }
| |
+---> Preparing ------------------>+
| |
| Prepared { local_g2, remote_g2 }
| |
| +---> (pull) ---> Staging { matched, ..., pulling }
| |
+---> Staging ------------------------------------------>+
|
Complete { matched_blocks }
```
Each status variant carries counters for progress tracking and cost analysis.
`Holding` includes G4 load tracking (`pending_g4`, `loaded_g4`, `failed_g4`).
## SessionHandle
`SessionHandle` provides deferred control over `Hold` and `Prepare` sessions:
- `prepare()` -- Trigger G3->G2 staging (Hold -> Prepare transition)
- `pull()` -- Trigger RDMA pull of remote G2->local G2 (Prepare -> Complete)
- `cancel()` -- Cancel session and release all held blocks
A `SessionHandle` is not available for `StagingMode::Full` sessions, which run to completion automatically.
## BlockAccessor
`BlockAccessor` provides a stateless, `Send + Sync` interface for policy-based
block scanning. Each `find()` call independently searches G2 then G3, acquiring
blocks via RAII. The companion `PolicyContext` adds result collection via
`yield_item()` for streaming scan results back to the caller.
# Object Storage Module
The object module provides traits and implementations for storing KV cache
blocks in object storage systems (S3, MinIO). This corresponds to the G4
(object store) tier in the storage hierarchy.
## ObjectBlockOps Trait
The primary trait for block-level object storage operations:
| Method | Purpose |
|--------|---------|
| `has_blocks(keys)` | Check existence and size of blocks |
| `put_blocks(keys, src_layout, block_ids)` | Upload blocks using logical layout handle |
| `get_blocks(keys, dst_layout, block_ids)` | Download blocks using logical layout handle |
| `put_blocks_with_layout(keys, layout, block_ids)` | Upload using resolved physical layout |
| `get_blocks_with_layout(keys, layout, block_ids)` | Download using resolved physical layout |
### Logical vs Physical Layout
The trait offers two APIs for put/get:
- **Logical** (`put_blocks` / `get_blocks`): Takes a `LogicalLayoutHandle` (G1, G2, G3).
Workers resolve this to their own physical layout internally. Used by the leader
(which doesn't have physical layouts) and by `CoordinatedWorker`.
- **Physical** (`put_blocks_with_layout` / `get_blocks_with_layout`): Takes a resolved
`PhysicalLayout` directly. Used by `PhysicalWorker` after resolving its handles, and
by `S3ObjectBlockClient` which performs the actual I/O.
## Key Formatting
Keys map `SequenceHash` values to object storage paths:
- **`DefaultKeyFormatter`**: Uses the hash's Display representation
(e.g., `0:abc123`). Suitable for single-worker scenarios.
- **`RankPrefixedKeyFormatter`**: Prefixes with worker rank
(e.g., `0/0:abc123`). Required for SPMD workers where multiple workers
store the same logical block with different physical data.
The `create_key_formatter(rank)` factory returns the appropriate formatter.
## ObjectLockManager
Distributed locking protocol for coordinated offloads to prevent duplicate
uploads:
```text
has_meta(hash)
  → true  → skip (already offloaded)
  → false → try_acquire_lock(hash)
      → true  → transfer → create_meta(hash) → release_lock(hash)
      → false → skip (another instance owns it)
```
Uses conditional PUT (`If-None-Match: *`) for lock acquisition with deadline-based
expiry for stale lock recovery.
## S3 Implementation
The `s3` submodule (feature-gated behind `s3`) provides:
- **`S3ObjectBlockClient`**: Implements `ObjectBlockOps` for S3-compatible storage.
Supports concurrent uploads/downloads via a `rayon` thread pool and provides
contiguous-memory fast paths for aligned block data.
- **`S3LockManager`**: Implements `ObjectLockManager` using S3 conditional writes.
## Factory Functions
- **`create_object_client(config, rank)`**: Creates an `Arc<dyn ObjectBlockOps>`
from configuration. Selects the backend (S3 or future alternatives) based on
`ObjectClientConfig`.
- **`create_lock_manager(config, instance_id)`**: Creates an
`Arc<dyn ObjectLockManager>` for distributed lock coordination.
# Offload Module Developer Guide
This document provides implementation details for developers working on the offload pipeline. For high-level concepts and policy statements, see [offload.md](offload.md).
## Container-Based Architecture
### OffloadContainer
The container is the fundamental unit that flows through the pipeline:
```rust,ignore
struct OffloadContainer<T: BlockMetadata> {
/// Source blocks to transfer
blocks: Vec<SourceBlock<T>>,
/// Precondition event - Some before PreconditionAwaiter, None after
precondition: Option<EventHandle>,
/// Cancellation token (cloned from TransferHandle)
cancel_token: CancellationToken,
}
impl<T: BlockMetadata> OffloadContainer<T> {
/// Check if this container has been cancelled
fn is_cancelled(&self) -> bool {
self.cancel_token.is_requested()
}
/// Upgrade all blocks from Weak → Strong
/// Returns None if any block was evicted
fn upgrade(self) -> Option<UpgradedContainer<T>> {
// Implementation upgrades each SourceBlock
}
}
```
### OffloadBatch
Batches group multiple containers for efficient transfer:
```rust,ignore
struct OffloadBatch<T: BlockMetadata> {
containers: Vec<OffloadContainer<T>>,
}
impl<T: BlockMetadata> OffloadBatch<T> {
/// Total blocks across all containers
fn total_blocks(&self) -> usize {
self.containers.iter().map(|c| c.blocks.len()).sum()
}
/// Remove cancelled containers, return count removed
fn sweep_cancelled(&mut self) -> usize {
let before = self.containers.len();
self.containers.retain(|c| !c.is_cancelled());
before - self.containers.len()
}
/// Check if batch is empty
fn is_empty(&self) -> bool {
self.containers.is_empty()
}
}
```
### Data Transformations Per Stage
| Stage | Input | Output | Transform |
|-------|-------|--------|-----------|
| Enqueue | `Vec<SourceBlock<T>>` | `OffloadContainer<T>` | Wrap with token + precondition |
| PolicyEvaluator | `OffloadContainer<T>` | `OffloadContainer<T>` | Filter `blocks` vec |
| PreconditionAwaiter | `OffloadContainer<T>` | `OffloadContainer<T>` | Await event, set `precondition = None` |
| Batcher | `OffloadContainer<T>` | `OffloadBatch<T>` | Group by total block count |
| TransferExecutor | `OffloadBatch<T>` | `Vec<ImmutableBlock<T>>` | Sweep → Upgrade → Flat map |
---
## Token-Based Cancellation
### Token Lifecycle
1. **Creation**: At enqueue, create a `CancellationToken` pair
2. **Distribution**: Handle gets the token, container gets a clone
3. **Propagation**: Token travels with container through pipeline
4. **Termination**: Token is consumed at upgrade (commitment point)
```rust,ignore
// At enqueue
let (cancel_token, cancel_updater) = CancellationToken::new();
// Give to handle
let handle = TransferHandle { cancel_token: cancel_token.clone(), ... };
// Give to container
let container = OffloadContainer {
blocks,
precondition: Some(event),
cancel_token: cancel_token.clone(),
};
```
### CancellationToken API
```rust,ignore
impl CancellationToken {
/// Request cancellation (called by handle)
fn request(&self);
/// Check if cancellation requested
fn is_requested(&self) -> bool;
/// Await cancellation request (for select!)
async fn wait_requested(&self);
/// Await confirmation that all blocks released
fn wait_confirmed(&self) -> CancelConfirmation;
}
```
### PreconditionAwaiter Select Pattern
The awaiter uses `select!` to handle both event completion and cancellation:
```rust,ignore
async fn process(&self, mut container: OffloadContainer<T>) {
// Fast path: event already satisfied
if let Some(ref event) = container.precondition {
if event.is_done() {
container.precondition = None;
self.output_queue.push(container);
return;
}
}
// Slow path: select on event OR cancellation
if let Some(event) = container.precondition.take() {
tokio::select! {
_ = event.wait() => {
// Event satisfied, propagate
self.output_queue.push(container);
}
_ = container.cancel_token.wait_requested() => {
// Cancelled while waiting - drop container
tracing::debug!("Container cancelled during precondition wait");
// container dropped here
}
}
} else {
// No precondition, pass through
self.output_queue.push(container);
}
}
```
### CancellableQueue Sweep Mechanics
The queue supports active cancellation via sweeping:
```rust,ignore
impl<T: HasCancellationToken> CancellableQueue<T> {
/// Push item, reject if already cancelled
fn push(&self, item: T) -> bool {
if item.cancel_token().is_requested() {
return false;
}
self.inner.push(item);
true
}
/// Pop, skipping cancelled items
fn pop_valid(&self) -> Option<T> {
loop {
match self.inner.pop() {
Some(item) if item.cancel_token().is_requested() => continue,
other => return other,
}
}
}
/// Remove all cancelled items
fn sweep(&self) -> usize {
let mut removed = 0;
let mut kept = Vec::new();
while let Some(item) = self.inner.pop() {
if item.cancel_token().is_requested() {
removed += 1;
} else {
kept.push(item);
}
}
for item in kept {
self.inner.push(item);
}
removed
}
}
```
### Batch-Level Sweep
For `CancellableQueue<OffloadBatch<T>>`, sweeping removes cancelled containers within batches:
```rust,ignore
fn sweep(&self) -> usize {
let mut removed_containers = 0;
let mut kept_batches = Vec::new();
while let Some(mut batch) = self.inner.pop() {
// Remove cancelled containers from this batch
removed_containers += batch.sweep_cancelled();
// Keep batch if it still has containers
if !batch.is_empty() {
kept_batches.push(batch);
}
}
for batch in kept_batches {
self.inner.push(batch);
}
removed_containers
}
```
### Cancellation at Each Stage
| Stage | Mechanism | Behavior |
|-------|-----------|----------|
| PolicyEvaluator | Token check | Check `is_cancelled()` between block evaluations |
| PreconditionAwaiter | `select!` | Immediate drop if cancelled while waiting |
| Batcher Queue | CancellableQueue | Sweep removes cancelled containers |
| Executor Queue | CancellableQueue | Sweep removes cancelled containers from batches |
| TransferExecutor | Final sweep | `batch.sweep_cancelled()` before upgrade |
### Cancellation Boundary at Upgrade
```text
┌─────────────────────────────────────────────────────────────────────────┐
│ CANCELLABLE ZONE │
│ │
│ Enqueue → PolicyEval → PrecondAwaiter → Batcher → ExecutorQueue │
│ │
└─────────────────────────────────────────────────────────────────────────┘
┌───────────────────┐
│ sweep_cancelled │
│ (last check) │
└───────────────────┘
═══════════════════════════════════════════════════════════════════════════
UPGRADE BOUNDARY
═══════════════════════════════════════════════════════════════════════════
┌─────────────────────────────────────────────────────────────────────────┐
│ COMMITTED ZONE │
│ │
│ Upgrade → Flat Map → Transfer │
│ │
└─────────────────────────────────────────────────────────────────────────┘
```
---
## TransferExecutor Design
### Sweep → Upgrade → Flat Map → Transfer
```rust,ignore
impl<T: BlockMetadata, D: TransferDestination> TransferExecutor<T, D> {
async fn run(self) {
while let Some(mut batch) = self.input_queue.pop() {
// 1. SWEEP: Last cancellation check
batch.sweep_cancelled();
if batch.is_empty() {
continue;
}
// 2. UPGRADE: Weak → Strong (commitment point)
let upgraded: Vec<UpgradedContainer<T>> = batch
.containers
.into_iter()
.filter_map(|c| c.upgrade())
.collect();
if upgraded.is_empty() {
continue;
}
// 3. FLAT MAP: Consolidate into single vec
let all_blocks: Vec<ImmutableBlock<T>> = upgraded
.into_iter()
.flat_map(|c| c.blocks)
.collect();
// 4. TRANSFER: Execute via destination
self.destination.execute_transfer(all_blocks).await;
}
}
}
```
### Generic TransferDestination Trait
```rust,ignore
trait TransferDestination {
type Output;
async fn execute_transfer(
&self,
blocks: Vec<ImmutableBlock<T>>,
src_layout: LogicalLayoutHandle,
) -> Result<Self::Output>;
}
```
### Block Destination (G2, G3)
For transfers to another `BlockManager`:
```rust,ignore
struct BlockDestination<Dst: BlockMetadata> {
leader: Arc<InstanceLeader>,
dst_manager: Arc<BlockManager<Dst>>,
src_layout: LogicalLayoutHandle,
dst_layout: LogicalLayoutHandle,
}
impl<Dst: BlockMetadata> TransferDestination for BlockDestination<Dst> {
type Output = Vec<ImmutableBlock<Dst>>;
async fn execute_transfer(&self, blocks: Vec<ImmutableBlock<_>>) -> Result<Self::Output> {
// 1. Allocate destination blocks
let dst_blocks = self.dst_manager.allocate_blocks(blocks.len())?;
// 2. Execute transfer via leader
let notification = self.leader.execute_local_transfer(
self.src_layout,
self.dst_layout,
src_block_ids,
dst_block_ids,
)?;
notification.await?;
// 3. Register destination blocks
let registered = dst_blocks.into_iter()
.zip(sequence_hashes)
.map(|(block, hash)| self.dst_manager.register_with_hash(block, hash))
.collect();
Ok(registered)
}
}
```
### Object Destination (G4)
For transfers to object storage:
```rust,ignore
struct ObjectDestination {
object_ops: Arc<dyn ObjectBlockOps>,
src_layout: LogicalLayoutHandle,
lock_manager: Option<Arc<dyn ObjectLockManager>>,
}
impl TransferDestination for ObjectDestination {
type Output = Vec<SequenceHash>;
async fn execute_transfer(&self, blocks: Vec<ImmutableBlock<_>>) -> Result<Self::Output> {
// 1. Extract keys and block IDs
let keys: Vec<SequenceHash> = blocks.iter().map(|b| b.sequence_hash()).collect();
let block_ids: Vec<BlockId> = blocks.iter().map(|b| b.block_id()).collect();
// 2. Execute object put
let results = self.object_ops.put_blocks(keys.clone(), self.src_layout, block_ids).await;
// 3. Handle lock management
if let Some(lock_manager) = &self.lock_manager {
for hash in &successful_hashes {
lock_manager.create_meta(*hash).await?;
lock_manager.release_lock(*hash).await?;
}
}
Ok(successful_hashes)
}
}
```
---
## Batcher Design
### Grouping Containers
The batcher accumulates containers and flushes when:
- Total blocks reach `max_batch_size`
- Flush interval expires and `min_batch_size` is met
- All blocks for a transfer have been processed (sentinel flush)
```rust,ignore
struct Batcher<T: BlockMetadata> {
config: BatchConfig,
input_queue: Arc<CancellableQueue<OffloadContainer<T>>>,
output_queue: Arc<CancellableQueue<OffloadBatch<T>>>,
current_batch: OffloadBatch<T>,
}
impl<T: BlockMetadata> Batcher<T> {
async fn run(mut self) {
let mut flush_timer = tokio::time::interval(self.config.flush_interval);
loop {
tokio::select! {
_ = flush_timer.tick() => {
self.try_flush().await;
}
Some(container) = self.input_queue.pop_valid() => {
self.current_batch.containers.push(container);
if self.current_batch.total_blocks() >= self.config.max_batch_size {
self.flush().await;
}
}
}
}
}
async fn try_flush(&mut self) {
if self.current_batch.total_blocks() >= self.config.min_batch_size {
self.flush().await;
}
}
async fn flush(&mut self) {
if self.current_batch.is_empty() {
return;
}
let batch = std::mem::replace(
&mut self.current_batch,
OffloadBatch { containers: Vec::new() },
);
self.output_queue.push(batch);
}
}
```
### Preserving Per-Container Cancellability
Each container retains its own `cancel_token`. When the batch is in the executor queue:
1. **Sweep at queue level**: Removes cancelled containers from batches
2. **Sweep at executor**: Final check before upgrade
3. **Partial cancellation**: Some containers may be cancelled while others proceed
---
## Extension Rules
### Adding a New Policy
1. Implement the `OffloadPolicy` trait
2. Add to pipeline configuration
3. Policy must be fast or async-compatible
```rust,ignore
trait OffloadPolicy<T: BlockMetadata>: Send + Sync {
fn name(&self) -> &str;
fn evaluate(&self, ctx: &EvalContext<T>) -> impl Future<Output = Result<bool>>;
}
```
### Adding a New Destination Type
1. Implement `TransferDestination` trait
2. Create a new pipeline variant or use generic executor
3. Handle destination-specific registration/cleanup
### Maintaining Cancellation Invariants
When modifying the pipeline:
1. **Never skip the upgrade boundary** - It's the commitment point
2. **Always sweep before upgrade** - Last chance to cancel
3. **Token must travel with container** - Don't strip it prematurely
4. **Batches preserve container identity** - Until flat map
---
## Testing Guidance
### Unit Tests
- Test each stage in isolation
- Mock `CancellationToken` for cancel scenarios
- Verify sweep removes correct items
### Integration Tests
- Test full pipeline with cancel at each stage
- Verify no orphaned blocks after cancellation
- Test partial batch cancellation
### Performance Tests
- Measure overhead of cancellation checks
- Benchmark sweep operation at scale
- Profile upgrade → flat map → transfer path
# Offload Module
The offload module manages the asynchronous transfer of KV cache blocks between storage tiers. It provides a pipeline-based architecture for evaluating, batching, and executing block transfers with full cancellation support.
## Overview
Offloading moves blocks from a source tier (e.g., GPU memory) to a destination tier (e.g., host memory, remote storage, or object storage). The pipeline ensures:
- **Policy-based filtering**: Only blocks meeting criteria are transferred
- **Batched execution**: Blocks are grouped for efficient transfer
- **Cancellation support**: Transfers can be cancelled at any point before commitment
- **Precondition synchronization**: Transfers wait for forward pass completion
## Pipeline Architecture
```text
┌─────────────────┐ ┌─────────────────────┐ ┌─────────────────────┐ ┌──────────────────┐
│ PolicyEvaluator │────►│ PreconditionAwaiter │────►│ Batcher │────►│ TransferExecutor │
└─────────────────┘ └─────────────────────┘ └─────────────────────┘ └──────────────────┘
▲ ▲
│ │
CancellableQueue CancellableQueue
│ │
└──────── CancelSweeper ───┘
```
### Stages
| Stage | Purpose |
|-------|---------|
| **PolicyEvaluator** | Filters blocks based on configured policies (frequency, presence, etc.) |
| **PreconditionAwaiter** | Waits for forward pass completion before proceeding |
| **Batcher** | Groups containers into batches based on total block count |
| **TransferExecutor** | Upgrades blocks and executes the actual transfer |
## Container Data Model
The fundamental unit flowing through the pipeline is an **OffloadContainer**:
```rust,ignore
struct OffloadContainer<T: BlockMetadata> {
/// The blocks to offload
blocks: Vec<SourceBlock<T>>,
/// Precondition event (forward pass completion)
precondition: Option<EventHandle>,
/// Cancellation token
cancel_token: CancellationToken,
}
```
Containers are grouped into batches for efficient transfer:
```rust,ignore
struct OffloadBatch<T: BlockMetadata> {
/// Multiple containers, each independently cancellable
containers: Vec<OffloadContainer<T>>,
}
```
### P1: Container is the Unit of Cancellation
Individual blocks within a container are not independently cancellable. When a container is cancelled, all its blocks are cancelled together.
### P2: Token Travels with Container
Each container carries its own `CancellationToken`, cloned from the `TransferHandle` at enqueue time. The token travels with the container through all pipeline stages until upgrade.
### P3: Upgrade is the Commitment Boundary
The upgrade step (Weak → Strong) is the point of no return:
- **Before upgrade**: Containers can be cancelled via sweep or token check
- **After upgrade**: We own the blocks; cancellation no longer applies
### P4: Sweep Before Upgrade
The last cancellation check occurs immediately before upgrade. The `TransferExecutor` calls `batch.sweep_cancelled()` to remove cancelled containers before committing.
### P5: Flat Map After Upgrade
After upgrade, all blocks from all containers are consolidated into a single `Vec<ImmutableBlock<T>>` for efficient batch transfer. Per-container identity is lost at this point.
### P6: PreconditionAwaiter Uses Select
The precondition awaiter uses `select!` to wait on both the precondition event and the cancellation token, so a container can be cancelled while it is waiting. If cancellation is requested first, the container is dropped immediately.
## Configuration
Pipeline behavior is controlled via `PipelineConfig`:
| Option | Default | Description |
|--------|---------|-------------|
| `batch_config.max_batch_size` | 64 | Maximum blocks per batch |
| `batch_config.min_batch_size` | 8 | Minimum blocks before flush |
| `batch_config.flush_interval` | 10ms | Time before flushing partial batch |
| `policy_timeout` | 100ms | Timeout for policy evaluation |
| `sweep_interval` | 10ms | Interval for cancel sweeper |
| `max_concurrent_transfers` | 1 | Concurrent transfer batches |
## Usage
### Enqueueing Blocks
```rust,ignore
let handle = pipeline.enqueue(source_blocks, precondition_event);
// Track progress
println!("Status: {:?}", handle.status());
// Wait for completion
let result = handle.wait().await?;
```
### Cancelling a Transfer
```rust,ignore
// Request cancellation and wait for confirmation
handle.cancel().await;
// All blocks are now released
```
## Related Documentation
- [offload-developer.md](offload-developer.md) - Implementation details and extension rules
# Onboarding Guide
Welcome to `kvbm-engine`. This document walks you through the core abstractions
in the crate so you can orient yourself quickly and start contributing.
`kvbm-engine` is the distributed coordination layer for KV cache block management
(KVBM). It sits above `kvbm-physical` (which moves bytes) and `kvbm-logical`
(which tracks block metadata), stitching them together into a system where
**leaders** make decisions about blocks and **workers** execute data transfers
across a tiered storage hierarchy:
```text
G1 (GPU HBM) → G2 (Pinned DRAM) → G3 (NVMe/SSD) → G4 (S3/MinIO)
```
The central design tension is between **logical** and **physical**. Leaders think
in sequence hashes and block identities — they never touch raw memory. Workers
think in layout handles, transfer managers, and DMA descriptors — they never make
placement decisions. The engine holds these two worlds together.
---
## The Worker
A worker is the physical side of the logical-physical dichotomy. The core
implementation is `PhysicalWorker`, a thin coordination wrapper around
`kvbm-physical`.
A `PhysicalWorker` owns:
- A **`TransferManager`** — the `kvbm-physical` engine that actually moves data
between memory regions via NIXL (RDMA/UCX), NVMe, or object storage APIs.
- **Layout handles** for up to three tiers (`g1_handle`, `g2_handle`,
`g3_handle`) — these are physical memory region registrations that the transfer
manager uses to know *where* data lives on this process.
- A map of **remote handles** — physical handles imported from peer workers,
enabling RDMA pulls.
Workers implement two traits:
**`WorkerTransfers`** defines the transfer operations:
- `execute_local_transfer(src, dst, block_ids, ...)` — move blocks between tiers
within this worker (e.g. G2 → G1).
- `execute_remote_onboard(remote_desc, dst, block_ids, ...)` — RDMA pull from a
remote worker into a local layout.
- `execute_remote_offload(src, remote_desc, block_ids, ...)` — push local data
to a remote descriptor.
- `connect_remote(instance_id, metadata)` — import a peer's NIXL metadata so we
can do RDMA to/from them.
**`Worker`** extends `WorkerTransfers` with layout handle accessors and metadata
import/export for RDMA setup.
All transfer operations return a `TransferCompleteNotification` — an async handle
you await to know when the data movement is done. This is how the system achieves
overlap between transfer scheduling and transfer execution.
---
## Workers as Remote Services (Velo)
In a multi-process deployment, each worker runs in its own process. Rather than
calling `PhysicalWorker` methods directly, we wrap it as a Velo RPC service.
**`VeloWorkerService`** takes a `PhysicalWorker` and registers handlers for
every `WorkerTransfers` and `Worker` method (e.g. `kvbm.worker.local_transfer`,
`kvbm.worker.remote_onboard`, etc.). The service lives in the worker process.
**`VeloWorkerClient`** implements the same `Worker` trait but serializes each
call into a Velo message, sends it to the remote service, and returns a
`TransferCompleteNotification` backed by a completion event.
The key insight: **from the leader's perspective, local and remote workers are
interchangeable.** Both implement `Worker`. The leader never knows (or cares)
whether it is talking to an in-process `PhysicalWorker` or a `VeloWorkerClient`
that crosses a process boundary.
```text
Leader process Worker process
┌───────────────────┐ ┌───────────────────┐
│ InstanceLeader │ │ │
│ │ │ │ │
│ CoordinatedWorker│ │ │
│ │ │ │ │
│ VeloWorkerClient │ ── Velo RPC ──▶ │ VeloWorkerService │
│ │ │ │ │
│ │ │ PhysicalWorker │
│ │ │ (TransferManager)│
└───────────────────┘ └───────────────────┘
```
There is one more wrapper to mention: **`CoordinatedWorker`**. This lives in the
leader process and adds coordination state on top of a `Worker` (local or
remote). It tracks the leader's view of which layout handles map to which
remote instances and ranks. When the leader says "pull blocks from Instance B,
rank 0", the `CoordinatedWorker` resolves the correct physical handle and
delegates to the inner `Worker`.
---
## Worker Groups
Workers can be organized into groups that present a single-worker interface to
the leader. The `ParallelWorkers` trait is the group-level analog of `Worker`.
### Tensor Parallel (SPMD)
`SpmdParallelWorkers` is the default group implementation. It broadcasts every
operation to all N workers in parallel — the SPMD (Single Program, Multiple Data)
model.
In a typical tensor-parallel deployment, each GPU holds its own shard of every
KV cache block. When the leader says "transfer blocks [1, 2, 3] from G2 to G1",
the SPMD group fans this out to every rank. Each rank executes the same transfer
on its own shard. Results are aggregated before returning to the leader.
```text
Leader: "transfer blocks 1,2,3 from G2 → G1"
SpmdParallelWorkers
┌────┼────┐
▼ ▼ ▼
Rank0 Rank1 Rank2 (each transfers its own shard)
```
### Replicated Data (MLA)
For Multi-head Latent Attention (MLA), KV data is replicated rather than sharded.
The `ReplicatedDataWorker` (feature-gated behind `collectives`) implements a
different strategy:
- **Rank 0** is the only worker with G2 and G3 storage. It performs all
tier-to-tier transfers (G3 → G2 → G1).
- **Ranks 1..N** only have G1. They receive data from rank 0 via NCCL
`broadcast`.
This means the leader can still say "onboard these blocks" and the group handles
the asymmetry internally — rank 0 does the heavy lifting, then broadcasts to
everyone else.
### The Power of the Abstraction
These two strategies — symmetric sharding and replicated broadcast — are very
different physically, but the leader drives both through the same
`ParallelWorkers` / `WorkerTransfers` interface. This is the core value of worker
groups: **different parallelism strategies behind a uniform API**.
The abstraction is admittedly incomplete — more parallelism patterns will need
more group implementations — but it is sufficient for the two use cases presented
and demonstrates the pattern for extending it.
---
## The Leader
The leader is the logical counterpart to the worker. `InstanceLeader` owns the
logical view of all block data, regardless of how it is physically distributed
across workers and tiers.
An `InstanceLeader` holds:
- A **`BlockRegistry`** for deduplication — tracks which sequence hashes have
been seen.
- A **`BlockManager<G2>`** (required) and optional **`BlockManager<G3>`** — the
logical block stores for host DRAM and disk.
- A list of **workers** (via `CoordinatedWorker`) and an optional
**`SpmdParallelWorkers`** group.
- A map of **sessions** for distributed onboarding (more on this below).
- Optional **remote leader** references for cross-instance coordination.
### find_matches
The core entry point is `find_matches(sequence_hashes)`. Given a list of
sequence hashes, the leader determines which blocks already exist and where:
1. Search the local G2 `BlockManager` for matches.
2. Search the local G3 `BlockManager` for any remaining hashes.
3. Optionally search remote leaders via distributed sessions.
The result is either:
- **`Ready`** — all requested blocks were found locally in G2; the caller gets
immediate RAII `BlockHolder`s.
- **`AsyncSession`** — some blocks require staging (G3 → G2) or remote transfers;
the caller gets a session handle with a status watch channel.
### BlockHolder (RAII Ownership)
`BlockHolder<T>` (where T is `G2` or `G3`) is an RAII guard that holds blocks
during a session. While held, those blocks cannot be evicted. When the holder is
dropped, blocks are released. This prevents leaks even if session handling
panics.
### Block Scanning
`InstanceLeader` also exposes `scan_with_policy` — a flexible iteration
mechanism where the caller provides a closure that searches for blocks using a
`BlockAccessor` (which wraps both G2 and G3 managers) and yields results through
a `PolicyContext`. This enables custom scanning strategies (contiguous runs,
LFU-sorted scans) without exposing block manager internals.
---
## Instances
An **Instance** is the deployment unit: one leader plus its workers.
```text
┌─ Instance (TP=2) ──────────────────────────┐
│ │
│ InstanceLeader │
│ │ │
│ SpmdParallelWorkers │
│ ├── Worker (rank 0, GPU 0) │
│ └── Worker (rank 1, GPU 1) │
│ │
└─────────────────────────────────────────────┘
```
In a single-GPU setup, the instance is simply one leader and one worker.
In tensor-parallel, it is one leader driving an SPMD group.
The leader drives; the workers execute. The leader never touches bytes; the
workers never make placement decisions.
---
## Transfer Classification
Transfers fall into three classes based on scope:
### Local (intra-worker, intra-instance)
Tier-to-tier transfers within a single worker: G1 ↔ G2, G2 ↔ G3, etc.
This is the bread and butter of a tensor-parallel deployment. Each worker
independently moves its own shard between tiers. The SPMD group broadcasts the
same logical operation to all ranks, and each rank executes it on its own
physical layouts.
### Intra (inter-worker, intra-instance)
Transfers between workers within the same instance. The motivating example is the
MLA/replicated data pattern: rank 0 performs a G3 → G2 → G1 transfer, then
NCCL broadcasts its G1 data to all other ranks. The data crosses worker
boundaries but stays within the same instance.
### Inter (inter-worker, inter-instance)
Transfers between workers on different instances. This is **distributed KVBM** —
the peer-to-peer model described in the next section.
```text
┌──────────────────────────────┐
│ Local │
│ (intra-worker, intra-inst) │
│ G2 ←→ G1 on Rank 0 │
└──────────────────────────────┘
┌──────────────────────────────────────────────┐
│ Intra │
│ (inter-worker, intra-inst) │
│ Rank 0 ──NCCL bcast──▶ Rank 1..N │
└──────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────┐
│ Inter │
│ (inter-worker, inter-inst) │
│ Instance A, Rank 0 ──RDMA──▶ Instance B, Rank 0 │
└──────────────────────────────────────────────────────────────┘
```
---
## Distributed KVBM (Inter-Instance Transfers)
Distributed KVBM is a peer-to-peer model where two or more instances coordinate
block ownership through **sessions**, then trigger direct worker-to-worker
transfers.
### Sessions
A session is a short-lived coordination protocol between two instances. There are
two roles:
- **`InitiatorSession`** — the requesting side (e.g. a Prefill instance that
needs blocks).
- **`ResponderSession`** — the providing side (e.g. a Decode instance that has
blocks cached).
Sessions progress through a state machine:
```text
Searching ──▶ Holding ──▶ Staging ──▶ Ready ──▶ Complete
(or Failed)
```
- **Searching**: The initiator asks the responder to search its local block
managers.
- **Holding**: The responder has found blocks and holds them via `BlockHolder` to
prevent eviction.
- **Staging**: G3 → G2 promotion is in progress on the responder (if blocks were
on disk). NIXL descriptors are prepared for RDMA.
- **Ready**: Blocks are in G2 on the responder and RDMA-ready.
- **Complete**: The initiator has pulled all blocks. The session is torn down.
### Worked Example: TP=2 Cross-Instance Transfer
Suppose Instance A (Prefill, TP=2) wants KV blocks for sequence hashes
`[hash_1, hash_2]` from Instance B (Decode, TP=2).
```text
Instance A (Prefill, TP=2) Instance B (Decode, TP=2)
┌──────────────────────┐ ┌──────────────────────┐
│ Leader A │ │ Leader B │
│ ├─ Worker A0 (GPU0) │ │ ├─ Worker B0 (GPU0) │
│ └─ Worker A1 (GPU1) │ │ └─ Worker B1 (GPU1) │
└──────────────────────┘ └──────────────────────┘
```
The flow:
1. **Leader A creates a session** with Leader B, sending the sequence hashes
`[hash_1, hash_2]` it is looking for.
2. **Leader B receives the request** (`ResponderSession`). It searches its G2
and G3 block managers for matches.
3. **Leader B acquires ownership** of the matched blocks via `BlockHolder`,
preventing eviction during the transfer.
4. **Leader B responds** with what it found: which hashes matched, their
tier locations, and NIXL descriptors that allow RDMA access to the G2 blocks.
5. **Leader A instructs its workers to pull.** Since both instances use TP=2, the
mapping is 1:1 — rank 0 on A pulls from rank 0 on B, rank 1 on A pulls from
rank 1 on B. Each pull is a direct RDMA transfer between the worker processes
using NIXL.
6. **Session completes.** Leader B releases its `BlockHolder`s. Leader A now has
the blocks in its own G2.
The rank mapping is handled by `route_local_to_remote` in `LeaderState`, which
supports asymmetric configurations too (e.g. TP=4 pulling from TP=2).
### Transport
Session messages travel over **Velo** (the project's RPC framework).
`VeloLeaderService` registers handlers for `kvbm.leader.onboard`,
`kvbm.leader.remote_session`, and `kvbm.leader.session` — these dispatch
incoming messages to the appropriate per-session channels.
For testing, `LocalTransport` provides direct in-process dispatch without
network overhead.
---
## Objects vs Blocks
Throughout the crate, you will encounter two distinct representations of
KV cache data:
### Blocks
A **Block** is the fundamental unit within tiers G1–G3. It is identified by a
`BlockId`, associated with a `SequenceHash`, and managed by a `BlockManager`.
Blocks have physical backing (GPU HBM, pinned DRAM, or NVMe) and support
direct memory transfers via NIXL. The `BlockManager` handles allocation,
eviction, and frequency tracking. Blocks are the hot-path, low-latency
representation.
### Objects
An **Object** is the G4 (S3/MinIO) representation. Objects are addressed by
**key** (derived from a `SequenceHash` via a `KeyFormatter`), not by `BlockId`.
The `ObjectBlockOps` trait defines the interface: `has_blocks`, `put_blocks`,
`get_blocks`.
Objects exist because S3 does not support the block-oriented, handle-based access
pattern of the lower tiers. They provide unlimited-capacity cold storage at the
cost of higher latency and a key-value access model.
For SPMD deployments, the `RankPrefixedKeyFormatter` prefixes each object key
with the worker rank (`{rank}/{hash}`), so each worker's shard is stored
independently.
The `ObjectLockManager` provides distributed locking for G4 writes using
conditional S3 PUTs, preventing duplicate uploads across concurrent instances.
---
## Where to Go Next
Now that you have the conceptual model, dive into the per-module documentation
for implementation details:
| Document | Covers |
|----------|--------|
| [architecture.md](architecture.md) | Tier model, module map, feature flags, quick start |
| [leader.md](leader.md) | `Leader` trait, `InstanceLeader`, `FindMatchesResult`, staging modes |
| [worker.md](worker.md) | `Worker` / `WorkerTransfers`, `PhysicalWorker`, `CoordinatedWorker`, Velo layer |
| [worker-group.md](worker-group.md) | `SpmdParallelWorkers`, fan-out, rank-aware routing |
| [session.md](session.md) | Session protocol, initiator/responder/controllable, message types, state machine |
| [offload.md](offload.md) | Offload pipeline stages, policies, cancellation |
| [object.md](object.md) | G4 storage, S3 client, lock manager |
| [runtime.md](runtime.md) | `KvbmRuntime` construction and shared infrastructure |
| [testing.md](testing.md) | Test utilities, multi-instance fixtures, RDMA transfer tests |
To run the test suite:
```bash
cargo test -p kvbm-engine --features testing
```
# Runtime
The `KvbmRuntime` is the composed shared infrastructure for KVBM operations. It bundles
the minimal set of components that all downstream managers and services need:
- **Tokio runtime** -- async execution context (owned or borrowed handle)
- **Messenger (Velo)** -- distributed RPC for leader/worker communication and peer discovery
- **NixlAgent** -- RDMA/UCX data transfers (optional, disabled when NIXL config is absent)
- **EventManager** -- worker coordination and transfer completion notifications (accessed via Messenger)
## Construction
Two quick constructors cover the common case:
```rust,ignore
// Leader role (reads KVBM_* env vars + TOML files)
let runtime = KvbmRuntime::from_env_leader().await?;
// Worker role
let runtime = KvbmRuntime::from_env_worker().await?;
```
For tests or custom setups, use the builder:
```rust,ignore
let config = KvbmConfig::from_env()?;
let runtime = KvbmRuntime::builder(config)
.with_runtime_handle(Handle::current()) // inject existing tokio runtime
.with_messenger(messenger) // inject pre-built Messenger
.with_nixl_agent(agent) // inject pre-built NixlAgent
.build_leader()
.await?;
```
`KvbmRuntimeBuilder::from_json(json)` is the primary entrypoint for vLLM's
`kv_connector_extra_config` dict -- JSON values have highest priority, overriding
env vars, TOML files, and defaults.
## Component access
| Method | Returns | Notes |
|---------------------|------------------------------|---------------------------------------|
| `handle()` / `tokio()` | `tokio::runtime::Handle` | Borrowed or owned runtime handle |
| `messenger()` | `&Arc<Messenger>` | Velo RPC |
| `nixl_agent()` | `Option<&NixlAgent>` | `None` when NIXL disabled in config |
| `event_system()` | `Arc<velo::EventManager>` | From Messenger, used for transfer notifications |
| `config()` | `&KvbmConfig` | Full configuration snapshot |
## RuntimeHandle
`RuntimeHandle` is an enum that abstracts over owned (`Arc<Runtime>`) and borrowed
(`Handle`) tokio runtimes. The builder creates an owned runtime from config when none
is injected.
# Session Module
The session module manages distributed block transfer sessions between
instances. Sessions coordinate the search, staging, and RDMA transfer of
KV cache blocks between a requesting instance (Prefill) and a serving
instance (Decode).
## Protocol Overview
### Onboard Protocol (InitiatorSession ↔ ResponderSession)
Multi-peer search and staging using `OnboardMessage`:
```text
Initiator (Prefill) Responder (Decode)
│ │
│──── CreateSession ────────────▶│
│ │ search local G2/G3
│◀─── G2Results ────────────────│
│◀─── G3Results ────────────────│ (if G3 blocks found)
│◀─── SearchComplete ───────────│
│ │
│──── HoldBlocks ───────────────▶│
│◀─── Acknowledged ─────────────│
│ │
│──── StageBlocks ──────────────▶│ G3→G2 staging (optional)
│◀─── BlocksReady ──────────────│
│ │
│ RDMA pull (remote G2→local G2)
│ │
│──── CloseSession ─────────────▶│
```
When G4 (object storage) is configured, the initiator also runs a parallel
G4 search via internal `G4Results`/`G4LoadComplete` messages (not sent over
the network).
### Unified Session Protocol (SessionHandle ↔ ServerSession)
Point-to-point session using `SessionMessage`:
```text
Controller (Prefill) ServerSession (Decode)
│ │
│──── Attach ───────────────────▶│
│◀─── StateResponse ────────────│ (current state snapshot)
│ │
│──── TriggerStaging ───────────▶│ (if G3 blocks pending)
│◀─── BlocksStaged ────────────│ (newly staged blocks)
│ │
│ RDMA pull (remote G2→local G2)
│ │
│──── BlocksPulled ─────────────▶│ (release pulled blocks)
│──── Detach ───────────────────▶│
```
Control can be transferred bidirectionally via `YieldControl`/`AcquireControl`.
For layerwise transfer, `BlocksStaged` includes an optional `layer_range`.
## Session Types
| Session | Role | Protocol | Description |
|---------|------|----------|-------------|
| **ServerSession** | Holds blocks, exposes for pull | SessionMessage | Merged from former EndpointSession + ControllableSession |
| **SessionHandle** | Client-side control | SessionMessage | Attach/detach, state queries, RDMA pulls |
| **InitiatorSession** | Multi-peer search orchestrator | OnboardMessage | Created by InstanceLeader for distributed search |
| **ResponderSession** | Responds to search requests | OnboardMessage | Searches local G2/G3, holds blocks, stages on request |
### ServerSession
Server-side session that holds blocks and exposes them for remote RDMA pull.
Supports two modes:
- **G2-only**: Blocks are already in G2 with pre-assigned layout handles
(`BlockMetadataMap::Direct`). `TriggerStaging` is a no-op. Created via
`ServerSession::new_g2_only()` or the `create_server_session()` factory.
- **Staging**: G3 blocks need to be staged to G2. Layout handles are assigned
round-robin across workers (`BlockMetadataMap::RoundRobin`). Supports
`auto_stage` option via `ServerSessionOptions`. Created via
`ServerSession::new_with_staging()`.
Created from `InstanceLeader` via:
- `create_endpoint_session()` / `create_endpoint_session_for_blocks()` — G2-only
- `create_controllable_session()` / `create_controllable_session_with_options()` — with staging
**ServerSessionHandle** provides local control: `notify_layers_ready()` for
layerwise transfer notifications, and `close()` for graceful shutdown.
### InitiatorSession
The requesting side. Sends `CreateSession` to one or more remote instances,
collects results, applies first-responder-wins deduplication, and orchestrates
staging and RDMA pulls. Supports three staging modes:
- **Hold**: Find and hold blocks (G2+G3), no staging
- **Prepare**: Stage G3→G2 everywhere, keep session alive
- **Full**: Stage G3→G2 + RDMA pull remote G2→local G2, session completes
Created by `InstanceLeader::find_matches_with_options()` when
`search_remote == true`.
### ResponderSession
The serving side. Receives `CreateSession`, searches local block managers
(G2 then G3 for remaining), holds matched blocks via `BlockHolder`, and
responds with match results. Handles staging requests and keeps blocks
alive until the session ends.
## Core Building Blocks
### BlockHolder
RAII container for holding blocks during sessions. Tier-agnostic (`BlockHolder<G2>`,
`BlockHolder<G3>`). Blocks are automatically released when the holder is dropped,
preventing leaks even if session handling panics. Key operations: `retain()`,
`release()`, `extend()`, `take_all()`.
### SessionEndpoint
Point-to-point session primitive with a state machine. Encapsulates:
- Identity (`session_id`, `instance_id`)
- State machine (`ControlRole` + `AttachmentState` + `SessionPhase`)
- Message receive channel (`mpsc::Receiver<SessionMessage>`)
- State publication via watch channel
- Transport for sending messages to peer
Used internally by `ServerSession`. Does NOT handle block holding or staging
logic — those are the caller's responsibility.
### SessionHandle
Handle for controlling a remote session. Supports:
- State observation: `current_state()`, `wait_for_ready()`, `wait_for_complete()`
- Control commands: `trigger_staging()`, `mark_blocks_pulled()`, `detach()`
- Bidirectional control: `yield_control()`, `acquire_control()`
- RDMA transfers: `pull_blocks_rdma()`, `pull_blocks_rdma_with_options()`
Used by the controller side (Prefill) to drive a remote `ServerSession` (Decode).
### SessionHandleStateTx
Sender side of the state observation channel. Used by the session receiver
task to forward `StateResponse` and `BlocksStaged` messages into the
watch channel that `SessionHandle` observes.
### Staging
Shared G3→G2 staging logic extracted into `staging::stage_g3_to_g2()`.
Core kernel: allocate G2 destinations → execute local transfer (G3→G2) →
register new blocks with source sequence hashes. Used by `InitiatorSession`,
`ResponderSession`, and `ServerSession` to avoid code duplication.
## Transport Layer
`MessageTransport` is an enum with two variants:
- **`VeloTransport`**: Uses Velo active messages for distributed
communication between instances.
- **`LocalTransport`**: Direct channel dispatch for in-process testing
without network overhead.
Methods:
- `send()` — Send an `OnboardMessage` to a target instance
- `send_session()` — Send a `SessionMessage` to a target instance
- `request_metadata()` — RPC call to get remote worker layout metadata for RDMA
## Message Types
| Type | Direction | Purpose |
|------|-----------|---------|
| `OnboardMessage` | Initiator ↔ Responder | Block search, hold/drop, staging requests |
| `SessionMessage` | Controller ↔ ServerSession | Attach/detach, control transfer, block ops, state sync |
### OnboardMessage Variants
| Variant | Sender | Description |
|---------|--------|-------------|
| `CreateSession` | Initiator | Start new session with sequence hashes |
| `G2Results` | Responder | G2 search matches (sequence hashes + block IDs) |
| `G3Results` | Responder | G3 search matches (sequence hashes only) |
| `SearchComplete` | Responder | All local searching done |
| `HoldBlocks` | Initiator | Which blocks to hold vs drop |
| `Acknowledged` | Responder | Hold/drop processed |
| `StageBlocks` | Initiator | G3 hashes to stage to G2 |
| `BlocksReady` | Responder | Newly staged G2 blocks ready |
| `ReleaseBlocks` | Initiator | Release specific blocks |
| `CloseSession` | Initiator | Session complete, clean up |
| `G4Results` | Internal | Object storage search results (not sent over network) |
| `G4LoadComplete` | Internal | Object storage load results (not sent over network) |
### SessionMessage Variants
| Variant | Category | Description |
|---------|----------|-------------|
| `Attach` | Connection | Peer attaches with a control role |
| `Detach` | Connection | Peer detaches gracefully |
| `YieldControl` | Control | Sender yields controller role |
| `AcquireControl` | Control | Sender acquires controller role |
| `TriggerStaging` | Block ops | Request G3→G2 staging |
| `HoldBlocks` | Block ops | Request blocks be held |
| `ReleaseBlocks` | Block ops | Release specific blocks |
| `BlocksPulled` | Block ops | Notify blocks were pulled via RDMA |
| `StateResponse` | State sync | Full state snapshot (phase, role, blocks) |
| `BlocksStaged` | State sync | Newly staged blocks (with optional layer range) |
| `Close` | Lifecycle | Graceful session close |
| `Error` | Lifecycle | Report error |
## State Machine
### SessionPhase
Lifecycle of block operations. Staging is optional — blocks already in the
target tier (G2) skip it:
```text
Searching → Holding ──────────────────── Ready → Complete
               └── Staging ────────┘
               └── Complete (direct pull, no staging needed)
               └── Failed
```
### ControlRole
Dynamic role in session relationship:
- `Neutral` — Initial state, can transition either way
- `Controller` — Issues commands to peer
- `Controllee` — Executes commands from peer
Supports bidirectional transfer via `YieldControl`/`AcquireControl`.
### AttachmentState
Peer connection state: `Unattached` (waiting) or `Attached { peer }` (connected).
## Dispatch Functions
- **`dispatch_onboard_message`**: Routes `OnboardMessage` to per-session task
channels by session ID. Used by the Velo onboard handler.
- **`dispatch_session_message`**: Routes `SessionMessage` to per-session task
channels by session ID. Used by the Velo session handler.
## File Structure
```text
session/
├── mod.rs # Module declarations, dispatch functions, re-exports
├── blocks.rs # BlockHolder<T> — RAII block container
├── endpoint.rs # SessionEndpoint — state machine primitive
├── handle.rs # SessionHandle + SessionHandleStateTx
├── server_session.rs # ServerSession + ServerSessionHandle + BlockMetadataMap
├── staging.rs # Shared stage_g3_to_g2() function
├── state.rs # SessionPhase, ControlRole, AttachmentState
├── messages.rs # OnboardMessage, SessionMessage, BlockInfo, etc.
├── initiator.rs # InitiatorSession (multi-peer orchestrator)
├── responder.rs # ResponderSession (search + hold + stage)
└── transport.rs # MessageTransport (Velo + Local)
```
# Testing Module
Test infrastructure for the kvbm-engine crate. Core block and token utilities
are re-exported from `kvbm_logical::testing` and `kvbm_physical::testing`;
this module adds engine-specific helpers for transport, sessions, offload
pipelines, and multi-instance scenarios.
## Test Helpers
### TestManagerBuilder / TestRegistryBuilder
Create test block managers and registries with synthetic physical layouts.
`TestManagerBuilder` produces a `BlockManager<T>` backed by mock memory.
`TestRegistryBuilder` produces a `BlockRegistry` pre-populated with hashes.
Use `populate_manager_with_blocks` and `create_and_populate_manager` to
quickly set up managers with pre-allocated blocks for testing.
### MessengerPair
Creates a pair of connected Velo `Messenger` instances for transport
testing without a real network. Messages sent through one messenger are
received by the other, enabling end-to-end session testing in a single
process.
```rust,ignore
let (messenger_a, messenger_b) = create_messenger_pair_tcp().await?;
```
### TestSession
Helper for testing distributed session protocols. Sets up the full session
infrastructure (dispatch maps, transport, channels) for testing
`InitiatorSession` / `ResponderSession` / `ServerSession` (formerly `ControllableSession`) interactions.
### EventsPipelineFixture
Test fixture for the offload pipeline. Provides pre-configured pipeline
stages, event managers, and block managers for testing policy evaluation,
batching, and transfer execution in isolation.
### MultiInstancePopulator
Sets up multi-instance distributed test scenarios with multiple leaders,
workers, and block managers. Populates each instance with configurable
block patterns for testing cross-instance onboarding.
```rust,ignore
let populated = MultiInstancePopulator::builder()
.instance_count(3)
.blocks_per_instance(100)
.build()?
.populate()
.await?;
```
### Physical Test Utilities
`TestAgent` and `TestAgentBuilder` create mock `NixlAgent` instances for
testing `TransferManager` without real RDMA hardware. `TransferChecksums`
provides utilities for verifying transfer correctness.
### Token Block Helpers
The `token_blocks` module provides utilities for creating test blocks with
known token sequences, useful for verifying search and match operations.
## Writing a New Test
1. Choose the appropriate fixture for your test scope:
- Single-instance transfer → `TestManagerBuilder` + `TestAgent`
- Session protocol → `TestSession` + `MessengerPair`
- Offload pipeline → `EventsPipelineFixture`
- Multi-instance → `MultiInstancePopulator`
2. Build the fixture and populate with test data
3. Exercise the code under test
4. Assert on results and verify cleanup (blocks released, sessions closed)
# Worker Group Module
The worker group module provides abstractions for driving multiple workers
in parallel from a single leader.
## ParallelWorkers Trait
`ParallelWorkers` extends `WorkerTransfers + ObjectBlockOps` for cohorts of
workers. It adds:
- `export_metadata()` → `Vec<SerializedLayoutResponse>` (one per rank)
- `import_metadata(Vec<SerializedLayout>)` → `Vec<ImportMetadataResponse>`
- `worker_count()` → number of workers
- `workers()` → slice of underlying `Arc<dyn Worker>`
## SpmdParallelWorkers
`SpmdParallelWorkers` implements the SPMD (Single Program, Multiple Data)
execution model: the same operation is broadcast to every worker in parallel,
and results are aggregated.
### Fan-out Execution
Every `WorkerTransfers` method (local transfer, remote onboard, remote
offload) iterates over all workers and calls the same operation on each.
Workers execute in parallel – each resolves the shared logical layout handle
to its own physical layout.
### Rank-aware Routing
For `connect_remote`, each worker receives its rank-specific metadata slice.
Remote handle mappings are stored as `(InstanceId, worker_idx,
LogicalLayoutHandle) → LayoutHandle`, so `execute_remote_onboard_for_instance`
can look up the correct remote handle for each worker by rank.
### Event Aggregation
Transfer completion notifications from individual workers are aggregated into
a single `TransferCompleteNotification` via the event system. The aggregated
notification fires only when all workers have completed.
### ObjectBlockOps Aggregation
- `has_blocks`: queries all workers, returns results from worker 0 (all should
agree in SPMD semantics).
- `put_blocks` / `get_blocks`: executes on all workers in parallel. A key
succeeds only if **all** workers succeed for that key.
### Construction
```rust,ignore
let parallel = SpmdParallelWorkers::new(
workers, // Vec<Arc<dyn Worker>>, one per rank
event_manager, // Arc<EventManager> for aggregation
runtime_handle, // tokio::runtime::Handle for spawning
);
```
# Worker Module
The worker module defines execution primitives for data transfers between
storage tiers. Workers own the physical resources (transfer managers, layout
handles) needed to move blocks via RDMA, local copy, or object storage.
## Trait Hierarchy
```text
WorkerTransfers          Worker
  (execution)     (metadata + handles)
       │                    │
       └────────┬───────────┘
         ObjectBlockOps
          (G4 storage)
```
- **`WorkerTransfers`** – core execution trait. Provides `execute_local_transfer`,
`execute_remote_onboard`, `execute_remote_offload`, `connect_remote`, and
`execute_remote_onboard_for_instance`.
- **`Worker`** – extends `WorkerTransfers + ObjectBlockOps`. Adds layout handle
accessors (`g1_handle`, `g2_handle`, `g3_handle`) and metadata import/export.
## PhysicalWorker (aka DirectWorker)
`PhysicalWorker` is the fundamental single-worker implementation. It directly
owns a `TransferManager` and layout handles for executing data movement.
### Builder
```rust,ignore
let worker = PhysicalWorker::builder()
.manager(transfer_manager) // required
.g1_handle(g1) // optional – GPU tier
.g2_handle(g2) // optional – host tier
.g3_handle(g3) // optional – disk tier
.rank(0) // optional – for SPMD key prefixing
.object_client(s3_client) // optional – for G4 operations
.build()?;
```
| Field | Required | Purpose |
|-------|----------|---------|
| `manager` | yes | `TransferManager` for executing transfers |
| `g1_handle` | no | GPU/HBM layout handle |
| `g2_handle` | no | Host/pinned-DRAM layout handle |
| `g3_handle` | no | Disk/NVMe layout handle |
| `rank` | no | Worker rank for SPMD key prefixing |
| `object_client` | no | G4 object storage client |
`DirectWorker` is a compatibility alias for `PhysicalWorker`.
### Execution State vs Coordination State
PhysicalWorker maintains **execution state** – the handles and manager needed
to actually perform transfers. This is distinct from **coordination state**
which the leader tracks in `CoordinatedWorker`. When a leader wraps a
PhysicalWorker in a CoordinatedWorker, handles exist in both places
intentionally: PhysicalWorker needs them to call TransferManager, while
CoordinatedWorker provides a uniform API for both local and remote workers.
## CoordinatedWorker
`CoordinatedWorker` is the leader's view of a worker. It wraps any `Worker`
implementation and adds coordination state:
- Local layout handles (populated via `apply_layout_response`)
- Remote handle mappings for cross-leader RDMA transfers
- Worker rank and host instance tracking
This wrapper lets the leader use the same API regardless of whether the
underlying worker is local (`PhysicalWorker`) or remote (`VeloWorkerClient`).
## VeloWorkerClient / VeloWorkerService
The Velo (RPC) layer enables remote worker execution:
- **`VeloWorkerService`** – wraps a `PhysicalWorker` and exposes RPC handlers
for `execute_local_transfer`, `export_metadata`, `import_metadata`, etc.
- **`VeloWorkerClient`** – implements `WorkerTransfers` by sending RPC
requests to a remote `VeloWorkerService`.
Together they allow the leader to drive workers on remote nodes as if they
were local.
<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
<!-- SPDX-License-Identifier: Apache-2.0 -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>KVBM Bench Engine Results Viewer</title>
<style>
*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
:root{
--bg:#f9f9f7;--card:#fff;--border:#e0e0e0;--text:#1a1a1a;--text2:#555;
--green:#76B900;--green-dark:#4A7A00;--blue:#0074DF;--blue-dark:#1565C0;
--orange:#EF9100;--orange-dark:#C67800;--red:#E52020;--red-dark:#C62828;
--nav:#1a1a1a;--radius:16px;--font:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;
--mono:'SF Mono','Cascadia Code',Consolas,monospace;
}
html{scroll-behavior:smooth;scroll-padding-top:56px}
body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5}
/* Nav */
nav{position:sticky;top:0;z-index:100;background:var(--nav);display:flex;align-items:center;gap:4px;padding:0 16px;height:48px;overflow-x:auto;white-space:nowrap}
nav a{color:#ccc;text-decoration:none;font-size:13px;padding:6px 10px;border-radius:6px;transition:color .15s,background .15s}
nav a:hover{color:#fff;background:rgba(255,255,255,.08)}
nav a.active{color:#fff;background:rgba(118,185,0,.25);border-bottom:2px solid var(--green)}
nav .spacer{flex:1}
nav select{background:#333;color:#fff;border:1px solid #555;border-radius:6px;padding:4px 8px;font-size:13px;font-family:var(--font)}
nav .brand{color:#fff;font-weight:700;font-size:14px;margin-right:12px;letter-spacing:.3px}
/* Header / data loading */
.header{max-width:1200px;margin:0 auto;padding:24px 20px}
.header h1{font-size:22px;font-weight:700;margin-bottom:4px}
.header p{color:var(--text2);font-size:14px;margin-bottom:16px}
.drop-zone{border:2px dashed var(--border);border-radius:var(--radius);padding:40px 20px;text-align:center;transition:border-color .2s,background .2s;cursor:pointer}
.drop-zone.over{border-color:var(--green);background:rgba(118,185,0,.06)}
.drop-zone h3{font-size:16px;margin-bottom:8px}
.drop-zone p{color:var(--text2);font-size:13px}
.load-row{display:flex;gap:12px;margin-top:12px;align-items:center;flex-wrap:wrap}
.load-row input[type=file]{font-size:13px}
.load-row textarea{flex:1;min-width:200px;height:60px;font-family:var(--mono);font-size:12px;padding:8px;border:1px solid var(--border);border-radius:8px;resize:vertical}
.btn{display:inline-flex;align-items:center;gap:4px;padding:6px 14px;font-size:13px;font-family:var(--font);border:none;border-radius:8px;cursor:pointer;transition:background .15s}
.btn-green{background:var(--green);color:#fff}.btn-green:hover{background:#6aa600}
.btn-sm{padding:4px 10px;font-size:12px;border-radius:6px}
.loaded-bar{display:flex;align-items:center;gap:12px;padding:10px 16px;background:var(--card);border:1px solid var(--border);border-radius:10px}
.loaded-bar .fname{font-family:var(--mono);font-size:13px;color:var(--green)}
.loaded-bar .count{font-size:13px;color:var(--text2)}
/* Main */
main{max-width:1200px;margin:0 auto;padding:0 20px 40px;display:none}
section{margin-bottom:36px}
section h2{font-size:18px;font-weight:700;margin-bottom:4px}
section .desc{color:var(--text2);font-size:13px;margin-bottom:14px}
.card{background:var(--card);border:1px solid var(--border);border-radius:var(--radius);padding:20px;transition:box-shadow .2s}
.card:hover{box-shadow:0 4px 16px rgba(0,0,0,.06)}
/* KPI cards */
.kpi-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(220px,1fr));gap:14px}
.kpi{border-left:3px solid var(--green);padding:14px 16px}
.kpi.blue{border-left-color:var(--blue)}.kpi.amber{border-left-color:var(--orange)}
.kpi .label{font-size:12px;color:var(--text2);text-transform:uppercase;letter-spacing:.5px}
.kpi .value{font-size:26px;font-weight:700;margin:2px 0}
.kpi .sub{font-size:12px;color:var(--text2)}
/* Pills */
.pills{display:flex;flex-wrap:wrap;gap:6px;margin-bottom:12px}
.pill{padding:4px 12px;font-size:12px;border:1px solid var(--border);border-radius:20px;cursor:pointer;background:var(--card);transition:all .15s;font-family:var(--font)}
.pill.active{background:var(--green);color:#fff;border-color:var(--green)}
/* Chart container */
.chart-wrap{overflow-x:auto}
.chart-wrap svg{display:block}
.chart-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(520px,1fr));gap:16px}
/* Legend */
.legend{display:flex;flex-wrap:wrap;gap:10px;margin-top:8px;font-size:12px}
.legend-item{display:flex;align-items:center;gap:4px}
.legend-swatch{width:14px;height:4px;border-radius:2px}
/* Table */
.table-wrap{overflow-x:auto;max-height:500px;overflow-y:auto}
table{width:100%;border-collapse:collapse;font-size:13px}
th,td{padding:6px 10px;text-align:left;border-bottom:1px solid var(--border);white-space:nowrap}
th{position:sticky;top:0;background:var(--card);cursor:pointer;user-select:none;font-weight:600;font-size:12px;text-transform:uppercase;letter-spacing:.3px;color:var(--text2)}
th:hover{color:var(--text)}
th .sort-arrow{font-size:10px;margin-left:2px}
td{font-family:var(--mono);font-size:12px}
tr:hover td{background:rgba(118,185,0,.04)}
/* Tooltip */
#tooltip{position:fixed;z-index:200;pointer-events:none;background:rgba(26,26,26,.92);color:#fff;font-size:12px;font-family:var(--mono);padding:8px 12px;border-radius:8px;line-height:1.6;max-width:300px;display:none;backdrop-filter:blur(4px)}
/* Footer */
footer{text-align:center;padding:20px;font-size:12px;color:#999}
/* Responsive */
@media(max-width:600px){
.chart-grid{grid-template-columns:1fr}
.kpi-grid{grid-template-columns:repeat(auto-fill,minmax(160px,1fr))}
}
</style>
</head>
<body>
<nav id="nav">
<span class="brand">KVBM Bench</span>
<a href="#summary" data-sec="summary">Summary</a>
<a href="#concurrency" data-sec="concurrency">Concurrency</a>
<a href="#pagesize" data-sec="pagesize">Page Size</a>
<a href="#bounce" data-sec="bounce">Bounce</a>
<a href="#g2g3" data-sec="g2g3">G2/G3</a>
<a href="#gds" data-sec="gds">GDS vs Staged</a>
<a href="#bidir" data-sec="bidir">Bidirectional</a>
<a href="#latency" data-sec="latency">Latency</a>
<a href="#rawdata" data-sec="rawdata">Data</a>
<span class="spacer"></span>
<select id="gpuFilter"><option value="all">All GPUs</option></select>
</nav>
<div class="header" id="headerZone">
<h1>KVBM Benchmark Results Viewer</h1>
<p>Load bench_engine JSON Lines output to visualize performance characteristics</p>
<div class="drop-zone" id="dropZone">
<h3>Drop .jsonl file here</h3>
<p>or use the controls below</p>
</div>
<div class="load-row">
<input type="file" id="filePicker" accept=".jsonl,.json,.txt">
<textarea id="pasteArea" placeholder="Paste JSON Lines here..."></textarea>
<button class="btn btn-green" id="pasteBtn">Load</button>
</div>
<div class="loaded-bar" id="loadedBar" style="display:none">
<span class="fname" id="loadedName">file.jsonl</span>
<span class="count" id="loadedCount">0 records</span>
<span class="spacer" style="flex:1"></span>
<button class="btn btn-sm btn-green" id="reloadBtn">Load Different</button>
</div>
</div>
<main id="main">
<section id="summary"><h2>Summary</h2><p class="desc">Key performance indicators across all tests</p><div class="kpi-grid" id="kpiGrid"></div></section>
<section id="concurrency"><h2>Concurrency Scaling</h2><p class="desc">Bandwidth vs concurrency &mdash; find the saturation point for each transfer type</p><div class="pills" id="concPills"></div><div class="chart-grid" id="concCharts"></div></section>
<section id="pagesize"><h2>Page Size Efficiency</h2><p class="desc">Bandwidth vs page_size &mdash; block size amortization</p><div class="pills" id="psPills"></div><div class="chart-wrap" id="psChart"></div></section>
<section id="bounce"><h2>Bounce Buffer Analysis</h2><p class="desc">Bandwidth vs bounce_blocks &mdash; double-buffering effectiveness</p><div class="chart-wrap" id="bounceChart"></div></section>
<section id="g2g3"><h2>G2/G3 Raw Bandwidth</h2><p class="desc">NVMe read vs write bandwidth at each concurrency level</p><div class="chart-wrap" id="g2g3Chart"></div></section>
<section id="gds"><h2>GDS vs Staged</h2><p class="desc">GPUDirect Storage bypass compared to best staged transfer</p><div class="chart-wrap" id="gdsChart"></div></section>
<section id="bidir"><h2>Bidirectional Contention</h2><p class="desc">Isolated vs contended bandwidth to measure contention degradation</p><div class="chart-wrap" id="bidirChart"></div></section>
<section id="latency"><h2>Latency Distribution</h2><p class="desc">Horizontal box plots showing min / p50 / p95 / p99 / max per test</p><div class="chart-wrap" id="latChart"></div></section>
<section id="rawdata"><h2>Raw Data</h2><p class="desc">All loaded records &mdash; click column headers to sort</p><div class="card"><div class="table-wrap" id="tableWrap"></div></div></section>
</main>
<div id="tooltip"></div>
<footer>KVBM Bench Viewer &mdash; NVIDIA CORPORATION &amp; AFFILIATES</footer>
<script>
"use strict";
/* ── Color map ── */
/* Fixed color for each known test name. */
const COLOR={
  g1_to_g2:"#76B900",g2_to_g1:"#4A7A00",g2_to_g3:"#0074DF",g3_to_g2:"#1565C0",
  g1_to_g3_staged:"#EF9100",g3_to_g1_staged:"#C67800",
  g1_to_g3_gds:"#E52020",g3_to_g1_gds:"#C62828",
  bidir_g1_to_g2:"#76B900",bidir_g2_to_g1:"#4A7A00"
};
/* Tests rendered with dashed strokes (they reuse the colors of their isolated counterparts). */
const DASHED=new Set(["bidir_g1_to_g2","bidir_g2_to_g1"]);
/**
 * Returns the color for test name `t`, or neutral grey "#888" when unknown.
 * Only COLOR's own keys are consulted: test names come from loaded data, and a
 * name such as "constructor" or "toString" must not resolve to an inherited
 * Object.prototype member.
 */
function testColor(t){
  return Object.prototype.hasOwnProperty.call(COLOR,t)?COLOR[t]:"#888";
}
/* ── State ── */
/* Every record from the most recently loaded input (assigned in loadData). */
let DATA=[];
/* Subset of DATA selected by the GPU filter (assigned in applyFilter); the render functions read this. */
let FILTERED=[];
/* Sort column and direction; not referenced in this part of the file — presumably used by the raw-data table (confirm in renderTable). */
let sortCol=null,sortAsc=true;
/* ── DOM helpers ── */
/* First element matching the CSS selector, or null when nothing matches. */
const $=selector=>document.querySelector(selector);
/* Every element matching the CSS selector, copied into a plain array. */
const $$=selector=>Array.from(document.querySelectorAll(selector));
/**
 * Builds an HTML element.
 *  - attrs (optional): a "style" entry holding an object is merged into
 *    node.style; keys starting with "on" are registered as event listeners
 *    (the "on" prefix is stripped); every other entry becomes an attribute.
 *  - children (optional): a string sets textContent; an array appends each
 *    truthy item (strings become text nodes); anything else is appended as a
 *    single node.
 */
function el(tag,attrs,children){
  const node=document.createElement(tag);
  if(attrs){
    Object.entries(attrs).forEach(([name,value])=>{
      if(name==="style"&&typeof value==="object"){
        Object.assign(node.style,value);
      }else if(name.startsWith("on")){
        node.addEventListener(name.slice(2),value);
      }else{
        node.setAttribute(name,value);
      }
    });
  }
  if(!children)return node;
  if(typeof children==="string"){
    node.textContent=children;
  }else if(Array.isArray(children)){
    for(const child of children){
      if(!child)continue;
      node.appendChild(typeof child==="string"?document.createTextNode(child):child);
    }
  }else{
    node.appendChild(children);
  }
  return node;
}
/* ── SVG mini-lib ── */
const SVG_NS="http://www.w3.org/2000/svg";
/* Creates an element in the SVG namespace and sets every entry of attrs on it as an attribute. */
function svgEl(tag,attrs){
  const node=document.createElementNS(SVG_NS,tag);
  if(attrs){
    Object.keys(attrs).forEach(name=>node.setAttribute(name,attrs[name]));
  }
  return node;
}
/* Creates a root <svg> of w×h pixels with a matching viewBox; overflow stays visible so content may extend past the box. */
function createSVG(w,h){
  const viewBox="0 0 "+w+" "+h;
  const root=svgEl("svg",{width:w,height:h,viewBox});
  root.style.overflow="visible";
  return root;
}
/**
 * Linear scale mapping domain [d0,d1] onto range [r0,r1].
 * The returned function maps a value to a range position and carries:
 *  - domain / range: the arrays passed in
 *  - ticks(n=6): "nice" tick values within the domain (just [d0] when the domain is a single point)
 *  - invert(px): range position back to a domain value
 * A single-point domain uses slope 0, so every input maps to r0.
 */
function scaleLinear(domain,range){
  const d0=domain[0],d1=domain[1];
  const r0=range[0],r1=range[1];
  const slope=d1===d0?0:(r1-r0)/(d1-d0);
  const fn=v=>r0+slope*(v-d0);
  fn.domain=domain;
  fn.range=range;
  fn.ticks=function(n=6){
    if(d0===d1)return[d0];
    const step=niceStep((d1-d0)/n);
    /* small tolerance so accumulated float error does not drop the last tick */
    const limit=d1+step*0.001;
    const out=[];
    let v=Math.ceil(d0/step)*step;
    while(v<=limit){
      out.push(+v.toPrecision(12));
      v+=step;
    }
    return out;
  };
  fn.invert=function(px){
    if(d0===d1)return d0;
    return d0+(px-r0)/slope;
  };
  return fn;
}
/* Rounds a raw step size up to the nearest 1, 2, 5 or 10 times a power of ten. */
function niceStep(raw){
  const mag=Math.pow(10,Math.floor(Math.log10(raw)));
  const res=raw/mag;
  let mult=10;
  if(res<=1)mult=1;
  else if(res<=2)mult=2;
  else if(res<=5)mult=5;
  return mult*mag;
}
/* Margins for charts */
/* Pixel padding between the SVG edge and the plot area; the chart renderers derive their scale ranges from it. */
const M={top:28,right:20,bottom:44,left:60};
/**
 * Appends a horizontal axis to svg: a baseline at height y spanning
 * scale.range, a tick mark and label for each of scale.ticks(7), and an
 * optional centred caption. Tick values are formatted with fmt when given,
 * otherwise with fmtNum.
 */
function drawXAxis(svg,scale,y,label,fmt){
  const tickValues=scale.ticks(7);
  const [left,right]=scale.range;
  const axis=svgEl("g");
  axis.appendChild(svgEl("line",{x1:left,x2:right,y1:y,y2:y,stroke:"#ddd","stroke-width":1}));
  for(const value of tickValues){
    const px=scale(value);
    axis.appendChild(svgEl("line",{x1:px,x2:px,y1:y,y2:y+5,stroke:"#bbb","stroke-width":1}));
    const tickText=svgEl("text",{x:px,y:y+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
    tickText.textContent=fmt?fmt(value):fmtNum(value);
    axis.appendChild(tickText);
  }
  if(label){
    const caption=svgEl("text",{x:(left+right)/2,y:y+36,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"});
    caption.textContent=label;
    axis.appendChild(caption);
  }
  svg.appendChild(axis);
}
/**
 * Appends a vertical axis to svg at horizontal position x: an axis line
 * spanning scale.range, a tick mark and fmtNum label for each of
 * scale.ticks(6), and an optional caption rotated -90 degrees.
 *
 * gridX2 (optional): when given, a light grid line is drawn from x to gridX2
 * at every tick. The earlier implementation always drew a grid line whose end
 * was `scale.range[0]+400` — a y-pixel coordinate used as an x position — so
 * it ran past (or stopped short of) the plot area. The chart renderers in
 * this file already draw correctly sized grid lines with drawGridY, so the
 * default is now to draw none here.
 */
function drawYAxis(svg,scale,x,label,gridX2){
  const ticks=scale.ticks(6);
  const g=svgEl("g");
  g.appendChild(svgEl("line",{x1:x,x2:x,y1:scale.range[0],y2:scale.range[1],stroke:"#ddd","stroke-width":1}));
  ticks.forEach(t=>{
    const y=scale(t);
    g.appendChild(svgEl("line",{x1:x-5,x2:x,y1:y,y2:y,stroke:"#bbb","stroke-width":1}));
    if(gridX2!=null){
      g.appendChild(svgEl("line",{x1:x,x2:gridX2,y1:y,y2:y,stroke:"#f0f0f0","stroke-width":1}));
    }
    const txt=svgEl("text",{x:x-8,y:y+4,"text-anchor":"end",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
    txt.textContent=fmtNum(t);
    g.appendChild(txt);
  });
  if(label){
    const mid=(scale.range[0]+scale.range[1])/2;
    const lbl=svgEl("text",{x:0,y:0,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)",transform:`translate(${x-44},${mid}) rotate(-90)`});
    lbl.textContent=label;
    g.appendChild(lbl);
  }
  svg.appendChild(g);
}
/* Appends one light horizontal grid line from x0 to x1 for each of scaleY.ticks(6). */
function drawGridY(svg,scaleY,x0,x1){
  for(const value of scaleY.ticks(6)){
    const py=scaleY(value);
    svg.appendChild(svgEl("line",{x1:x0,x2:x1,y1:py,y2:py,stroke:"#f0f0f0","stroke-width":1}));
  }
}
/**
 * Draws a polyline through points plus a hoverable marker on each point.
 * Each point is [x, y, tooltipHtml]; hovering a marker shows its tooltip.
 * Does nothing for an empty point list. dashed selects a dashed stroke.
 */
function drawLine(svg,points,color,dashed){
  if(!points.length)return;
  const d=points.map((pt,idx)=>`${idx===0?"M":"L"}${pt[0]},${pt[1]}`).join(" ");
  const pathAttrs={
    d,
    fill:"none",
    stroke:color,
    "stroke-width":2,
    "stroke-linejoin":"round",
    "stroke-linecap":"round",
    ...(dashed?{"stroke-dasharray":"6,4"}:{})
  };
  svg.appendChild(svgEl("path",pathAttrs));
  for(const pt of points){
    const tip=pt[2];
    const dot=svgEl("circle",{cx:pt[0],cy:pt[1],r:4,fill:color,stroke:"#fff","stroke-width":1.5,style:"cursor:pointer"});
    dot._tip=tip;
    dot.addEventListener("mouseenter",evt=>showTip(evt,tip));
    dot.addEventListener("mouseleave",hideTip);
    svg.appendChild(dot);
  }
}
/* Tooltip */
const tooltip=document.getElementById("tooltip");
/**
 * Shows the tooltip with the given markup next to the pointer of event e.
 *
 * The markup is assembled by callers from values found in the loaded file
 * (test names, concurrency, page sizes, ...), so it is not trusted: it is
 * parsed inside an inert <template>, reduced to a small set of formatting
 * tags with every attribute removed, and only then attached. Any other
 * element is replaced by its text content.
 */
function showTip(e,html){
  const allowed=new Set(["B","BR","I","EM","STRONG","SPAN","SMALL","U"]);
  const tpl=document.createElement("template");
  tpl.innerHTML=html;
  tpl.content.querySelectorAll("*").forEach(node=>{
    if(allowed.has(node.tagName)){
      for(const attr of [...node.attributes])node.removeAttribute(attr.name);
    }else{
      node.replaceWith(document.createTextNode(node.textContent));
    }
  });
  tooltip.textContent="";
  tooltip.appendChild(tpl.content);
  tooltip.style.display="block";
  positionTip(e);
}
/* Places the tooltip above-right of the pointer, flipping left or below when it would leave the viewport. */
function positionTip(e){
  const tw=tooltip.offsetWidth,th=tooltip.offsetHeight;
  let x=e.clientX+12,y=e.clientY-th-8;
  if(x+tw>window.innerWidth)x=e.clientX-tw-12;
  if(y<0)y=e.clientY+16;
  tooltip.style.left=x+"px";
  tooltip.style.top=y+"px";
}
/* Hides the tooltip. */
function hideTip(){tooltip.style.display="none"}
/* Keep a visible tooltip following the pointer. */
document.addEventListener("mousemove",e=>{if(tooltip.style.display==="block")positionTip(e)});
/**
 * Formats an axis tick value: locale grouping with at most one decimal for
 * values >= 1000, "0" for zero, exponent notation for magnitudes below 0.01,
 * otherwise up to two decimals with trailing zeros dropped.
 */
function fmtNum(v){
  if(v>=1000){
    return v.toLocaleString(undefined,{maximumFractionDigits:1});
  }
  if(v===0)return"0";
  return Math.abs(v)<0.01?v.toExponential(1):String(+v.toFixed(2));
}
/* Bandwidth with two decimals and a " GB/s" suffix; "n/a" when v is not a finite number (e.g. the metric is missing from a record). */
function fmtBW(v){
  return Number.isFinite(v)?v.toFixed(2)+" GB/s":"n/a";
}
/* Latency with one decimal and a " us" suffix; "n/a" when v is not a finite number. */
function fmtUs(v){
  return Number.isFinite(v)?v.toFixed(1)+" us":"n/a";
}
/* Human-readable test name: underscores become spaces. Non-string input is stringified first instead of throwing. */
function fmtLabel(t){
  return String(t).replace(/_/g," ");
}
/* ── Data loading ── */
/**
 * Parses JSON Lines text into an array of record objects.
 * Blank lines and lines that are not valid JSON are skipped. Lines that parse
 * to something other than a plain object (null, numbers, strings, arrays) are
 * skipped too: the rest of the viewer reads properties off every record, and
 * a null entry would throw there.
 */
function parseJSONL(text){
  const out=[];
  for(const line of text.split("\n")){
    const s=line.trim();
    if(!s)continue;
    let rec;
    try{
      rec=JSON.parse(s);
    }catch(e){
      continue; /* not JSON — skip the line */
    }
    if(rec!==null&&typeof rec==="object"&&!Array.isArray(rec))out.push(rec);
  }
  return out;
}
/**
 * Installs records as the active data set, rebuilds the GPU filter, renders
 * every section, then swaps the loader UI (drop zone and load row) for the
 * "loaded" bar showing the source name and record count.
 */
function loadData(records,name){
  DATA=records;
  populateGPUFilter();
  applyFilter();
  const setDisplay=(selector,value)=>{$(selector).style.display=value};
  setDisplay("#main","block");
  /* collapse loader */
  setDisplay("#dropZone","none");
  setDisplay(".load-row","none");
  setDisplay("#loadedBar","flex");
  $("#loadedName").textContent=name||"pasted data";
  $("#loadedCount").textContent=`${records.length} records`;
}
/* Drop zone */
const dz=$("#dropZone");
/* Highlight the zone while a drag hovers over it; preventDefault marks it as a valid drop target. */
dz.addEventListener("dragover",e=>{e.preventDefault();dz.classList.add("over")});
dz.addEventListener("dragleave",()=>dz.classList.remove("over"));
dz.addEventListener("drop",e=>{
  e.preventDefault();
  dz.classList.remove("over");
  const f=e.dataTransfer.files[0];
  if(f)readFile(f);
});
/* Clicking the zone opens the native file picker. */
dz.addEventListener("click",()=>$("#filePicker").click());
$("#filePicker").addEventListener("change",e=>{
  const f=e.target.files[0];
  if(f)readFile(f);
  /* Clear the selection so choosing the same file again (after "Load
     Different") still fires "change"; the File object stays readable. */
  e.target.value="";
});
/* Reads file f as text and loads its JSON Lines content under the file's name. */
function readFile(f){
  const r=new FileReader();
  r.onload=()=>loadData(parseJSONL(r.result),f.name);
  /* Surface read failures instead of silently doing nothing. */
  r.onerror=()=>console.error("Failed to read "+f.name,r.error);
  r.readAsText(f);
}
/* Load whatever was pasted into the textarea, if anything. */
$("#pasteBtn").addEventListener("click",()=>{
  const txt=$("#pasteArea").value;
  if(txt.trim())loadData(parseJSONL(txt),"pasted");
});
/* "Load Different": show the loader controls again and hide the loaded bar. */
$("#reloadBtn").addEventListener("click",()=>{
  $("#dropZone").style.display="";
  $(".load-row").style.display="";
  $("#loadedBar").style.display="none";
});
/* GPU filter */
/* Rebuilds the GPU <select>: "All GPUs" followed by one option per distinct device_id in DATA, in ascending order. */
function populateGPUFilter(){
  const select=$("#gpuFilter");
  select.innerHTML='<option value="all">All GPUs</option>';
  const distinct=new Set(DATA.map(rec=>rec.device_id));
  const ids=Array.from(distinct).sort((a,b)=>a-b);
  for(const id of ids){
    const option=document.createElement("option");
    option.value=id;
    option.textContent="GPU "+id;
    select.appendChild(option);
  }
}
/* Re-filter and re-render whenever the GPU selection changes. */
$("#gpuFilter").addEventListener("change",()=>applyFilter());
/* Recomputes FILTERED from DATA for the selected GPU ("all" keeps every record), then re-renders everything. */
function applyFilter(){
  const choice=$("#gpuFilter").value;
  if(choice==="all"){
    FILTERED=DATA.slice();
  }else{
    /* option values are strings, so compare against the stringified id */
    FILTERED=DATA.filter(rec=>String(rec.device_id)===choice);
  }
  renderAll();
}
/* ── Render orchestrator ── */
/* Re-renders every section from FILTERED in page order, then runs hideEmptySections. */
function renderAll(){
  const steps=[
    renderKPIs,
    renderConcurrency,
    renderPageSize,
    renderBounce,
    renderG2G3,
    renderGDS,
    renderBidir,
    renderLatency,
    renderTable,
    hideEmptySections
  ];
  for(const step of steps)step();
}
/* ── Helpers ── */
/**
 * Distinct values of d[key] across arr, sorted numerically ascending.
 * Records where the key is missing or null are ignored: callers feed the
 * result to Math.min/Math.max to build axis domains, and a single undefined
 * would turn the whole domain into NaN.
 */
function unique(arr,key){
  const seen=new Set();
  for(const d of arr){
    const v=d[key];
    if(v!==undefined&&v!==null)seen.add(v);
  }
  return[...seen].sort((a,b)=>a-b);
}
/**
 * Groups arr by d[key]; returns a map from key (stringified) to the records
 * having it, in input order. The map has no prototype, so keys taken from
 * loaded data such as "constructor" or "__proto__" behave like any other key
 * instead of colliding with Object.prototype members.
 */
function groupBy(arr,key){
  const m=Object.create(null);
  arr.forEach(d=>{
    const k=d[key];
    (m[k]=m[k]||[]).push(d);
  });
  return m;
}
/* ── Section: KPIs ── */
/**
 * Fills #kpiGrid with summary cards computed from FILTERED: peak
 * single-stream and aggregate bandwidth, number of distinct tests and GPUs,
 * record count and average latency. Leaves the grid empty when there is no
 * data.
 *
 * Average latency is taken over the records that actually carry a numeric
 * latency_us.mean_us. Previously records without latency were counted as
 * 0 us, which pulled the average down; with no latency data at all the card
 * shows "n/a".
 */
function renderKPIs(){
  const g=$("#kpiGrid");
  g.innerHTML="";
  if(!FILTERED.length)return;
  /* reduce instead of Math.max(...array) so large data sets cannot hit the argument-count limit */
  const peak=key=>FILTERED.reduce((best,d)=>Math.max(best,d[key]||0),-Infinity);
  const maxBW=peak("bandwidth_gbs");
  const maxAgg=peak("aggregate_bandwidth_gbs");
  const tests=[...new Set(FILTERED.map(d=>d.test))];
  const devices=[...new Set(FILTERED.map(d=>d.device_id))];
  const lats=FILTERED.map(d=>d.latency_us?.mean_us).filter(v=>Number.isFinite(v));
  const avgLat=lats.length?lats.reduce((s,v)=>s+v,0)/lats.length:null;
  const kpis=[
    {label:"Peak Single-Stream BW",value:fmtBW(maxBW),sub:FILTERED.find(d=>d.bandwidth_gbs===maxBW)?.test,cls:""},
    {label:"Peak Aggregate BW",value:fmtBW(maxAgg),sub:"across all streams",cls:""},
    {label:"Tests",value:tests.length,sub:tests.slice(0,3).join(", ")+(tests.length>3?" ...":""),cls:"blue"},
    {label:"GPUs",value:devices.length,sub:"device IDs: "+devices.join(", "),cls:"blue"},
    {label:"Total Records",value:FILTERED.length,sub:"loaded",cls:"amber"},
    {label:"Avg Latency",value:avgLat===null?"n/a":fmtUs(avgLat),sub:"mean across all tests",cls:"amber"},
  ];
  kpis.forEach(k=>{
    const d=el("div",{class:"card kpi"+(k.cls?" "+k.cls:"")});
    d.appendChild(el("div",{class:"label"},k.label));
    d.appendChild(el("div",{class:"value"},String(k.value)));
    d.appendChild(el("div",{class:"sub"},k.sub||""));
    g.appendChild(d);
  });
}
/* ── Section: Concurrency Scaling ── */
/* page_size selected through the pills; null means "all page sizes". */
let concPageFilter=null;
/**
 * Line chart of bandwidth vs concurrency, one line per test, drawn into
 * #concCharts. When more than one page_size is present, pills in #concPills
 * toggle a page_size filter. The y value is aggregate_bandwidth_gbs when
 * present, else bandwidth_gbs, else 0.
 *
 * Fixes over the earlier version:
 *  - a selected page_size that no longer exists in the data (new file loaded,
 *    GPU filter changed) is reset; it used to leave the chart empty with no
 *    pill available to clear it;
 *  - records with no bandwidth field plot at 0 instead of yielding NaN
 *    coordinates and a tooltip formatting error.
 */
function renderConcurrency(){
  const wrap=$("#concCharts");wrap.innerHTML="";
  const pillsEl=$("#concPills");pillsEl.innerHTML="";
  const pages=unique(FILTERED,"page_size");
  if(concPageFilter!==null&&!pages.includes(concPageFilter))concPageFilter=null;
  if(pages.length>1){
    pages.forEach(ps=>{
      const toggle=()=>{
        concPageFilter=concPageFilter===ps?null:ps;
        renderConcurrency();
      };
      pillsEl.appendChild(el("button",{class:"pill"+(concPageFilter===ps?" active":""),onclick:toggle},String(ps)));
    });
  }
  let data=FILTERED;
  if(concPageFilter!==null)data=data.filter(d=>d.page_size===concPageFilter);
  const byTest=groupBy(data,"test");
  const tests=Object.keys(byTest);
  if(!tests.length)return;
  const bwOf=d=>d.aggregate_bandwidth_gbs||d.bandwidth_gbs||0;
  const W=540,H=300;
  const svg=createSVG(W,H);
  const xVals=unique(data,"concurrency");
  const allBW=data.map(bwOf);
  const xS=scaleLinear([Math.min(...xVals),Math.max(...xVals)],[M.left,W-M.right]);
  const yS=scaleLinear([0,Math.max(...allBW)*1.1],[H-M.bottom,M.top]);
  drawGridY(svg,yS,M.left,W-M.right);
  drawXAxis(svg,xS,H-M.bottom,"Concurrency");
  drawYAxis(svg,yS,M.left,"BW (GB/s)");
  tests.forEach(t=>{
    const pts=byTest[t].slice().sort((a,b)=>a.concurrency-b.concurrency);
    const mapped=pts.map(d=>{
      const bw=bwOf(d);
      return[xS(d.concurrency),yS(bw),
        `<b>${fmtLabel(t)}</b><br>conc: ${d.concurrency}<br>BW: ${fmtBW(bw)}<br>page: ${d.page_size}`];
    });
    drawLine(svg,mapped,testColor(t),DASHED.has(t));
  });
  const card=el("div",{class:"card"});
  card.appendChild(svg);
  card.appendChild(makeLegend(tests));
  wrap.appendChild(card);
}
/**
 * Builds a legend element with one swatch + label per test name.
 * Solid tests get a filled 4px swatch; tests in DASHED get a transparent
 * 2px swatch with a dashed top border in the test color.
 * (The style object used to list `background` twice; only the second value
 * ever took effect, so the dead first entry is gone.)
 */
function makeLegend(tests){
  const lg=el("div",{class:"legend"});
  tests.forEach(t=>{
    const color=testColor(t);
    const dashed=DASHED.has(t);
    const sw=el("span",{class:"legend-swatch",style:{
      width:"14px",
      height:dashed?"2px":"4px",
      borderTop:dashed?`2px dashed ${color}`:"none",
      background:dashed?"transparent":color
    }});
    const item=el("div",{class:"legend-item"});
    item.appendChild(sw);
    item.appendChild(el("span",{},fmtLabel(t)));
    lg.appendChild(item);
  });
  return lg;
}
/* ── Section: Page Size ── */
/* Concurrency selected through the pills; null means "no concurrency filter". */
let psConcFilter=null;
/**
 * Grouped bar chart of bandwidth_gbs into #psChart: one group per page_size,
 * one bar per test. When more than one concurrency level is present, pills in
 * #psPills toggle a concurrency filter. For each (test, page_size) the first
 * matching record is shown.
 *
 * A selected concurrency that no longer exists in the data (new file loaded,
 * GPU filter changed) is reset; it used to leave the chart empty with no pill
 * available to clear it.
 */
function renderPageSize(){
  const wrap=$("#psChart");wrap.innerHTML="";
  const pillsEl=$("#psPills");pillsEl.innerHTML="";
  const concs=unique(FILTERED,"concurrency");
  if(psConcFilter!==null&&!concs.includes(psConcFilter))psConcFilter=null;
  if(concs.length>1){
    concs.forEach(c=>{
      const toggle=()=>{
        psConcFilter=psConcFilter===c?null:c;
        renderPageSize();
      };
      pillsEl.appendChild(el("button",{class:"pill"+(psConcFilter===c?" active":""),onclick:toggle},String(c)));
    });
  }
  let data=FILTERED;
  if(psConcFilter!==null)data=data.filter(d=>d.concurrency===psConcFilter);
  const byTest=groupBy(data,"test");
  const tests=Object.keys(byTest);
  const pages=unique(data,"page_size");
  if(!tests.length||!pages.length)return;
  const barW=20,groupGap=24,innerGap=2;
  const groupWidth=tests.length*barW+(tests.length-1)*innerGap;
  const W=M.left+M.right+pages.length*(groupWidth+groupGap);
  const H=320;
  /* never narrower than 400px, even with few groups */
  const svgW=Math.max(W,400);
  const svg=createSVG(svgW,H);
  const allBW=data.map(d=>d.bandwidth_gbs||0);
  const yS=scaleLinear([0,Math.max(...allBW)*1.15],[H-M.bottom,M.top]);
  drawGridY(svg,yS,M.left,svgW-M.right);
  drawYAxis(svg,yS,M.left,"BW (GB/s)");
  pages.forEach((ps,pi)=>{
    const gx=M.left+pi*(groupWidth+groupGap)+groupGap/2;
    /* x-label */
    const lbl=svgEl("text",{x:gx+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
    lbl.textContent=ps;
    svg.appendChild(lbl);
    tests.forEach((t,ti)=>{
      const rec=byTest[t]?.find(d=>d.page_size===ps);
      if(!rec)return;
      const bw=rec.bandwidth_gbs||0;
      const bx=gx+ti*(barW+innerGap);
      const by=yS(bw);
      const bh=yS(0)-by;
      /* at least 1px tall so zero-bandwidth bars stay visible and hoverable */
      const rect=svgEl("rect",{x:bx,y:by,width:barW,height:Math.max(bh,1),rx:3,fill:testColor(t),style:"cursor:pointer"});
      rect.addEventListener("mouseenter",e=>showTip(e,`<b>${fmtLabel(t)}</b><br>page_size: ${ps}<br>BW: ${fmtBW(bw)}`));
      rect.addEventListener("mouseleave",hideTip);
      svg.appendChild(rect);
    });
  });
  /* x-axis label */
  const xLbl=svgEl("text",{x:(M.left+svgW-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"});
  xLbl.textContent="page_size";
  svg.appendChild(xLbl);
  const card=el("div",{class:"card"});
  card.appendChild(svg);
  card.appendChild(makeLegend(tests));
  wrap.appendChild(card);
}
/* ── Section: Bounce Buffer ── */
/**
 * Line chart of bandwidth_gbs vs bounce_blocks into #bounceChart, one line
 * per test, using only records that carry a bounce_blocks value. Draws
 * nothing when there are none.
 *
 * A record with no bandwidth_gbs plots at 0, matching how the y scale is
 * computed; it used to produce a NaN coordinate and a tooltip formatting
 * error.
 */
function renderBounce(){
  const wrap=$("#bounceChart");wrap.innerHTML="";
  const data=FILTERED.filter(d=>d.bounce_blocks!=null);
  if(!data.length)return;
  const byTest=groupBy(data,"test");
  const tests=Object.keys(byTest);
  const W=540,H=300;
  const svg=createSVG(W,H);
  const xVals=unique(data,"bounce_blocks");
  const allBW=data.map(d=>d.bandwidth_gbs||0);
  const xS=scaleLinear([Math.min(...xVals),Math.max(...xVals)],[M.left,W-M.right]);
  const yS=scaleLinear([0,Math.max(...allBW)*1.1],[H-M.bottom,M.top]);
  drawGridY(svg,yS,M.left,W-M.right);
  drawXAxis(svg,xS,H-M.bottom,"bounce_blocks");
  drawYAxis(svg,yS,M.left,"BW (GB/s)");
  tests.forEach(t=>{
    const pts=byTest[t].slice().sort((a,b)=>a.bounce_blocks-b.bounce_blocks);
    const mapped=pts.map(d=>{
      const bw=d.bandwidth_gbs||0;
      return[xS(d.bounce_blocks),yS(bw),
        `<b>${fmtLabel(t)}</b><br>bounce: ${d.bounce_blocks}<br>BW: ${fmtBW(bw)}`];
    });
    drawLine(svg,mapped,testColor(t),DASHED.has(t));
  });
  const card=el("div",{class:"card"});
  card.appendChild(svg);
  card.appendChild(makeLegend(tests));
  wrap.appendChild(card);
}
/* ── Section: G2/G3 ── */
/**
 * Grouped bar chart into #g2g3Chart comparing the "g2_to_g3" and "g3_to_g2"
 * tests: one group per concurrency level, one bar per test, bar height =
 * bandwidth_gbs of the first matching record. Draws nothing when neither
 * test appears in FILTERED.
 */
function renderG2G3(){
  const host=$("#g2g3Chart");
  host.innerHTML="";
  const tests=["g2_to_g3","g3_to_g2"];
  const rows=FILTERED.filter(rec=>tests.indexOf(rec.test)!==-1);
  if(rows.length===0)return;
  const levels=unique(rows,"concurrency");
  const grouped=groupBy(rows,"test");
  const barW=24,innerGap=4,groupGap=30;
  const groupWidth=tests.length*barW+(tests.length-1)*innerGap;
  const H=320;
  /* never narrower than 400px */
  const width=Math.max(M.left+M.right+levels.length*(groupWidth+groupGap),400);
  const svg=createSVG(width,H);
  const peak=Math.max(...rows.map(rec=>rec.bandwidth_gbs||0));
  const yS=scaleLinear([0,peak*1.15],[H-M.bottom,M.top]);
  drawGridY(svg,yS,M.left,width-M.right);
  drawYAxis(svg,yS,M.left,"BW (GB/s)");
  levels.forEach((level,gi)=>{
    const groupX=M.left+gi*(groupWidth+groupGap)+groupGap/2;
    const tick=svgEl("text",{x:groupX+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
    tick.textContent=level;
    svg.appendChild(tick);
    tests.forEach((name,bi)=>{
      const series=grouped[name]||[];
      const rec=series.find(r=>r.concurrency===level);
      if(!rec)return;
      const bw=rec.bandwidth_gbs||0;
      const barTop=yS(bw);
      /* at least 1px tall so zero-bandwidth bars stay hoverable */
      const bar=svgEl("rect",{x:groupX+bi*(barW+innerGap),y:barTop,width:barW,height:Math.max(yS(0)-barTop,1),rx:3,fill:testColor(name),style:"cursor:pointer"});
      const tipHtml=()=>`<b>${fmtLabel(name)}</b><br>conc: ${level}<br>BW: ${fmtBW(bw)}`;
      bar.addEventListener("mouseenter",evt=>showTip(evt,tipHtml()));
      bar.addEventListener("mouseleave",hideTip);
      svg.appendChild(bar);
    });
  });
  const caption=svgEl("text",{x:(M.left+width-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"});
  caption.textContent="Concurrency";
  svg.appendChild(caption);
  const card=el("div",{class:"card"});
  card.appendChild(svg);
  card.appendChild(makeLegend(tests));
  host.appendChild(card);
}
/* ── Section: GDS vs Staged ── */
/**
 * Grouped bar chart into #gdsChart comparing the *_gds tests with the
 * *_staged tests: one group per concurrency level, one bar per test that has
 * data. Draws nothing unless at least one GDS record is present.
 */
function renderGDS(){
  const host=$("#gdsChart");
  host.innerHTML="";
  const gdsTests=["g1_to_g3_gds","g3_to_g1_gds"];
  const stagedTests=["g1_to_g3_staged","g3_to_g1_staged"];
  const allTests=gdsTests.concat(stagedTests);
  const rows=FILTERED.filter(rec=>allTests.includes(rec.test));
  /* staged-only data is not a comparison — skip the section */
  const hasGds=rows.some(rec=>gdsTests.includes(rec.test));
  if(!hasGds)return;
  const levels=unique(rows,"concurrency");
  const grouped=groupBy(rows,"test");
  const present=allTests.filter(name=>grouped[name]?.length);
  const barW=22,innerGap=3,groupGap=28;
  const groupWidth=present.length*barW+(present.length-1)*innerGap;
  const H=320;
  /* never narrower than 400px */
  const width=Math.max(M.left+M.right+levels.length*(groupWidth+groupGap),400);
  const svg=createSVG(width,H);
  const peak=Math.max(...rows.map(rec=>rec.bandwidth_gbs||0));
  const yS=scaleLinear([0,peak*1.15],[H-M.bottom,M.top]);
  drawGridY(svg,yS,M.left,width-M.right);
  drawYAxis(svg,yS,M.left,"BW (GB/s)");
  levels.forEach((level,gi)=>{
    const groupX=M.left+gi*(groupWidth+groupGap)+groupGap/2;
    const tick=svgEl("text",{x:groupX+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
    tick.textContent=level;
    svg.appendChild(tick);
    present.forEach((name,bi)=>{
      const series=grouped[name]||[];
      const rec=series.find(r=>r.concurrency===level);
      if(!rec)return;
      const bw=rec.bandwidth_gbs||0;
      const barTop=yS(bw);
      const bar=svgEl("rect",{x:groupX+bi*(barW+innerGap),y:barTop,width:barW,height:Math.max(yS(0)-barTop,1),rx:3,fill:testColor(name),style:"cursor:pointer"});
      const tipHtml=()=>`<b>${fmtLabel(name)}</b><br>conc: ${level}<br>BW: ${fmtBW(bw)}`;
      bar.addEventListener("mouseenter",evt=>showTip(evt,tipHtml()));
      bar.addEventListener("mouseleave",hideTip);
      svg.appendChild(bar);
    });
  });
  const caption=svgEl("text",{x:(M.left+width-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"});
  caption.textContent="Concurrency";
  svg.appendChild(caption);
  const card=el("div",{class:"card"});
  card.appendChild(svg);
  card.appendChild(makeLegend(present));
  host.appendChild(card);
}
/* ── Section: Bidirectional ── */
/**
 * Grouped bar chart into #bidirChart comparing isolated transfers
 * ("g1_to_g2", "g2_to_g1") with their bidirectional counterparts
 * ("bidir_*"): one group per concurrency level, isolated bars first.
 * Bidirectional bars are drawn semi-transparent with a dashed outline.
 * Draws nothing unless at least one bidirectional record is present.
 */
function renderBidir(){
  const host=$("#bidirChart");
  host.innerHTML="";
  const bidirTests=["bidir_g1_to_g2","bidir_g2_to_g1"];
  const isoTests=["g1_to_g2","g2_to_g1"];
  const wanted=bidirTests.concat(isoTests);
  const rows=FILTERED.filter(rec=>wanted.includes(rec.test));
  const hasBidir=rows.some(rec=>bidirTests.includes(rec.test));
  if(!hasBidir)return;
  const levels=unique(rows,"concurrency");
  const grouped=groupBy(rows,"test");
  const present=isoTests.concat(bidirTests).filter(name=>grouped[name]?.length);
  const barW=22,innerGap=3,groupGap=28;
  const groupWidth=present.length*barW+(present.length-1)*innerGap;
  const H=320;
  /* never narrower than 400px */
  const width=Math.max(M.left+M.right+levels.length*(groupWidth+groupGap),400);
  const svg=createSVG(width,H);
  const peak=Math.max(...rows.map(rec=>rec.bandwidth_gbs||0));
  const yS=scaleLinear([0,peak*1.15],[H-M.bottom,M.top]);
  drawGridY(svg,yS,M.left,width-M.right);
  drawYAxis(svg,yS,M.left,"BW (GB/s)");
  levels.forEach((level,gi)=>{
    const groupX=M.left+gi*(groupWidth+groupGap)+groupGap/2;
    const tick=svgEl("text",{x:groupX+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
    tick.textContent=level;
    svg.appendChild(tick);
    present.forEach((name,bi)=>{
      const series=grouped[name]||[];
      const rec=series.find(r=>r.concurrency===level);
      if(!rec)return;
      const bw=rec.bandwidth_gbs||0;
      const barTop=yS(bw);
      const contended=DASHED.has(name);
      const color=testColor(name);
      const bar=svgEl("rect",{x:groupX+bi*(barW+innerGap),y:barTop,width:barW,height:Math.max(yS(0)-barTop,1),rx:3,fill:color,opacity:contended?"0.6":"1",style:"cursor:pointer"});
      if(contended){
        /* dashed outline marks the bidirectional variant */
        const outline={stroke:color,"stroke-width":"2","stroke-dasharray":"4,3"};
        for(const attr of Object.keys(outline))bar.setAttribute(attr,outline[attr]);
      }
      const tipHtml=()=>`<b>${fmtLabel(name)}</b><br>conc: ${level}<br>BW: ${fmtBW(bw)}`;
      bar.addEventListener("mouseenter",evt=>showTip(evt,tipHtml()));
      bar.addEventListener("mouseleave",hideTip);
      svg.appendChild(bar);
    });
  });
  const caption=svgEl("text",{x:(M.left+width-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"});
  caption.textContent="Concurrency";
  svg.appendChild(caption);
  const card=el("div",{class:"card"});
  card.appendChild(svg);
  card.appendChild(makeLegend(present));
  host.appendChild(card);
}
/* ── Section: Latency Distribution ── */
function renderLatency(){
  /* Latency summary chart: one horizontal row per test showing min/max whiskers,
     a p50-p95 box, a dashed p99 mark and a bold p50 line. */
  const wrap=$("#latChart");wrap.innerHTML="";
  const data=FILTERED.filter(d=>d.latency_us);
  if(!data.length)return;
  /* Each test gets a single row, drawn from one representative record
     (the one with the median mean latency, selected in the loop below). */
  const byTest=groupBy(data,"test");
  const tests=Object.keys(byTest);
  const rowH=36,padY=8;
  const W=700,H=M.top+tests.length*(rowH+padY)+M.bottom;
  const svg=createSVG(W,H);
  /* The x domain starts at 0, so only the global maximum latency is needed. */
  let gMax=0;
  tests.forEach(t=>{
    byTest[t].forEach(d=>{
      if(d.latency_us.max_us>gMax)gMax=d.latency_us.max_us;
    });
  });
  /* 5% headroom on the right; the left 100px of the plot area is reserved for row labels. */
  const xS=scaleLinear([0,gMax*1.05],[M.left+100,W-M.right]);
  /* x-axis */
  drawXAxis(svg,xS,H-M.bottom,"Latency (us)");
  tests.forEach((t,i)=>{
    /* Representative record: sort a copy by mean latency and take the middle element. */
    const recs=byTest[t].slice().sort((a,b)=>(a.latency_us.mean_us)-(b.latency_us.mean_us));
    const mid=recs[Math.floor(recs.length/2)];
    const l=mid.latency_us;
    const cy=M.top+i*(rowH+padY)+rowH/2;
    const color=testColor(t);
    /* label */
    const lbl=svgEl("text",{x:M.left+94,y:cy+4,"text-anchor":"end",fill:color,"font-size":"12","font-weight":"600",style:"font-family:var(--font)"});
    lbl.textContent=fmtLabel(t);svg.appendChild(lbl);
    /* whisker: min to max */
    svg.appendChild(svgEl("line",{x1:xS(l.min_us),x2:xS(l.max_us),y1:cy,y2:cy,stroke:color,"stroke-width":1}));
    /* caps */
    svg.appendChild(svgEl("line",{x1:xS(l.min_us),x2:xS(l.min_us),y1:cy-8,y2:cy+8,stroke:color,"stroke-width":1.5}));
    svg.appendChild(svgEl("line",{x1:xS(l.max_us),x2:xS(l.max_us),y1:cy-8,y2:cy+8,stroke:color,"stroke-width":1.5}));
    /* box: p50 to p95 (at least 2px wide so it stays visible) */
    const bx=xS(l.p50_us),bx2=xS(l.p95_us);
    const boxRect=svgEl("rect",{x:bx,y:cy-10,width:Math.max(bx2-bx,2),height:20,rx:4,fill:color,opacity:"0.25",stroke:color,"stroke-width":1.5});
    svg.appendChild(boxRect);
    /* p99 mark */
    const p99x=xS(l.p99_us);
    svg.appendChild(svgEl("line",{x1:p99x,x2:p99x,y1:cy-10,y2:cy+10,stroke:color,"stroke-width":2,"stroke-dasharray":"3,2"}));
    /* median line */
    const medX=xS(l.p50_us);
    svg.appendChild(svgEl("line",{x1:medX,x2:medX,y1:cy-10,y2:cy+10,stroke:color,"stroke-width":2.5}));
    /* hover target: transparent rect spanning the whole whisker, appended last so it sits on top */
    const hover=svgEl("rect",{x:xS(l.min_us)-2,y:cy-12,width:xS(l.max_us)-xS(l.min_us)+4,height:24,fill:"transparent",style:"cursor:pointer"});
    hover.addEventListener("mouseenter",e=>showTip(e,
      `<b>${fmtLabel(t)}</b> (conc=${mid.concurrency}, page=${mid.page_size})<br>`+
      `min: ${fmtUs(l.min_us)}<br>p50: ${fmtUs(l.p50_us)}<br>p95: ${fmtUs(l.p95_us)}<br>p99: ${fmtUs(l.p99_us)}<br>max: ${fmtUs(l.max_us)}`));
    hover.addEventListener("mouseleave",hideTip);
    svg.appendChild(hover);
  });
  /* Legend for box plot parts */
  const card=el("div",{class:"card"});
  card.appendChild(svg);
  const lgDiv=el("div",{class:"legend",style:{marginTop:"6px",fontSize:"11px",color:"#999"}});
  lgDiv.innerHTML="<span>Whiskers: min/max</span> &middot; <span>Box: p50&ndash;p95</span> &middot; <span>Dashed: p99</span> &middot; <span>Bold line: p50</span>";
  card.appendChild(lgDiv);
  wrap.appendChild(card);
}
/* ── Section: Raw Data Table ── */
/* Column descriptors for the raw data table: `key` is looked up through getVal
   (keys starting with "_lat_" are synthetic latency fields), `label` is the header
   text and `fmt` turns the raw value into the cell text. */
const TABLE_COLS=(()=>{
  /* Pass the value through untouched. */
  const asIs=v=>v;
  /* Fixed-point formatter; null/undefined render as an empty string. */
  const fixed=digits=>(v=>v?.toFixed(digits)??"");
  return [
    {key:"test",label:"Test",fmt:asIs},
    {key:"device_id",label:"GPU",fmt:asIs},
    {key:"page_size",label:"Page Size",fmt:asIs},
    {key:"blocks_per_batch",label:"Blocks/Batch",fmt:asIs},
    {key:"concurrency",label:"Conc",fmt:asIs},
    {key:"bounce_blocks",label:"Bounce",fmt:v=>v==null?"-":v},
    {key:"bandwidth_gbs",label:"BW (GB/s)",fmt:fixed(2)},
    {key:"aggregate_bandwidth_gbs",label:"Agg BW",fmt:fixed(2)},
    {key:"_lat_mean",label:"Lat mean",fmt:fixed(1)},
    {key:"_lat_p50",label:"Lat p50",fmt:fixed(1)},
    {key:"_lat_p99",label:"Lat p99",fmt:fixed(1)},
    {key:"iterations",label:"Iters",fmt:asIs},
  ];
})();
function getVal(d,key){
  /* Resolve a table column key against record `d`. The three synthetic "_lat_*"
     keys read from the nested latency_us object (undefined when it is absent);
     every other key is a direct property lookup. */
  switch(key){
    case "_lat_mean":
      return d.latency_us?.mean_us;
    case "_lat_p50":
      return d.latency_us?.p50_us;
    case "_lat_p99":
      return d.latency_us?.p99_us;
    default:
      return d[key];
  }
}
function renderTable(){
  /* Rebuild the raw data table from FILTERED, honouring the current sort
     column (sortCol) and direction (sortAsc). Clicking a header toggles the
     direction or switches the column, then re-renders. */
  const wrap=$("#tableWrap");
  wrap.innerHTML="";
  if(!FILTERED.length)return;

  /* Sort a copy so FILTERED keeps its order. */
  const rows=FILTERED.slice();
  if(sortCol!==null){
    const compare=(a,b)=>{
      /* Missing values are replaced by +/-Infinity so they end up last in either direction. */
      const missing=sortAsc?Infinity:-Infinity;
      const va=getVal(a,sortCol)??missing;
      const vb=getVal(b,sortCol)??missing;
      if(typeof va==="string"){
        return sortAsc?va.localeCompare(vb):vb.localeCompare(va);
      }
      return sortAsc?(va-vb):(vb-va);
    };
    rows.sort(compare);
  }

  const table=el("table");

  /* Header row with click-to-sort cells. */
  const thead=el("thead");
  const headRow=el("tr");
  for(const col of TABLE_COLS){
    const onHeaderClick=()=>{
      if(sortCol===col.key){
        sortAsc=!sortAsc;
      }else{
        sortCol=col.key;
        sortAsc=true;
      }
      renderTable();
    };
    const th=el("th",{onclick:onHeaderClick});
    const arrow=sortCol===col.key?` <span class="sort-arrow">${sortAsc?"\u25B2":"\u25BC"}</span>`:"";
    th.innerHTML=col.label+arrow;
    headRow.appendChild(th);
  }
  thead.appendChild(headRow);
  table.appendChild(thead);

  /* Body: one row per record, one cell per column. */
  const tbody=el("tbody");
  for(const rec of rows){
    const row=el("tr");
    for(const col of TABLE_COLS){
      const cell=el("td");
      const value=getVal(rec,col.key);
      cell.textContent=col.fmt(value);
      /* The test name is tinted with the same colour the charts use for it. */
      if(col.key==="test")cell.style.color=testColor(value);
      row.appendChild(cell);
    }
    tbody.appendChild(row);
  }
  table.appendChild(tbody);
  wrap.appendChild(table);
}
/* ── Section auto-hide ── */
function hideEmptySections(){
  /* Show or hide optional report sections depending on whether FILTERED
     contains any record relevant to them. Keys are the section element ids. */
  const anyTestIn=names=>FILTERED.some(d=>names.includes(d.test));
  const visible={
    bounce:FILTERED.some(d=>d.bounce_blocks!=null),
    g2g3:anyTestIn(["g2_to_g3","g3_to_g2"]),
    gds:anyTestIn(["g1_to_g3_gds","g3_to_g1_gds"]),
    bidir:FILTERED.some(d=>d.test.startsWith("bidir_")),
  };
  Object.keys(visible).forEach(id=>{
    const sec=document.getElementById(id);
    /* Sections that are not in the document are simply ignored. */
    if(!sec)return;
    sec.style.display=visible[id]?"":"none";
  });
}
/* ── Nav IntersectionObserver ── */
const navLinks=$$("nav a[data-sec]");
const secEls=navLinks.map(a=>document.getElementById(a.dataset.sec)).filter(Boolean);
const obsOpts={rootMargin:"-56px 0px -60% 0px",threshold:0};
const observer=new IntersectionObserver(entries=>{
entries.forEach(en=>{
if(en.isIntersecting){
navLinks.forEach(a=>a.classList.toggle("active",a.dataset.sec===en.target.id));
}
});
},obsOpts);
secEls.forEach(s=>observer.observe(s));
</script>
</body>
</html>
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Runs S3 integration tests against a local MinIO instance.
# Usage: bash lib/kvbm-engine/scripts/test-s3.sh
#
# Environment:
#   MINIO_PORT  Host port mapped to the container's port 9000 (default: 9876).
set -euo pipefail

# Container name includes the shell PID and a random number so that parallel
# invocations do not collide.
MINIO_CONTAINER="kvbm-minio-test-$$-$RANDOM"
MINIO_PORT="${MINIO_PORT:-9876}"
MINIO_ROOT_USER="minioadmin"
MINIO_ROOT_PASSWORD="minioadmin"

# Stop the MinIO container. The container was started with --rm, so stopping
# it also removes it. Errors are ignored because the container may never have
# started.
cleanup() {
    echo "Cleaning up MinIO container: $MINIO_CONTAINER"
    docker stop "$MINIO_CONTAINER" 2>/dev/null || true
}
trap cleanup EXIT

echo "Starting MinIO container: $MINIO_CONTAINER on port $MINIO_PORT"
docker run --rm -d \
    --name "$MINIO_CONTAINER" \
    -p "${MINIO_PORT}:9000" \
    -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \
    -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \
    minio/minio:latest server /data

# Wait for MinIO to be ready: poll the liveness endpoint once per second,
# giving up after 30 attempts.
echo "Waiting for MinIO to be ready..."
for i in $(seq 1 30); do
    if curl -sf "http://localhost:${MINIO_PORT}/minio/health/live" >/dev/null 2>&1; then
        echo "MinIO is ready."
        break
    fi
    if [ "$i" -eq 30 ]; then
        echo "ERROR: MinIO failed to start within 30 seconds"
        exit 1
    fi
    sleep 1
done

# Point the tests at the local MinIO instance.
export S3_TEST_ENDPOINT="http://localhost:${MINIO_PORT}"
export AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}"
export AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}"
export AWS_DEFAULT_REGION="us-east-1"

echo "Running S3 integration tests..."
# Capture the status via `||`: with `set -e` a bare failing command would
# terminate the script right here, before the status could be reported.
exit_code=0
timeout 120 cargo test -p kvbm-engine --features testing-s3 -- s3_integration || exit_code=$?
echo "Tests finished with exit code: $exit_code"
exit "$exit_code"
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! NCCL bootstrap utilities for creating communicators from scratch.
//!
//! This module provides helpers for initializing NCCL communicators in standalone
//! Rust applications and tests, where no external launcher (like PyTorch) provides
//! pre-initialized communicators.
//!
//! # Two Construction Paths
//!
//! NCCL communicators can be created via two paths:
//!
//! 1. **Bootstrap (this module)**: For tests and standalone Rust applications.
//! Rank 0 generates a unique ID, distributes it to other ranks, and all
//! ranks collectively call `ncclCommInitRank`.
//!
//! 2. **Borrowed handles**: For production use with PyTorch, vLLM, or TensorRT-LLM.
//! The external runtime creates the communicator, and Rust code borrows it
//! via FFI. See [`NcclCollectives::from_borrowed`].
//!
//! # Example: Multi-process Bootstrap
//!
//! ```rust,ignore
//! use kvbm::v2::distributed::collectives::NcclBootstrap;
//!
//! // Rank 0 generates the unique ID exactly once and shares it; every other
//! // rank rebuilds the same bootstrap from the bytes it receives.
//! let bootstrap = if rank == 0 {
//!     let bootstrap = NcclBootstrap::generate(world_size)?;
//!     let bytes = bootstrap.serialize();
//!     // Send `bytes` to other ranks via your IPC mechanism
//!     send_to_other_ranks(&bytes);
//!     bootstrap
//! } else {
//!     let bytes = receive_from_rank_0();
//!     NcclBootstrap::deserialize(&bytes)?
//! };
//!
//! // All ranks: Initialize communicator (collective call)
//! let comm = bootstrap.init_communicator(rank, stream)?;
//! ```
use std::ffi::c_char;
use std::mem::MaybeUninit;
/// Platform-neutral byte type for NCCL's `ncclUniqueId::internal` field.
///
/// `c_char` is `i8` on x86_64 and `u8` on aarch64; naming the element type through
/// this alias lets the `as NcclByte` casts in this module compile on both.
type NcclByte = c_char;
use anyhow::{Context, Result};
use cudarc::driver::sys::CUstream;
use cudarc::nccl::sys::{
ncclComm_t, ncclCommInitRank, ncclGetUniqueId, ncclResult_t, ncclUniqueId,
};
/// Bootstrap for creating NCCL communicators from scratch.
///
/// Used by tests and standalone Rust applications where NCCL communicators
/// need to be created without an external launcher.
///
/// # Workflow
///
/// 1. Rank 0 calls [`NcclBootstrap::generate`] to create the unique ID
/// 2. Rank 0 serializes via [`NcclBootstrap::serialize`] and sends to other ranks
/// 3. Other ranks deserialize via [`NcclBootstrap::deserialize`]
/// 4. All ranks collectively call [`NcclBootstrap::init_communicator`]
///
/// # Thread Safety
///
/// The bootstrap object itself is not thread-safe, but multiple threads can
/// each have their own bootstrap object with the same unique ID to initialize
/// communicators on different devices. The type is `Clone`, so a per-thread copy
/// can be made from one instance.
#[derive(Clone)]
pub struct NcclBootstrap {
/// NCCL unique ID shared by every rank of the group. Filled in by
/// `ncclGetUniqueId` in [`NcclBootstrap::generate`] or rebuilt from bytes in
/// [`NcclBootstrap::deserialize`].
nccl_id: ncclUniqueId,
/// Total number of ranks in the collective group.
world_size: usize,
}
impl NcclBootstrap {
/// Generate a new bootstrap on rank 0.
///
/// This creates a unique NCCL ID that must be shared with all other ranks
/// before they can initialize their communicators.
///
/// # Arguments
/// * `world_size` - Total number of ranks in the collective group
///
/// # Returns
/// A bootstrap object that can be serialized and distributed to other ranks.
///
/// # Errors
/// Returns an error if NCCL fails to generate a unique ID.
pub fn generate(world_size: usize) -> Result<Self> {
anyhow::ensure!(
world_size > 0 && world_size <= i32::MAX as usize,
"world_size must be in 1..={}, got {}",
i32::MAX,
world_size
);
let mut nccl_id = MaybeUninit::<ncclUniqueId>::uninit();
// SAFETY: ncclGetUniqueId initializes the ncclUniqueId struct
let result = unsafe { ncclGetUniqueId(nccl_id.as_mut_ptr()) };
check_nccl_result(result).context("Failed to generate NCCL unique ID")?;
// SAFETY: ncclGetUniqueId has initialized the struct
let nccl_id = unsafe { nccl_id.assume_init() };
Ok(Self {
nccl_id,
world_size,
})
}
/// Get the world size for this bootstrap.
pub fn world_size(&self) -> usize {
self.world_size
}
/// Serialize the bootstrap for transmission to other ranks.
///
/// The serialized format is:
/// - 8 bytes: world_size as little-endian u64
/// - 128 bytes: NCCL unique ID internal data
///
/// # Returns
/// A byte vector that can be transmitted via any IPC mechanism.
pub fn serialize(&self) -> Vec<u8> {
let mut bytes = Vec::with_capacity(8 + 128);
bytes.extend_from_slice(&(self.world_size as u64).to_le_bytes());
// Convert NcclByte array to u8 for serialization
for &byte in &self.nccl_id.internal {
bytes.push(byte as u8);
}
bytes
}
/// Deserialize a bootstrap received from rank 0.
///
/// # Arguments
/// * `bytes` - Serialized bootstrap data from [`NcclBootstrap::serialize`]
///
/// # Returns
/// A bootstrap object that can be used to initialize a communicator.
///
/// # Errors
/// Returns an error if the byte array has incorrect length.
pub fn deserialize(bytes: &[u8]) -> Result<Self> {
if bytes.len() != 8 + 128 {
anyhow::bail!(
"Invalid bootstrap data length: expected {}, got {}",
8 + 128,
bytes.len()
);
}
let world_size = u64::from_le_bytes(bytes[0..8].try_into().unwrap()) as usize;
let mut nccl_id = ncclUniqueId {
internal: [0 as NcclByte; 128],
};
// Copy bytes into internal array
for (i, &byte) in bytes[8..].iter().enumerate() {
nccl_id.internal[i] = byte as NcclByte;
}
Ok(Self {
nccl_id,
world_size,
})
}
/// Initialize an NCCL communicator for this rank.
///
/// This is a **collective operation** - all ranks must call this method
/// simultaneously with the same bootstrap data for initialization to succeed.
///
/// # Arguments
/// * `rank` - The rank of this worker (0 to world_size-1)
/// * `stream` - The CUDA stream to associate with NCCL operations
///
/// # Returns
/// An NCCL communicator handle that can be used for collective operations.
///
/// # Safety
/// The returned communicator must be destroyed with `ncclCommDestroy` when
/// no longer needed. The caller is responsible for lifetime management.
///
/// # Errors
/// Returns an error if:
/// - `rank` is >= `world_size`
/// - NCCL initialization fails (e.g., network issues, GPU errors)
/// - Not all ranks call this method (will hang)
pub fn init_communicator(&self, rank: usize, _stream: CUstream) -> Result<ncclComm_t> {
if rank >= self.world_size {
anyhow::bail!(
"Rank {} is invalid for world_size {}",
rank,
self.world_size
);
}
anyhow::ensure!(
self.world_size <= i32::MAX as usize,
"world_size {} exceeds i32::MAX",
self.world_size
);
let mut comm = MaybeUninit::<ncclComm_t>::uninit();
// SAFETY: ncclCommInitRank is a collective call that initializes the communicator.
// All ranks must call this with the same nccl_id for it to complete.
let result = unsafe {
ncclCommInitRank(
comm.as_mut_ptr(),
self.world_size as i32,
self.nccl_id,
rank as i32,
)
};
check_nccl_result(result).context("Failed to initialize NCCL communicator")?;
// SAFETY: ncclCommInitRank has initialized the communicator
let comm = unsafe { comm.assume_init() };
tracing::debug!(
rank,
world_size = self.world_size,
"NCCL communicator initialized"
);
Ok(comm)
}
}
/// Turn an NCCL status code into an [`anyhow::Result`].
///
/// `ncclSuccess` maps to `Ok(())`; any other status becomes an error whose
/// message contains the status' `Debug` representation.
pub(crate) fn check_nccl_result(result: ncclResult_t) -> Result<()> {
    anyhow::ensure!(
        result == ncclResult_t::ncclSuccess,
        "NCCL error: {:?}",
        result
    );
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialization round-trip. No NCCL function is called: the bootstrap is
    /// built directly from a dummy ID instead of going through `ncclGetUniqueId`.
    #[test]
    fn test_bootstrap_serialization_roundtrip() {
        let original = NcclBootstrap {
            nccl_id: ncclUniqueId {
                internal: [42 as NcclByte; 128],
            },
            world_size: 4,
        };

        let encoded = original.serialize();
        assert_eq!(encoded.len(), 8 + 128);

        let decoded = NcclBootstrap::deserialize(&encoded).unwrap();
        assert_eq!(decoded.world_size, original.world_size);
        assert_eq!(decoded.nccl_id.internal, original.nccl_id.internal);
    }

    /// Input of the wrong length must be rejected.
    #[test]
    fn test_deserialize_invalid_length() {
        let too_short = [0u8; 10];
        assert!(NcclBootstrap::deserialize(&too_short).is_err());
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Collective communication operations for distributed workers.
//!
//! This module provides infrastructure for collective operations needed by
//! replicated data workers. It defines the [`CollectiveOps`] trait and provides
//! multiple implementations:
//!
//! - [`StubCollectiveOps`]: No-op implementation for testing and single-worker scenarios
//! - [`NcclCollectives`]: NCCL-based implementation for GPU collective operations (requires `nccl` feature)
//!
//! # Architecture
//!
//! In MLA (Multi-head Latent Attention) scenarios, KV blocks are replicated across
//! all workers rather than sharded. This means only rank 0 needs G2/G3 storage -
//! other ranks receive data via broadcast from rank 0 after it loads from G2/G3.
//!
//! ```text
//! Rank 0: G3 (disk) ←→ G2 (host) ←→ G1 (GPU) ───broadcast──→ Other ranks G1
//! Rank 1-N: [no G2/G3] G1 (GPU) ←──────────────────────┘
//! ```
//!
//! # Example
//!
//! ```rust,ignore
//! use kvbm::v2::distributed::collectives::{CollectiveOps, StubCollectiveOps};
//!
//! let collective = StubCollectiveOps::new(events);
//!
//! // Broadcast G1 blocks from rank 0 to all ranks
//! let notification = collective.broadcast(
//! LogicalLayoutHandle::G1,
//! LogicalLayoutHandle::G1,
//! &src_block_ids,
//! &dst_block_ids,
//! Some(0..32),
//! )?;
//! notification.await_completion()?;
//! ```
mod stub;
#[cfg(feature = "nccl")]
mod bootstrap;
#[cfg(feature = "nccl")]
mod nccl;
pub use stub::StubCollectiveOps;
#[cfg(feature = "nccl")]
pub use bootstrap::NcclBootstrap;
#[cfg(feature = "nccl")]
pub use nccl::{CudaEventRegistrar, LayoutResolver, NcclCollectives};
use std::ops::Range;
use anyhow::Result;
use crate::BlockId;
use kvbm_common::LogicalLayoutHandle;
use kvbm_physical::transfer::TransferCompleteNotification;
/// Collective communication operations for distributed workers.
///
/// This trait defines the collective operations needed by replicated data workers
/// to broadcast data across ranks. Implementations may use NCCL, NIXL, or other
/// collective communication libraries.
///
/// # Thread Safety
///
/// Implementations must be `Send + Sync` to allow sharing across threads.
/// NCCL operations are inherently thread-safe when used correctly (one stream
/// per communicator per thread).
pub trait CollectiveOps: Send + Sync {
/// Broadcast blocks from rank 0 to all other ranks.
///
/// This operation transfers the specified blocks from the source layout on
/// rank 0 to the destination layout on all other ranks. Optionally, a layer
/// range can be specified to transfer only a subset of layers (for pipelined
/// loading).
///
/// # Arguments
/// * `src` - The source logical layout (typically G1 on rank 0)
/// * `dst` - The destination logical layout (typically G1 on all ranks)
/// * `src_block_ids` - The block IDs to read from on the source
/// * `dst_block_ids` - The block IDs to write to on the destination
///   (presumably paired element-wise with `src_block_ids` and of equal length —
///   TODO confirm against the implementations)
/// * `layer_range` - Optional range of layers to transfer. If None, all layers are transferred.
///
/// # Returns
/// A notification that completes when the broadcast has finished on the local
/// rank (see "Synchronization" below for what this implies globally).
///
/// # Errors
/// Implementation-defined; the visible signature only fixes that failures are
/// reported through `anyhow::Result`.
///
/// # Synchronization
///
/// This is a collective operation - all ranks must call this method with
/// the same arguments for the broadcast to complete correctly. The returned
/// notification signals local completion; global completion is guaranteed
/// by the collective semantics of the underlying implementation.
fn broadcast(
&self,
src: LogicalLayoutHandle,
dst: LogicalLayoutHandle,
src_block_ids: &[BlockId],
dst_block_ids: &[BlockId],
layer_range: Option<Range<usize>>,
) -> Result<TransferCompleteNotification>;
/// Get the rank of this worker in the collective group.
fn rank(&self) -> usize;
/// Get the total number of workers in the collective group.
fn world_size(&self) -> usize;
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! NCCL-based collective operations for GPU-to-GPU communication.
//!
//! This module provides [`NcclCollectives`], an implementation of [`CollectiveOps`]
//! that uses NVIDIA NCCL for efficient GPU collective communication.
//!
//! # Construction Paths
//!
//! NCCL communicators can be obtained via two paths:
//!
//! ## Path A: Bootstrap (tests and standalone Rust apps)
//!
//! Use [`NcclCollectives::from_bootstrap`] when creating communicators from scratch:
//!
//! ```rust,ignore
//! let bootstrap = NcclBootstrap::generate(world_size)?;
//! // ... distribute bootstrap to other ranks ...
//! let collectives = NcclCollectives::from_bootstrap(
//!     &bootstrap,
//!     rank,
//!     cuda_context,
//!     event_system,
//!     event_registrar,
//!     layout_resolver,
//! )?;
//! ```
//!
//! ## Path B: Borrowed handles (production with PyTorch/vLLM/TensorRT-LLM)
//!
//! Use [`NcclCollectives::from_borrowed`] when an external runtime provides the communicator:
//!
//! ```rust,ignore
//! // In Python: comm_ptr = dist._get_default_group()._get_backend()._nccl_comm.as_int()
//! let collectives = unsafe {
//!     NcclCollectives::from_borrowed(
//!         comm_ptr,
//!         stream_ptr,
//!         rank,
//!         world_size,
//!         cuda_context,
//!         event_system,
//!         event_registrar,
//!         layout_resolver,
//!     )
//! };
//! ```
//!
//! # Thread Safety
//!
//! NCCL operations are thread-safe when each thread uses its own stream. This
//! implementation uses a dedicated NCCL stream per `NcclCollectives` instance.
use std::ops::Range;
use std::sync::Arc;
use anyhow::{Context, Result};
use cudarc::driver::sys::CUstream;
use cudarc::driver::{CudaContext, CudaEvent, CudaStream};
use cudarc::nccl::sys::{
ncclBcast, ncclComm_t, ncclCommDestroy, ncclDataType_t, ncclGroupEnd, ncclGroupStart,
};
use velo::EventManager;
use crate::BlockId;
use kvbm_common::LogicalLayoutHandle;
use kvbm_physical::layout::PhysicalLayout;
use kvbm_physical::transfer::TransferCompleteNotification;
use super::CollectiveOps;
use super::bootstrap::{NcclBootstrap, check_nccl_result};
/// Trait for resolving logical layout handles to physical layouts.
///
/// This trait decouples [`NcclCollectives`] from [`PhysicalWorker`], allowing
/// the collective operations to work with any layout resolution strategy.
/// Implementations must be `Send + Sync`; [`NcclCollectives`] stores the
/// resolver as an `Arc<dyn LayoutResolver>`.
pub trait LayoutResolver: Send + Sync {
/// Resolve a logical layout handle to a physical layout.
///
/// # Arguments
/// * `logical` - The logical layout handle (G1, G2, G3)
///
/// # Returns
/// The physical layout for the given logical handle, or an error if not found.
fn resolve_layout(&self, logical: LogicalLayoutHandle) -> Result<PhysicalLayout>;
}
/// Trait for registering CUDA events for completion notification.
///
/// This trait abstracts the CUDA event registration mechanism, allowing
/// [`NcclCollectives`] to integrate with different event polling systems.
/// Implementations should use efficient background polling rather than
/// spawning individual tasks per event.
///
/// The primary implementation wraps `TransferContext::register_cuda_event`,
/// which uses a shared background task for polling multiple events.
pub trait CudaEventRegistrar: Send + Sync {
/// Register a CUDA event for completion notification.
///
/// The returned notification will complete when the CUDA event has been
/// signaled (i.e., all operations recorded before the event have completed).
/// The event is passed by value, so the registrar takes ownership of it.
///
/// # Arguments
/// * `event` - The CUDA event to monitor
///
/// # Returns
/// A notification that completes when the event is signaled.
fn register_cuda_event(&self, event: CudaEvent) -> TransferCompleteNotification;
}
/// Ownership mode for the NCCL communicator.
///
/// Set by the constructors of [`NcclCollectives`]: `from_bootstrap` stores
/// `Owned`, `from_borrowed` stores `Borrowed`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CommOwnership {
/// We own the communicator and must destroy it on drop.
Owned,
/// The communicator is borrowed from external code (PyTorch, etc.).
Borrowed,
}
/// Stream wrapper that can be either owned or borrowed.
///
/// Both variants can yield a raw `CUstream` for NCCL calls; only the owned
/// variant exposes a `CudaStream` on which events can be recorded.
enum NcclStream {
/// Owned CudaStream - we control its lifetime
Owned(Arc<CudaStream>),
/// Borrowed raw stream pointer - caller controls lifetime
Borrowed(CUstream),
}
impl NcclStream {
/// Get the raw CUDA stream pointer for NCCL calls.
fn raw(&self) -> CUstream {
match self {
NcclStream::Owned(stream) => stream.cu_stream(),
NcclStream::Borrowed(ptr) => *ptr,
}
}
/// Get the owned stream (for recording events). Only available for owned streams.
fn as_owned(&self) -> Option<&Arc<CudaStream>> {
match self {
NcclStream::Owned(stream) => Some(stream),
NcclStream::Borrowed(_) => None,
}
}
}
/// NCCL-based collective operations for GPU-to-GPU communication.
///
/// This implementation uses NVIDIA NCCL for efficient broadcast operations
/// across GPUs. It supports both owned communicators (created via bootstrap)
/// and borrowed communicators (from PyTorch, vLLM, etc.).
///
/// # Performance
///
/// Broadcast operations use NCCL groups to batch multiple memory region
/// transfers into a single collective operation, minimizing synchronization
/// overhead.
pub struct NcclCollectives {
/// NCCL communicator handle
comm: ncclComm_t,
/// Whether we own the communicator (and must destroy it on drop)
ownership: CommOwnership,
/// Rank of this worker in the collective group
rank: usize,
/// Total number of workers in the collective group
world_size: usize,
/// CUDA stream for NCCL operations (owned or borrowed)
nccl_stream: NcclStream,
/// CUDA context for stream/event management (only used for owned mode).
/// After construction it is only stored, hence the `dead_code` allowance.
#[allow(dead_code)]
cuda_context: Arc<CudaContext>,
/// Event system for completion notifications (used for borrowed stream fallback)
event_system: EventManager,
/// CUDA event registrar for efficient completion notification
event_registrar: Arc<dyn CudaEventRegistrar>,
/// Layout resolver for mapping logical handles to physical layouts
layout_resolver: Arc<dyn LayoutResolver>,
}
impl NcclCollectives {
// =========================================================================
// Path A: Create from scratch (used by tests, standalone Rust apps)
// =========================================================================
/// Create with a new NCCL communicator initialized from bootstrap info.
///
/// This is a **collective operation** - all ranks must call simultaneously
/// with the same bootstrap data for initialization to succeed.
///
/// # Arguments
/// * `bootstrap` - Bootstrap data containing the NCCL unique ID
/// * `rank` - The rank of this worker (0 to world_size-1)
/// * `cuda_context` - CUDA context for stream management
/// * `event_system` - Event system for fallback completion notifications
/// * `event_registrar` - Registrar for efficient CUDA event completion polling
/// * `layout_resolver` - Resolver for mapping logical handles to physical layouts
///
/// # Returns
/// A new `NcclCollectives` instance that owns its communicator.
///
/// # Errors
/// Returns an error if NCCL initialization fails.
pub fn from_bootstrap(
bootstrap: &NcclBootstrap,
rank: usize,
cuda_context: Arc<CudaContext>,
event_system: EventManager,
event_registrar: Arc<dyn CudaEventRegistrar>,
layout_resolver: Arc<dyn LayoutResolver>,
) -> Result<Self> {
let nccl_stream = cuda_context
.new_stream()
.context("Failed to create NCCL stream")?;
let comm = bootstrap
.init_communicator(rank, nccl_stream.cu_stream())
.context("Failed to initialize NCCL communicator")?;
Ok(Self {
comm,
ownership: CommOwnership::Owned,
rank,
world_size: bootstrap.world_size(),
nccl_stream: NcclStream::Owned(nccl_stream),
cuda_context,
event_system,
event_registrar,
layout_resolver,
})
}
// =========================================================================
// Path B: Borrow existing communicator (production use with Python/C/C++)
// =========================================================================
/// Create from borrowed NCCL handles passed from external code.
///
/// This is the primary production path when the NCCL communicator is
/// initialized by Python (torch.distributed), C++, or another runtime.
///
/// # Arguments
/// * `comm_ptr` - Raw pointer to `ncclComm_t` handle (cast to usize)
/// * `stream_ptr` - Raw pointer to `cudaStream_t` handle (cast to usize)
/// * `rank` - The rank of this worker in the collective group
/// * `world_size` - Total number of workers in the collective group
/// * `cuda_context` - CUDA context for event management
/// * `event_system` - Event system for fallback completion notifications
/// * `event_registrar` - Registrar for efficient CUDA event completion polling
/// * `layout_resolver` - Resolver for mapping logical handles to physical layouts
///
/// # Safety
/// - `comm_ptr` must be a valid `ncclComm_t` handle
/// - `stream_ptr` must be a valid `cudaStream_t` handle
/// - The caller must ensure the handles outlive this struct
/// - The communicator must not be destroyed while this struct exists
///
/// # FFI Example (Python via PyO3)
/// ```python
/// # In Python
/// comm = torch.distributed.distributed_c10d._get_default_group()._get_backend()._nccl_comm
/// stream = torch.cuda.current_stream()
///
/// # Pass to Rust
/// collectives = NcclCollectives.from_borrowed(
/// comm_ptr=comm.as_int(),
/// stream_ptr=stream.cuda_stream,
/// rank=rank,
/// world_size=world_size,
/// )
/// ```
///
/// # FFI Example (C/C++)
/// ```c
/// // In C/C++
/// ncclComm_t comm;
/// ncclCommInitRank(&comm, world_size, id, rank);
/// cudaStream_t stream;
/// cudaStreamCreate(&stream);
///
/// // Pass to Rust via FFI
/// nccl_collectives_from_borrowed((uintptr_t)comm, (uintptr_t)stream, rank, world_size);
/// ```
#[allow(clippy::too_many_arguments)]
pub unsafe fn from_borrowed(
comm_ptr: usize,
stream_ptr: usize,
rank: usize,
world_size: usize,
cuda_context: Arc<CudaContext>,
event_system: EventManager,
event_registrar: Arc<dyn CudaEventRegistrar>,
layout_resolver: Arc<dyn LayoutResolver>,
) -> Self {
Self {
comm: comm_ptr as ncclComm_t,
ownership: CommOwnership::Borrowed,
rank,
world_size,
nccl_stream: NcclStream::Borrowed(stream_ptr as CUstream),
cuda_context,
event_system,
event_registrar,
layout_resolver,
}
}
/// Broadcast memory regions using NCCL grouped operations.
///
/// # Arguments
/// * `regions` - Vector of (ptr, size) pairs for memory regions to broadcast
/// * `root` - Root rank for the broadcast
///
/// # Errors
/// Returns an error if any NCCL operation fails.
fn broadcast_regions(&self, regions: &[(usize, usize)], root: i32) -> Result<()> {
if regions.is_empty() {
return Ok(());
}
let stream = self.nccl_stream.raw();
// Start NCCL group - batches operations for efficiency
let result = unsafe { ncclGroupStart() };
check_nccl_result(result).context("ncclGroupStart failed")?;
// Queue all broadcasts within the group
for (ptr, size) in regions {
// SAFETY: We're calling NCCL with valid pointers within a group operation.
// The stream cast is safe because both cudarc::driver::sys::CUstream and
// cudarc::nccl::sys::CUstream are the same underlying CUDA type (*mut CUstream_st).
let result = unsafe {
ncclBcast(
*ptr as *mut std::ffi::c_void,
*size,
ncclDataType_t::ncclChar, // byte-level transfer
root,
self.comm,
stream.cast(),
)
};
check_nccl_result(result).context("ncclBcast failed")?;
}
// End group - submits all queued ops to GPU
let result = unsafe { ncclGroupEnd() };
check_nccl_result(result).context("ncclGroupEnd failed")?;
Ok(())
}
/// Collect memory regions for a set of blocks and layers.
///
/// Regions are emitted block-major: for each block, for each layer in the
/// range, for each outer index in `0..outer_dim`.
///
/// # Arguments
/// * `layout` - Physical layout to query
/// * `block_ids` - Block IDs to collect regions for
/// * `layer_range` - Range of layers to include (None = all layers)
///
/// # Returns
/// Vector of (address, size) pairs for the requested regions. An empty or
/// inverted `layer_range` yields an empty vector.
///
/// # Errors
/// Propagates any error returned by `layout.memory_region`.
fn collect_regions(
    &self,
    layout: &PhysicalLayout,
    block_ids: &[BlockId],
    layer_range: Option<Range<usize>>,
) -> Result<Vec<(usize, usize)>> {
    let num_layers = layout.layout().num_layers();
    let outer_dim = layout.layout().outer_dim();
    let layer_range = layer_range.unwrap_or(0..num_layers);

    // `Range::len` is 0 when start >= end, whereas `end - start` would
    // underflow for an inverted range such as `5..2`.
    let layers_per_block = layer_range.len();
    let mut regions = Vec::with_capacity(block_ids.len() * layers_per_block * outer_dim);

    for &block_id in block_ids {
        for layer_id in layer_range.clone() {
            for outer_id in 0..outer_dim {
                let region = layout.memory_region(block_id, layer_id, outer_id)?;
                regions.push((region.addr, region.size));
            }
        }
    }
    Ok(regions)
}
/// Build the notification returned to callers after NCCL work has been queued.
///
/// * Owned stream: a CUDA event is recorded on the stream and registered with
///   the event registrar, which produces the notification.
/// * Borrowed stream: no CUDA event is recorded. A warning is logged and a
///   notification backed by an already-triggered event is returned, so the
///   caller must synchronize with the borrowed stream itself.
fn create_completion_notification(&self) -> Result<TransferCompleteNotification> {
    let Some(owned_stream) = self.nccl_stream.as_owned() else {
        // Borrowed stream: we do not own it, so no event is recorded here.
        tracing::warn!(
            "Using borrowed stream - returning immediate completion. \
             Caller must ensure stream synchronization."
        );
        let pre_fired = self.event_system.new_event()?;
        let pre_fired_handle = pre_fired.handle();
        pre_fired.trigger()?;
        let awaiter = self.event_system.awaiter(pre_fired_handle)?;
        return Ok(TransferCompleteNotification::from_awaiter(awaiter));
    };

    // Owned stream: record an event and let the registrar poll it.
    let recorded = owned_stream
        .record_event(None)
        .context("Failed to record CUDA event")?;
    Ok(self.event_registrar.register_cuda_event(recorded))
}
}
impl CollectiveOps for NcclCollectives {
    /// Broadcast the given blocks from rank 0 to every other rank.
    ///
    /// Rank 0 is always the root: it sends from the `src` layout using
    /// `src_block_ids`. Every other rank receives into the `dst` layout using
    /// `dst_block_ids`. Only the layout this rank actually touches is
    /// resolved, so a rank does not fail because the other side's handle
    /// cannot be resolved locally.
    ///
    /// # Arguments
    /// * `src` / `dst` - Logical layout handles for the root and the receivers
    /// * `src_block_ids` / `dst_block_ids` - Blocks to send / receive into
    /// * `layer_range` - Layers to include (None = all layers)
    ///
    /// # Errors
    /// Returns an error if the layout cannot be resolved, a memory region
    /// lookup fails, any NCCL call fails, or the completion notification
    /// cannot be created.
    fn broadcast(
        &self,
        src: LogicalLayoutHandle,
        dst: LogicalLayoutHandle,
        src_block_ids: &[BlockId],
        dst_block_ids: &[BlockId],
        layer_range: Option<Range<usize>>,
    ) -> Result<TransferCompleteNotification> {
        // For broadcast, rank 0 uses src, other ranks use dst
        let (handle, block_ids) = if self.rank == 0 {
            (src, src_block_ids)
        } else {
            (dst, dst_block_ids)
        };
        let layout = self.layout_resolver.resolve_layout(handle)?;

        // Collect memory regions for the broadcast
        let regions = self.collect_regions(&layout, block_ids, layer_range)?;
        tracing::debug!(
            rank = self.rank,
            world_size = self.world_size,
            num_regions = regions.len(),
            total_bytes = regions.iter().map(|(_, size)| size).sum::<usize>(),
            "Starting NCCL broadcast"
        );

        // Execute grouped broadcast (rank 0 is always root for broadcast)
        self.broadcast_regions(&regions, 0)?;

        // Create completion notification
        self.create_completion_notification()
    }

    /// Rank of this worker within the collective group.
    fn rank(&self) -> usize {
        self.rank
    }

    /// Total number of workers in the collective group.
    fn world_size(&self) -> usize {
        self.world_size
    }
}
impl Drop for NcclCollectives {
    /// Destroys the NCCL communicator when this instance owns it.
    ///
    /// Communicators with any other ownership are left untouched. A failed
    /// destroy is logged as a warning rather than propagated.
    fn drop(&mut self) {
        if self.ownership != CommOwnership::Owned {
            return;
        }
        // SAFETY: We own this communicator and it's valid
        let status = unsafe { ncclCommDestroy(self.comm) };
        if let Err(e) = check_nccl_result(status) {
            tracing::warn!("Failed to destroy NCCL communicator: {:?}", e);
        }
    }
}
// SAFETY: NcclCollectives can be sent between threads.
// The NCCL communicator itself is thread-safe when operations use
// the same stream (which we guarantee by having a dedicated stream).
// NOTE(review): the struct stores a raw `ncclComm_t` and, for borrowed
// streams, a raw `CUstream`; confirm the "dedicated stream" guarantee also
// holds when the stream is borrowed from the caller.
unsafe impl Send for NcclCollectives {}
// SAFETY: NcclCollectives can be shared between threads.
// All mutable state is behind Arc or atomic operations, and NCCL
// operations are thread-safe when using the same stream.
// NOTE(review): nothing in this impl serializes concurrent `broadcast` calls
// made through a shared reference — confirm callers do not issue them
// concurrently on one communicator.
unsafe impl Sync for NcclCollectives {}
#[cfg(test)]
mod tests {
use super::*;
use cudarc::driver::{CudaContext, CudaSlice, DevicePtr};
use cudarc::nccl::sys::{ncclCommDestroy, ncclCommInitAll};
use std::ffi::c_int;
use std::sync::{Arc, Barrier};
use std::thread;
/// Number of CUDA devices reported by the driver.
///
/// A failed query is treated as "no devices" and yields 0.
fn cuda_device_count() -> usize {
    CudaContext::device_count().map_or(0, |count| count as usize)
}
/// Create one NCCL communicator per device with a single `ncclCommInitAll` call.
///
/// This is the single-process multi-GPU initialization pattern: devices
/// `0..num_devices` each get a communicator.
///
/// # Returns
/// The communicator handles cast to `usize` so they can be moved into threads.
///
/// # Errors
/// Returns an error if `ncclCommInitAll` reports a failure.
unsafe fn init_all_comms(num_devices: usize) -> Result<Vec<usize>> {
    let device_ids: Vec<c_int> = (0..num_devices as c_int).collect();
    let mut raw_comms: Vec<ncclComm_t> = vec![std::ptr::null_mut(); num_devices];

    // SAFETY: ncclCommInitAll is safe to call with valid pointers
    let status = unsafe {
        ncclCommInitAll(
            raw_comms.as_mut_ptr(),
            num_devices as c_int,
            device_ids.as_ptr(),
        )
    };
    check_nccl_result(status).context("ncclCommInitAll failed")?;

    // Convert to usize for Send
    let mut handles = Vec::with_capacity(raw_comms.len());
    for comm in raw_comms {
        handles.push(comm as usize);
    }
    Ok(handles)
}
/// Destroy every communicator in `comms`.
///
/// Each entry is a communicator handle previously cast to `usize`. The result
/// of each destroy call is deliberately ignored.
unsafe fn destroy_comms(comms: &[usize]) {
    comms.iter().for_each(|&handle| {
        // SAFETY: Converting back from usize and destroying
        let _ = unsafe { ncclCommDestroy(handle as ncclComm_t) };
    });
}
/// Return the device address of `slice` as a plain `usize`.
///
/// The address is obtained through `slice.device_ptr(stream)`; the guard that
/// call returns alongside the pointer is dropped when this function returns.
fn get_device_ptr(slice: &CudaSlice<u8>, stream: &CudaStream) -> usize {
    let (device_address, _sync_guard) = slice.device_ptr(stream);
    device_address as usize
}
// NOTE: These NCCL tests require a full NCCL installation with all symbols.
// Some stripped NCCL builds (e.g., Lambda Labs' 2.26.2-0lambda1) are missing
// ncclAlltoAll, ncclGather, ncclScatter, etc. which cudarc requires.
// If tests fail with "undefined symbol: ncclAlltoAll", install official NVIDIA NCCL.
/// Broadcasts a 1 MB buffer filled with 0xAB from rank 0 to rank 1 with one raw
/// `ncclBcast` per rank, each issued from its own thread, then checks every byte
/// on both devices. Returns early without failing when fewer than two CUDA
/// devices are reported.
#[test]
#[cfg(feature = "testing-nccl")]
fn test_nccl_broadcast_multi_gpu_raw() {
// Skip if < 2 GPUs available
let num_devices = cuda_device_count();
if num_devices < 2 {
println!(
"Skipping test: {} GPUs available, need at least 2",
num_devices
);
return;
}
// Use 2 GPUs for the test
let world_size = 2;
println!("Testing NCCL broadcast with {} GPUs", world_size);
// Initialize all communicators at once (single-process pattern)
let comms = unsafe { init_all_comms(world_size) }.expect("Failed to init NCCL comms");
// Create CUDA contexts and streams for each device
let contexts: Vec<Arc<CudaContext>> = (0..world_size)
.map(|i| CudaContext::new(i).expect("Failed to create CUDA context"))
.collect();
let streams: Vec<Arc<CudaStream>> = contexts
.iter()
.map(|ctx| ctx.new_stream().expect("Failed to create stream"))
.collect();
// Test data
let test_size = 1024 * 1024; // 1 MB
let test_pattern: u8 = 0xAB;
// Allocate one zero-filled device buffer per rank (via a host-to-device copy)
let buffers: Vec<CudaSlice<u8>> = streams
.iter()
.map(|stream| {
// Allocate zeroed buffer
let zeros = vec![0u8; test_size];
stream
.clone_htod(&zeros)
.expect("Failed to allocate buffer")
})
.collect();
// Fill rank 0's buffer with test pattern
{
let host_data = vec![test_pattern; test_size];
// Stage the pattern in a temporary device buffer on device 0
let buffer = streams[0]
.clone_htod(&host_data)
.expect("Failed to copy to device 0");
// Copy from the staging buffer into rank 0's broadcast buffer
let src_ptr = get_device_ptr(&buffer, &streams[0]);
let dst_ptr = get_device_ptr(&buffers[0], &streams[0]);
unsafe {
cudarc::driver::result::memcpy_dtod_async(
dst_ptr as u64,
src_ptr as u64,
test_size,
streams[0].cu_stream(),
)
.expect("dtod copy failed");
}
// Wait for the copy so the staging buffer can be dropped at the end of this scope
streams[0].synchronize().expect("sync failed");
}
// Get buffer pointers before spawning threads (to avoid lifetime issues)
let buffer_ptrs: Vec<usize> = buffers
.iter()
.zip(streams.iter())
.map(|(buf, stream)| get_device_ptr(buf, stream))
.collect();
// Synchronization barrier for threads
let barrier = Arc::new(Barrier::new(world_size));
// Spawn one thread per rank to perform the broadcast
let handles: Vec<_> = (0..world_size)
.map(|rank| {
let comm = comms[rank]; // Already usize, which is Send
let stream = streams[rank].clone();
let buffer_ptr = buffer_ptrs[rank];
let barrier = barrier.clone();
thread::spawn(move || {
// Wait for all threads to be ready
barrier.wait();
// Perform broadcast (rank 0 is root)
let result = unsafe {
ncclBcast(
buffer_ptr as *mut std::ffi::c_void,
test_size,
ncclDataType_t::ncclChar,
0, // root rank
comm as ncclComm_t, // Convert back to ncclComm_t
stream.cu_stream().cast(),
)
};
check_nccl_result(result).expect("ncclBcast failed");
// Synchronize stream
stream.synchronize().expect("Stream sync failed");
println!("Rank {} completed broadcast", rank);
})
})
.collect();
// Wait for all threads to complete
for handle in handles {
handle.join().expect("Thread panicked");
}
// Verify all buffers have the test pattern
for (rank, (stream, buffer)) in streams.iter().zip(buffers.iter()).enumerate() {
let host_data = stream
.clone_dtoh(buffer)
.expect("Failed to copy from device");
// Spot-check the first, last, and middle bytes so a failure names the position
assert_eq!(
host_data[0], test_pattern,
"Rank {} first byte mismatch",
rank
);
assert_eq!(
host_data[test_size - 1],
test_pattern,
"Rank {} last byte mismatch",
rank
);
assert_eq!(
host_data[test_size / 2],
test_pattern,
"Rank {} middle byte mismatch",
rank
);
// Verify all bytes
let mismatch_count = host_data.iter().filter(|&&b| b != test_pattern).count();
assert_eq!(
mismatch_count, 0,
"Rank {} has {} mismatched bytes",
rank, mismatch_count
);
println!("Rank {} verified: all {} bytes correct", rank, test_size);
}
// Clean up (not reached if an assertion above panics)
unsafe { destroy_comms(&comms) };
println!("Test passed!");
}
/// Broadcasts four 256 KB regions from rank 0 to rank 1 inside a single
/// `ncclGroupStart`/`ncclGroupEnd` pair per rank, each region carrying its own
/// byte pattern, then checks every byte of every region on both devices.
/// Returns early without failing when fewer than two CUDA devices are reported.
#[test]
#[cfg(feature = "testing-nccl")]
fn test_nccl_grouped_broadcast_multi_gpu() {
// Skip if < 2 GPUs available
let num_devices = cuda_device_count();
if num_devices < 2 {
println!(
"Skipping test: {} GPUs available, need at least 2",
num_devices
);
return;
}
// Use 2 GPUs for the test
let world_size = 2;
println!("Testing NCCL grouped broadcast with {} GPUs", world_size);
// Initialize all communicators at once
let comms = unsafe { init_all_comms(world_size) }.expect("Failed to init NCCL comms");
// Create CUDA contexts and streams
let contexts: Vec<Arc<CudaContext>> = (0..world_size)
.map(|i| CudaContext::new(i).expect("Failed to create CUDA context"))
.collect();
let streams: Vec<Arc<CudaStream>> = contexts
.iter()
.map(|ctx| ctx.new_stream().expect("Failed to create stream"))
.collect();
// Test multiple regions (simulating multiple blocks)
let num_regions = 4;
let region_size = 256 * 1024; // 256 KB per region
// Allocate `num_regions` zero-filled buffers per device
let buffers: Vec<Vec<CudaSlice<u8>>> = streams
.iter()
.map(|stream| {
(0..num_regions)
.map(|_| {
let zeros = vec![0u8; region_size];
stream.clone_htod(&zeros).expect("Failed to allocate")
})
.collect()
})
.collect();
// Fill rank 0's buffers with different patterns (0x11, 0x22, 0x33, 0x44)
for (region_idx, buffer) in buffers[0].iter().enumerate() {
let pattern = (region_idx + 1) as u8 * 0x11; // Different pattern per region
let host_data = vec![pattern; region_size];
// Stage the pattern in a temporary device buffer, then copy it into place
let src_buffer = streams[0]
.clone_htod(&host_data)
.expect("Failed to allocate src");
let src_ptr = get_device_ptr(&src_buffer, &streams[0]);
let dst_ptr = get_device_ptr(buffer, &streams[0]);
unsafe {
cudarc::driver::result::memcpy_dtod_async(
dst_ptr as u64,
src_ptr as u64,
region_size,
streams[0].cu_stream(),
)
.expect("dtod copy failed");
}
}
// Wait for all staging copies before the broadcast starts
streams[0].synchronize().expect("sync failed");
// Synchronization barrier
let barrier = Arc::new(Barrier::new(world_size));
// Collect buffer pointers for each rank (as usize for Send)
let buffer_ptrs: Vec<Vec<usize>> = buffers
.iter()
.zip(streams.iter())
.map(|(rank_buffers, stream)| {
rank_buffers
.iter()
.map(|b| get_device_ptr(b, stream))
.collect()
})
.collect();
// Spawn one thread per rank for the grouped broadcast
let handles: Vec<_> = (0..world_size)
.map(|rank| {
let comm = comms[rank]; // Already usize, which is Send
let stream = streams[rank].clone();
let ptrs = buffer_ptrs[rank].clone();
let barrier = barrier.clone();
thread::spawn(move || {
barrier.wait();
// Use NCCL group for multiple broadcasts (rank 0 is root for each)
unsafe {
check_nccl_result(ncclGroupStart()).expect("ncclGroupStart failed");
for ptr in &ptrs {
let result = ncclBcast(
*ptr as *mut std::ffi::c_void,
region_size,
ncclDataType_t::ncclChar,
0,
comm as ncclComm_t,
stream.cu_stream().cast(),
);
check_nccl_result(result).expect("ncclBcast failed");
}
check_nccl_result(ncclGroupEnd()).expect("ncclGroupEnd failed");
}
stream.synchronize().expect("Stream sync failed");
println!("Rank {} completed grouped broadcast", rank);
})
})
.collect();
for handle in handles {
handle.join().expect("Thread panicked");
}
// Verify all ranks have correct data
for (rank, (stream, rank_buffers)) in streams.iter().zip(buffers.iter()).enumerate() {
for (region_idx, buffer) in rank_buffers.iter().enumerate() {
// Same formula that was used to fill rank 0's regions
let expected_pattern = (region_idx + 1) as u8 * 0x11;
let host_data = stream
.clone_dtoh(buffer)
.expect("Failed to copy from device");
let mismatch_count = host_data.iter().filter(|&&b| b != expected_pattern).count();
assert_eq!(
mismatch_count, 0,
"Rank {} region {} has {} mismatched bytes (expected 0x{:02x})",
rank, region_idx, mismatch_count, expected_pattern
);
}
println!(
"Rank {} verified: all {} regions correct",
rank, num_regions
);
}
// Clean up (not reached if an assertion above panics)
unsafe { destroy_comms(&comms) };
println!("Grouped broadcast test passed!");
}
/// Broadcasts a single 64 MB buffer from rank 0 to rank 1, prints the elapsed
/// time and derived throughput, and checks every byte received on rank 1
/// against the `i % 256` ramp written on rank 0. Returns early without failing
/// when fewer than two CUDA devices are reported.
#[test]
#[cfg(feature = "testing-nccl")]
fn test_nccl_broadcast_large_transfer() {
// Skip if < 2 GPUs available
let num_devices = cuda_device_count();
if num_devices < 2 {
println!(
"Skipping test: {} GPUs available, need at least 2",
num_devices
);
return;
}
let world_size = 2;
println!("Testing NCCL large broadcast with {} GPUs", world_size);
let comms = unsafe { init_all_comms(world_size) }.expect("Failed to init NCCL comms");
let contexts: Vec<Arc<CudaContext>> = (0..world_size)
.map(|i| CudaContext::new(i).expect("Failed to create CUDA context"))
.collect();
let streams: Vec<Arc<CudaStream>> = contexts
.iter()
.map(|ctx| ctx.new_stream().expect("Failed to create stream"))
.collect();
// Large transfer: 64 MB (typical KV cache block size)
let test_size = 64 * 1024 * 1024;
println!("Transfer size: {} MB", test_size / (1024 * 1024));
// Allocate one zero-filled buffer per device
let buffers: Vec<CudaSlice<u8>> = streams
.iter()
.map(|stream| {
let zeros = vec![0u8; test_size];
stream.clone_htod(&zeros).expect("Failed to allocate")
})
.collect();
// Fill rank 0 with a deterministic repeating ramp: byte i holds i % 256
{
let host_data: Vec<u8> = (0..test_size).map(|i| (i % 256) as u8).collect();
let src_buffer = streams[0]
.clone_htod(&host_data)
.expect("Failed to copy to device 0");
let src_ptr = get_device_ptr(&src_buffer, &streams[0]);
let dst_ptr = get_device_ptr(&buffers[0], &streams[0]);
unsafe {
cudarc::driver::result::memcpy_dtod_async(
dst_ptr as u64,
src_ptr as u64,
test_size,
streams[0].cu_stream(),
)
.expect("dtod copy failed");
}
// Wait for the copy so the staging buffer can be dropped at the end of this scope
streams[0].synchronize().expect("sync failed");
}
// Get buffer pointers
let buffer_ptrs: Vec<usize> = buffers
.iter()
.zip(streams.iter())
.map(|(buf, stream)| get_device_ptr(buf, stream))
.collect();
let barrier = Arc::new(Barrier::new(world_size));
// The timer starts before the threads are spawned, so the reported time also
// covers thread start-up, the barrier wait, and the join below.
let start = std::time::Instant::now();
// Spawn threads for large transfer
let handles: Vec<_> = (0..world_size)
.map(|rank| {
let comm = comms[rank]; // Already usize, which is Send
let stream = streams[rank].clone();
let buffer_ptr = buffer_ptrs[rank];
let barrier = barrier.clone();
thread::spawn(move || {
barrier.wait();
let result = unsafe {
ncclBcast(
buffer_ptr as *mut std::ffi::c_void,
test_size,
ncclDataType_t::ncclChar,
0,
comm as ncclComm_t,
stream.cu_stream().cast(),
)
};
check_nccl_result(result).expect("ncclBcast failed");
stream.synchronize().expect("Stream sync failed");
})
})
.collect();
for handle in handles {
handle.join().expect("Thread panicked");
}
let elapsed = start.elapsed();
// Bytes are divided by 1024^3, so the printed figure is GiB per second
let throughput_gbps =
(test_size as f64 / (1024.0 * 1024.0 * 1024.0)) / elapsed.as_secs_f64();
println!(
"Transfer completed in {:?} ({:.2} GB/s)",
elapsed, throughput_gbps
);
// Verify data on rank 1
{
let host_data = streams[1]
.clone_dtoh(&buffers[1])
.expect("Failed to copy from device 1");
// Spot-check a few offsets first so a failure reports a specific index
let samples = [
0,
test_size / 4,
test_size / 2,
test_size * 3 / 4,
test_size - 1,
];
for &idx in &samples {
let expected = (idx % 256) as u8;
assert_eq!(
host_data[idx], expected,
"Mismatch at index {}: expected {}, got {}",
idx, expected, host_data[idx]
);
}
// Then verify every byte against the ramp pattern
let mismatch_count = host_data
.iter()
.enumerate()
.filter(|(i, b)| **b != (*i % 256) as u8)
.count();
assert_eq!(
mismatch_count, 0,
"Found {} mismatched bytes",
mismatch_count
);
}
// Clean up (not reached if an assertion above panics)
unsafe { destroy_comms(&comms) };
println!("Large transfer test passed!");
}
}
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Stub collective operations implementation.
//!
//! This module provides a no-op implementation of [`CollectiveOps`] for testing
//! and single-worker scenarios where no actual collective communication is needed.
use std::ops::Range;
use anyhow::Result;
use velo::EventManager;
use crate::BlockId;
use kvbm_common::LogicalLayoutHandle;
use kvbm_physical::transfer::TransferCompleteNotification;
use super::CollectiveOps;
/// Stub collective operations implementation.
///
/// This implementation completes immediately without actually performing any
/// collective communication. Use for testing or when collective operations
/// are not yet implemented (e.g., before NCCL integration).
///
/// # Warning
///
/// This stub does NOT perform actual data transfer. Using it in production
/// with `ReplicatedDataWorker` will result in incorrect behavior where
/// non-rank-0 workers have uninitialized data.
///
/// # Example
///
/// ```rust,ignore
/// use kvbm::v2::distributed::collectives::StubCollectiveOps;
///
/// let collective = StubCollectiveOps::new(events, 0, 1);
///
/// // Operations complete immediately without data transfer
/// let notification = collective.broadcast(
///     LogicalLayoutHandle::G1,
///     LogicalLayoutHandle::G1,
///     &src_block_ids,
///     &dst_block_ids,
///     None,
/// )?;
/// ```
pub struct StubCollectiveOps {
// Event system used to build the already-triggered completion notifications.
events: EventManager,
// Rank reported by `rank()`; only logged, never used for communication.
rank: usize,
// World size reported by `world_size()`; only logged, never used for communication.
world_size: usize,
}
impl StubCollectiveOps {
/// Create a new stub collective ops.
///
/// # Arguments
/// * `events` - The event system for creating completion notifications
/// * `rank` - The rank of this worker in the collective group
/// * `world_size` - The total number of workers in the collective group
pub fn new(events: EventManager, rank: usize, world_size: usize) -> Self {
Self {
events,
rank,
world_size,
}
}
/// Create a stub for single-worker scenarios (rank 0, world_size 1).
pub fn single_worker(events: EventManager) -> Self {
Self::new(events, 0, 1)
}
}
impl CollectiveOps for StubCollectiveOps {
    /// Pretend to broadcast: log a warning describing the request and return a
    /// notification that is already complete. No data is moved.
    ///
    /// # Errors
    /// Returns an error if the event system fails to create, trigger, or
    /// await the completion event.
    fn broadcast(
        &self,
        src: LogicalLayoutHandle,
        dst: LogicalLayoutHandle,
        src_block_ids: &[BlockId],
        dst_block_ids: &[BlockId],
        layer_range: Option<Range<usize>>,
    ) -> Result<TransferCompleteNotification> {
        tracing::warn!(
            rank = self.rank,
            world_size = self.world_size,
            ?src,
            ?dst,
            num_src_blocks = src_block_ids.len(),
            num_dst_blocks = dst_block_ids.len(),
            ?layer_range,
            "StubCollectiveOps::broadcast called - completing immediately without actual transfer"
        );

        // Fire the event before handing out its awaiter, so the notification
        // is complete from the moment the caller receives it.
        let completed = self.events.new_event()?;
        let completed_handle = completed.handle();
        completed.trigger()?;

        let awaiter = self.events.awaiter(completed_handle)?;
        let notification = TransferCompleteNotification::from_awaiter(awaiter);
        Ok(notification)
    }

    /// Rank this stub was constructed with.
    fn rank(&self) -> usize {
        self.rank
    }

    /// World size this stub was constructed with.
    fn world_size(&self) -> usize {
        self.world_size
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Block accessor for policy-based scanning.
//!
//! Provides a stateless interface for acquiring blocks from G2/G3 tiers.
//! Designed for use with custom scanning policies that control iteration
//! and can yield results incrementally.
use crate::{BlockId, G2, G3, SequenceHash};
use kvbm_common::LogicalLayoutHandle;
use kvbm_logical::blocks::ImmutableBlock;
use super::InstanceLeader;
/// A block from either G2 or G3 tier.
///
/// Wraps the tier-specific `ImmutableBlock` so callers can handle blocks from
/// both tiers through one type.
///
/// Provides RAII ownership - blocks are released when dropped.
#[derive(Debug)]
pub enum TieredBlock {
/// Block from G2 (host memory) tier.
G2(ImmutableBlock<G2>),
/// Block from G3 (disk) tier.
G3(ImmutableBlock<G3>),
}
impl TieredBlock {
    /// Logical layout handle naming the tier this block came from.
    pub fn tier(&self) -> LogicalLayoutHandle {
        match self {
            Self::G2(_) => LogicalLayoutHandle::G2,
            Self::G3(_) => LogicalLayoutHandle::G3,
        }
    }

    /// Sequence hash of the wrapped block.
    pub fn sequence_hash(&self) -> SequenceHash {
        match self {
            Self::G2(host_block) => host_block.sequence_hash(),
            Self::G3(disk_block) => disk_block.sequence_hash(),
        }
    }

    /// Block ID of the wrapped block.
    pub fn block_id(&self) -> BlockId {
        match self {
            Self::G2(host_block) => host_block.block_id(),
            Self::G3(disk_block) => disk_block.block_id(),
        }
    }

    /// Position within the sequence, taken from the sequence hash (for ordering).
    pub fn position(&self) -> u64 {
        let hash = self.sequence_hash();
        hash.position()
    }

    /// `true` when this block is from the G2 tier.
    pub fn is_g2(&self) -> bool {
        match self {
            Self::G2(_) => true,
            Self::G3(_) => false,
        }
    }

    /// `true` when this block is from the G3 tier.
    pub fn is_g3(&self) -> bool {
        match self {
            Self::G3(_) => true,
            Self::G2(_) => false,
        }
    }

    /// Consume `self`, returning the inner block if it is G2 and `None` otherwise.
    pub fn into_g2(self) -> Option<ImmutableBlock<G2>> {
        if let Self::G2(host_block) = self {
            Some(host_block)
        } else {
            None
        }
    }

    /// Consume `self`, returning the inner block if it is G3 and `None` otherwise.
    pub fn into_g3(self) -> Option<ImmutableBlock<G3>> {
        if let Self::G3(disk_block) = self {
            Some(disk_block)
        } else {
            None
        }
    }
}
/// Stateless accessor for block acquisition.
///
/// Each method call is independent - no locks are held between calls.
/// This enables parallel policy execution (e.g., with rayon).
///
/// # Thread Safety
///
/// `BlockAccessor` is `Send + Sync` because:
/// - It only holds a shared reference to `InstanceLeader`
/// - `InstanceLeader` contains `Arc<BlockManager<T>>` which is `Send + Sync`
/// - All operations use internal locking per call
/// - No mutable state is held between method calls
pub struct BlockAccessor<'a> {
// Leader whose G2 manager (and G3 manager, when present) is queried by `find`.
instance: &'a InstanceLeader,
// Value reported by `touch()`; `find` in this file does not read it.
touch: bool,
}
impl<'a> BlockAccessor<'a> {
/// Create a new accessor.
pub(crate) fn new(instance: &'a InstanceLeader, touch: bool) -> Self {
Self { instance, touch }
}
/// Find and take a block from G2 or G3.
///
/// Searches G2 first, then G3 if not found. The block is acquired/removed
/// from the pool - caller owns via RAII until dropped.
///
/// Returns `None` if the block is not found in either tier.
pub fn find(&self, hash: SequenceHash) -> Option<TieredBlock> {
// Try G2 first (match_blocks acquires the block)
let g2_matches = self.instance.g2_manager.match_blocks(&[hash]);
if let Some(block) = g2_matches.into_iter().next() {
return Some(TieredBlock::G2(block));
}
// Try G3 if available
if let Some(ref g3) = self.instance.g3_manager {
let g3_matches = g3.match_blocks(&[hash]);
if let Some(block) = g3_matches.into_iter().next() {
return Some(TieredBlock::G3(block));
}
}
None
}
/// Get the touch setting for this accessor.
///
/// When `true`, frequency tracking is updated on block access
/// (affects MultiLRU eviction priority).
pub fn touch(&self) -> bool {
self.touch
}
}
// SAFETY: BlockAccessor is Send + Sync because:
// - It only holds a shared reference to InstanceLeader
// - InstanceLeader contains Arc<BlockManager<T>> which is Send + Sync
// - All operations use internal locking per call (RwLock in InactivePool)
// - No mutable state is held between method calls
// NOTE(review): the fields are `&InstanceLeader` and `bool`, so these impls
// only matter if `InstanceLeader` is not `Sync` on its own — confirm that
// every field of `InstanceLeader` is really safe to share before relying on them.
unsafe impl Send for BlockAccessor<'_> {}
unsafe impl Sync for BlockAccessor<'_> {}
/// Context for policy execution with result collection.
///
/// Provides access to the `BlockAccessor` for block lookups, plus
/// `yield_item` / `yield_items` methods that append results to an internal
/// `Vec<T>`.
pub struct PolicyContext<'a, T> {
// Accessor the policy uses to look up blocks.
pub(crate) accessor: BlockAccessor<'a>,
// Items yielded so far, in the order they were yielded.
pub(crate) results: Vec<T>,
}
impl<'a, T> PolicyContext<'a, T> {
    /// Borrow the block accessor used for lookups.
    pub fn accessor(&self) -> &BlockAccessor<'a> {
        let Self { accessor, .. } = self;
        accessor
    }

    /// Append one result item.
    ///
    /// Items are collected and returned as a `Vec<T>` when the policy completes.
    pub fn yield_item(&mut self, item: T) {
        let collected = &mut self.results;
        collected.push(item);
    }

    /// Append several result items at once, preserving their iteration order.
    pub fn yield_items(&mut self, items: impl IntoIterator<Item = T>) {
        let collected = &mut self.results;
        collected.extend(items);
    }
}
// =============================================================================
// TODO: Parallel policy support via rayon::scope
//
// Requirements to enable:
// 1. Add `rayon` to Cargo.toml dependencies
// 2. Ensure BlockAccessor is truly Send+Sync (verify internal locking is correct)
// 3. Add feature flag `parallel` to gate this code
// 4. Test thread-safety of concurrent BlockManager::match_blocks calls
// 5. Benchmark to ensure parallel overhead is worth it (likely only for large hash sets)
//
// The design uses rayon::scope instead of par_chunks because:
// - par_chunks could split across logical boundaries (e.g., middle of a contiguous run)
// - rayon::scope lets the policy control parallelism granularity
// - Policy can identify natural split points (e.g., gaps in position sequence)
//
// use std::sync::Mutex;
// use rayon;
//
// /// Context for parallel policy execution.
// /// Provides thread-safe result collection via Mutex.
// pub struct ParallelPolicyContext<'a, 's, T> {
// pub(crate) accessor: &'a BlockAccessor<'a>,
// pub(crate) scope: &'s rayon::Scope<'s>,
// pub(crate) results: &'a Mutex<Vec<T>>,
// }
//
// impl<'a, 's, T: Send> ParallelPolicyContext<'a, 's, T> {
// /// Get access to the block accessor.
// pub fn accessor(&self) -> &BlockAccessor<'a> {
// self.accessor
// }
//
// /// Yield a result item (thread-safe).
// pub fn yield_item(&self, item: T) {
// self.results.lock().unwrap().push(item);
// }
//
// /// Yield multiple result items (thread-safe, single lock acquisition).
// pub fn yield_items(&self, items: impl IntoIterator<Item = T>) {
// self.results.lock().unwrap().extend(items);
// }
//
// /// Spawn parallel work within the rayon scope.
// ///
// /// The closure receives the accessor and results mutex, allowing it to
// /// perform lookups and yield items from a separate thread.
// ///
// /// # Example
// /// ```ignore
// /// ctx.spawn(|accessor, results| {
// /// for hash in my_segment {
// /// if let Some(block) = accessor.find(hash) {
// /// results.lock().unwrap().push(block);
// /// }
// /// }
// /// });
// /// ```
// pub fn spawn<F>(&self, f: F)
// where
// F: FnOnce(&BlockAccessor, &Mutex<Vec<T>>) + Send + 'a,
// {
// let accessor = self.accessor;
// let results = self.results;
// self.scope.spawn(move |_| {
// f(accessor, results);
// });
// }
// }
// =============================================================================
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap;
use ::velo::Messenger;
use anyhow::Result;
use dashmap::DashMap;
use tokio::sync::{Mutex, mpsc, watch};
use uuid::Uuid;
use std::sync::Arc;
use crate::{
BlockId, G2, G3, InstanceId, SequenceHash, object::ObjectBlockOps, worker::RemoteDescriptor,
};
use kvbm_common::LogicalLayoutHandle;
use kvbm_logical::{
blocks::{BlockRegistry, ImmutableBlock},
manager::BlockManager,
};
use kvbm_physical::transfer::{TransferCompleteNotification, TransferOptions};
use kvbm_physical::manager::{LayoutHandle, SerializedLayout};
use super::{
super::worker::Worker,
super::worker::group::{ParallelWorkers, SpmdParallelWorkers},
AsyncSessionResult,
FindMatchesOptions,
FindMatchesResult,
Leader,
OnboardingStatus,
ReadyResult,
// Legacy SessionHandle for deferred operations
SessionHandle as LegacySessionHandle,
SessionId,
StagingMode,
accessor::{BlockAccessor, PolicyContext},
session::{
BlockHolder, ControlRole, ControllableSessionOptions, ControllableSessionResult,
InitiatorSession, MessageTransport, OnboardMessage, OnboardSessionTx, ResponderSession,
ServerSession, ServerSessionHandle, ServerSessionOptions, SessionHandle, SessionMessage,
SessionMessageTx, SessionPhase, create_server_session, session_handle_state_channel,
session_message_channel,
},
velo::{ExportMetadataCallback, VeloLeaderService},
};
/// Primary leader implementation for the distributed KVBM system.
///
/// `InstanceLeader` coordinates block onboarding across local and remote
/// instances. It owns a G2 (host memory) `BlockManager` and an optional G3
/// (disk) `BlockManager`, a set of workers for executing physical transfers,
/// and a parallel worker abstraction for multi-rank RDMA operations.
///
/// Key responsibilities:
/// - **Block matching**: finding which requested sequence hashes are already
///   cached locally (via `BlockAccessor` policies).
/// - **Session management**: creating, attaching, and driving onboard sessions
///   between endpoint (source) and controller (destination) roles.
/// - **Remote connectivity**: exchanging serialized layout metadata with peer
///   instances so workers can perform RDMA transfers.
/// - **Velo RPC**: registering handlers via `VeloLeaderService` so remote
///   leaders can initiate sessions and exchange metadata.
///
/// The type derives `Clone`; the shared-state fields are `Arc`-wrapped, so
/// clones refer to the same managers, session maps, and transport.
#[derive(Clone)]
pub struct InstanceLeader {
/// Velo `Messenger` used for distributed communication.
messenger: Arc<Messenger>,
/// Block registry for deduplication.
#[allow(dead_code)]
pub(crate) registry: BlockRegistry,
/// G2 (host memory) block manager (wrapped in Arc since BlockManager doesn't implement Clone).
pub(crate) g2_manager: Arc<BlockManager<G2>>,
/// Optional G3 (disk) block manager
pub(crate) g3_manager: Option<Arc<BlockManager<G3>>>,
/// Workers for executing transfers (at least 1 required).
/// Multiple workers enable parallel transfers and redundancy.
workers: Vec<Arc<dyn Worker>>,
/// Parallel worker abstraction wrapping the workers.
/// Used for RDMA transfers with proper handle mapping storage.
parallel_worker: Option<Arc<dyn ParallelWorkers>>,
/// Map of active sessions (session_id -> message channel).
sessions: Arc<DashMap<SessionId, OnboardSessionTx>>,
/// Cached worker metadata (avoids querying workers repeatedly).
cached_worker_metadata: Option<Vec<SerializedLayout>>,
/// Map of session states for holding blocks alive (RAII).
session_states: Arc<DashMap<SessionId, SessionState>>,
/// List of remote leader instance IDs (mutable for post-construction configuration).
remote_leaders: Arc<std::sync::RwLock<Vec<InstanceId>>>,
/// Message transport for session communication.
transport: Arc<MessageTransport>,
// ========================================================================
// Unified Session Protocol
// ========================================================================
/// Map of session id -> `SessionMessageTx` for the unified session protocol.
/// Used by SessionHandle/SessionEndpoint/ControllableSession.
// NOTE(review): the value type is the `Tx` side, so this presumably holds the
// sender that delivers messages to each session's receiver — confirm.
session_sessions: Arc<DashMap<SessionId, SessionMessageTx>>,
// ========================================================================
// G4/Object Storage
// ========================================================================
/// Object storage client for G4 search and load operations.
/// Leader calls has_blocks on S3 directly, coordinates workers for get_blocks.
object_client: Option<Arc<dyn ObjectBlockOps>>,
}
/// Builder for InstanceLeader.
///
/// Derives `Default`, so every field starts unset (`None` / empty `Vec`) and
/// is filled in through the builder's setter methods before `build` is called.
#[derive(Default)]
pub struct InstanceLeaderBuilder {
// Messenger for distributed communication; `build` fails when it is unset.
messenger: Option<Arc<Messenger>>,
// Block registry to hand to the leader.
registry: Option<BlockRegistry>,
// G2 (host memory) block manager.
g2_manager: Option<Arc<BlockManager<G2>>>,
// Optional G3 (disk) block manager.
g3_manager: Option<Arc<BlockManager<G3>>>,
// Workers that will execute transfers.
workers: Vec<Arc<dyn Worker>>,
// Session map to use for the leader, when supplied.
sessions: Option<Arc<DashMap<SessionId, OnboardSessionTx>>>,
// Instance IDs of remote leaders.
remote_leaders: Option<Vec<InstanceId>>,
// Worker layout metadata supplied upfront via `with_cached_worker_metadata`.
cached_worker_metadata: Option<Vec<SerializedLayout>>,
// Object storage client for G4 operations.
object_client: Option<Arc<dyn ObjectBlockOps>>,
}
impl InstanceLeaderBuilder {
/// Initialize builder with components from KvbmRuntime.
///
/// Clones the runtime's messenger into the builder. Use this when the
/// runtime has already been constructed and the leader should share the same
/// messenger for distributed communication.
///
/// # Example
/// ```ignore
/// let runtime = KvbmRuntime::from_env_leader().await?;
/// let leader = InstanceLeaderBuilder::default()
///     .with_runtime(&runtime)
///     .g2_manager(g2_manager)
///     .build()?;
/// ```
pub fn with_runtime(self, runtime: &crate::KvbmRuntime) -> Self {
    let shared_messenger = runtime.messenger().clone();
    self.messenger(shared_messenger)
}
pub fn messenger(mut self, messenger: Arc<Messenger>) -> Self {
self.messenger = Some(messenger);
self
}
pub fn registry(mut self, registry: BlockRegistry) -> Self {
self.registry = Some(registry);
self
}
pub fn with_g2_manager(mut self, manager: Option<BlockManager<G2>>) -> Self {
self.g2_manager = manager.map(Arc::new);
self
}
pub fn with_g3_manager(mut self, manager: Option<BlockManager<G3>>) -> Self {
self.g3_manager = manager.map(Arc::new);
self
}
pub fn g2_manager(mut self, manager: Arc<BlockManager<G2>>) -> Self {
self.g2_manager = Some(manager);
self
}
pub fn g3_manager(mut self, manager: Arc<BlockManager<G3>>) -> Self {
self.g3_manager = Some(manager);
self
}
/// Add a single worker (convenience method).
pub fn worker(mut self, worker: Arc<dyn Worker>) -> Self {
self.workers.push(worker);
self
}
/// Set all workers at once.
pub fn workers(mut self, workers: Vec<Arc<dyn Worker>>) -> Self {
self.workers = workers;
self
}
pub fn remote_leaders(mut self, leaders: Vec<InstanceId>) -> Self {
self.remote_leaders = Some(leaders);
self
}
/// Cache worker metadata upfront to avoid querying workers later.
///
/// This is useful when workers have already exported metadata during initialization
/// (e.g., in the connector pattern where workers return metadata in their init response).
pub fn with_cached_worker_metadata(mut self, metadata: Vec<SerializedLayout>) -> Self {
self.cached_worker_metadata = Some(metadata);
self
}
/// Set the object storage client for G4 search and load operations.
///
/// The leader uses this client to:
/// - Query S3 for block presence via `has_blocks`
/// - Coordinate workers to load blocks from S3 via `get_blocks`
pub fn object_client(mut self, client: Arc<dyn ObjectBlockOps>) -> Self {
self.object_client = Some(client);
self
}
pub fn build(self) -> Result<InstanceLeader> {
let messenger = self
.messenger
.ok_or_else(|| anyhow::anyhow!("Nova instance required"))?;
let transport = Arc::new(MessageTransport::velo(messenger.clone()));
// Create event system for notification aggregation
let events = Arc::new(messenger.event_manager());
// Get current tokio runtime handle
let runtime = tokio::runtime::Handle::current();
// // Validate at least one worker
// if self.workers.is_empty() {
// anyhow::bail!("At least one worker required");
// }
// todo: we will need a common builder pattern for creating "general" parallel workers
// - we could also use an enum and match as the number of types will be limited
// Create parallel worker if workers are provided
let parallel_worker: Option<Arc<dyn ParallelWorkers>> = if !self.workers.is_empty() {
Some(Arc::new(SpmdParallelWorkers::new(
self.workers.to_vec(),
events.clone(),
runtime.clone(),
)))
} else {
None
};
Ok(InstanceLeader {
messenger,
registry: self
.registry
.ok_or_else(|| anyhow::anyhow!("block registry required"))?,
g2_manager: self
.g2_manager
.ok_or_else(|| anyhow::anyhow!("g2_manager required"))?,
g3_manager: self.g3_manager,
workers: self.workers,
parallel_worker,
cached_worker_metadata: self.cached_worker_metadata,
sessions: self.sessions.unwrap_or_else(|| Arc::new(DashMap::new())),
session_states: Arc::new(DashMap::new()),
remote_leaders: Arc::new(std::sync::RwLock::new(
self.remote_leaders.unwrap_or_default(),
)),
transport,
session_sessions: Arc::new(DashMap::new()),
object_client: self.object_client,
})
}
}
/// Internal session state for holding matched blocks.
///
/// Instances are stored in the leader's `session_states` map keyed by
/// `session_id`; the matched blocks stay owned by this value until the entry
/// is removed.
#[allow(dead_code)] // Used for RAII block lifetime management
struct SessionState {
/// Session this state belongs to; also the key under which it is stored.
session_id: SessionId,
/// Blocks matched in G2 for this session.
matched_g2_blocks: Vec<ImmutableBlock<G2>>,
/// Blocks matched in G3 for this session.
matched_g3_blocks: Vec<ImmutableBlock<G3>>,
/// Sender half of the onboarding status watch channel.
status_tx: watch::Sender<OnboardingStatus>,
}
/// Result of scanning for blocks across tiers.
///
/// Unlike `FindMatchesResult`, this scans all given hashes without stopping on first miss.
/// Returns blocks found in each tier along with their sorted positions.
///
/// The maps own the matched blocks, so the blocks remain held for as long as
/// this value (or the extracted maps) is alive.
pub struct ScanBlocksResult {
/// Blocks found in G2 (host memory).
pub g2_blocks: HashMap<SequenceHash, ImmutableBlock<G2>>,
/// Blocks found in G3 (disk).
pub g3_blocks: HashMap<SequenceHash, ImmutableBlock<G3>>,
/// All found blocks sorted by position (lowest to highest).
/// Each entry indicates which tier (G2/G3) the block was found in.
/// Look the hash up in the map for that tier to obtain the block itself.
pub sorted_matches: Vec<(SequenceHash, LogicalLayoutHandle)>,
}
impl InstanceLeader {
/// Get a reference to the G2 BlockManager.
///
/// The returned `Arc` is borrowed from the leader; clone it to keep the
/// manager beyond the borrow of `self`.
pub fn g2_manager(&self) -> &Arc<BlockManager<G2>> {
&self.g2_manager
}
/// Get a reference to the optional G3 BlockManager.
///
/// Returns `None` when the leader was built without a G3 manager.
pub fn g3_manager(&self) -> Option<&Arc<BlockManager<G3>>> {
self.g3_manager.as_ref()
}
/// Get the block registry.
///
/// The registry is returned by reference; nothing is cloned.
pub fn registry(&self) -> &BlockRegistry {
&self.registry
}
/// Get a reference to the messenger (the component these docs also call
/// the Nova instance).
///
/// This provides access to the distributed messaging system for features
/// like event coordination and cross-instance communication.
pub fn messenger(&self) -> &Arc<Messenger> {
&self.messenger
}
/// Get the tokio runtime handle exposed by the messenger.
///
/// This handle should be used for spawning background tasks that need to
/// run on the KVBM runtime's executor (e.g., offload engine pipelines).
/// A clone of the handle is returned on every call.
pub fn runtime(&self) -> tokio::runtime::Handle {
self.messenger.runtime().clone()
}
/// Check if a parallel_worker is configured.
///
/// The parallel_worker is required for local transfer operations
/// (e.g., offloading blocks between tiers).
///
/// Returns `true` exactly when `parallel_worker()` would return `Some`.
pub fn has_parallel_worker(&self) -> bool {
self.parallel_worker.is_some()
}
/// Shared handle to the parallel worker used for distributed operations.
///
/// The parallel worker fans out operations to all workers and aggregates results.
/// It implements `ObjectBlockOps` for coordinated object storage uploads.
///
/// Returns `None` when no parallel worker is configured; otherwise a new
/// `Arc` pointing at the same worker.
pub fn parallel_worker(&self) -> Option<Arc<dyn ParallelWorkers>> {
    self.parallel_worker.as_ref().map(Arc::clone)
}
/// Shared handle to the object storage client for G4 operations.
///
/// Returns `Some` if object storage is configured, `None` otherwise.
/// The client is used by InitiatorSession for G4 parallel search.
pub fn object_client(&self) -> Option<Arc<dyn ObjectBlockOps>> {
    self.object_client.as_ref().map(Arc::clone)
}
/// Register one more remote leader to be searched.
///
/// Remote leaders are queried during `find_matches_with_options` when
/// `search_remote == true`. This method allows adding remote leaders
/// after construction (e.g., when instance IDs are only known after
/// cluster setup).
///
/// An ID that is already present is not added a second time.
pub fn add_remote_leader(&self, instance_id: InstanceId) {
    let mut leaders = self.remote_leaders.write().unwrap();
    let already_known = leaders.iter().any(|known| *known == instance_id);
    if already_known {
        return;
    }
    leaders.push(instance_id);
}
/// Replace the whole remote leader list with `instance_ids`.
pub fn set_remote_leaders(&self, instance_ids: Vec<InstanceId>) {
    *self.remote_leaders.write().unwrap() = instance_ids;
}
/// Snapshot of the remote leader instance IDs.
///
/// The returned vector is a copy; later changes to the leader's list are
/// not reflected in it.
pub fn remote_leaders(&self) -> Vec<InstanceId> {
    let leaders = self.remote_leaders.read().unwrap();
    leaders.to_vec()
}
/// Scan for all blocks matching any of the given sequence hashes.
///
/// Unlike `find_matches`, this:
/// - Does NOT stop on first miss
/// - Returns blocks from both G2 and G3 tiers separately
/// - Acquires blocks from pools (caller owns until dropped via RAII)
/// - Returns `sorted_matches` ordered by `SequenceHash::position()`
///
/// # Arguments
/// * `sequence_hashes` - Hashes to scan for
/// * `touch` - Whether to update frequency tracking (for MultiLRU eviction policy)
///
/// # Algorithm
/// 1. Scan G2 manager for candidates
/// 2. Scan G3 manager for remaining candidates
/// 3. Build sorted_matches from both, sorted by position (lowest to highest)
pub fn scan_blocks(&self, sequence_hashes: &[SequenceHash], touch: bool) -> ScanBlocksResult {
// Step 1: Scan G2 for all candidates
let g2_blocks = self.g2_manager.scan_matches(sequence_hashes, touch);
// Step 2: Find remaining hashes not in G2
let remaining: Vec<SequenceHash> = sequence_hashes
.iter()
.filter(|h| !g2_blocks.contains_key(h))
.copied()
.collect();
// Step 3: Scan G3 for remaining (if G3 exists)
let g3_blocks = if let Some(ref g3_manager) = self.g3_manager {
if !remaining.is_empty() {
g3_manager.scan_matches(&remaining, touch)
} else {
HashMap::new()
}
} else {
HashMap::new()
};
// Step 4: Build sorted_matches from both tiers
let mut sorted_matches: Vec<(SequenceHash, LogicalLayoutHandle)> =
Vec::with_capacity(g2_blocks.len() + g3_blocks.len());
// Add G2 matches
for hash in g2_blocks.keys() {
sorted_matches.push((*hash, LogicalLayoutHandle::G2));
}
// Add G3 matches
for hash in g3_blocks.keys() {
sorted_matches.push((*hash, LogicalLayoutHandle::G3));
}
// Sort by SequenceHash position (lowest to highest)
sorted_matches.sort_by_key(|(hash, _)| hash.position());
ScanBlocksResult {
g2_blocks,
g3_blocks,
sorted_matches,
}
}
/// Run a caller-supplied scanning policy and collect whatever it yields.
///
/// This provides maximum flexibility for implementing custom scanning strategies.
/// The policy receives access to a `BlockAccessor` for acquiring blocks and a
/// `PolicyContext` for yielding results incrementally.
///
/// # Arguments
/// * `hashes` - Sequence hashes to scan
/// * `touch` - Whether to update frequency tracking on block access
/// * `policy` - Function that implements the scanning strategy
///
/// # Returns
/// The items the policy yielded through the context, in the context's
/// `results` vector.
///
/// # Design
///
/// The accessor does NOT hold locks between calls. Each `.find()` call is
/// independent. This enables:
/// - Custom iteration patterns (sorted, BTree scan, binary search, etc.)
/// - Yielding results incrementally (e.g., contiguous subsequences)
/// - Future parallel execution (accessor is Send + Sync)
///
/// # Example: Simple linear scan
/// ```ignore
/// let blocks = leader.scan_with_policy(&hashes, true, |hashes, ctx| {
///     for hash in hashes {
///         if let Some(block) = ctx.accessor().find(*hash) {
///             ctx.yield_item(block);
///         }
///     }
/// });
/// ```
///
/// # Example: Find contiguous subsequences
/// ```ignore
/// let runs: Vec<Vec<TieredBlock>> = leader.scan_with_policy(&hashes, true, |hashes, ctx| {
///     let mut run = Vec::new();
///     let mut last_pos: Option<u64> = None;
///
///     for hash in hashes.iter().sorted_by_key(|h| h.position()) {
///         if let Some(block) = ctx.accessor().find(*hash) {
///             let pos = block.position();
///             if last_pos.map_or(true, |p| pos == p + 1) {
///                 run.push(block);
///             } else {
///                 if !run.is_empty() { ctx.yield_item(std::mem::take(&mut run)); }
///                 run.push(block);
///             }
///             last_pos = Some(pos);
///         } else if !run.is_empty() {
///             ctx.yield_item(std::mem::take(&mut run));
///             last_pos = None;
///         }
///     }
///     if !run.is_empty() { ctx.yield_item(run); }
/// });
/// ```
pub fn scan_with_policy<F, T>(&self, hashes: &[SequenceHash], touch: bool, policy: F) -> Vec<T>
where
    F: FnOnce(&[SequenceHash], &mut PolicyContext<T>),
{
    let mut context = PolicyContext {
        accessor: BlockAccessor::new(self, touch),
        results: Vec::new(),
    };
    // The policy drives the whole scan; this function only hands back its output.
    policy(hashes, &mut context);
    let PolicyContext { results, .. } = context;
    results
}
pub fn builder() -> InstanceLeaderBuilder {
InstanceLeaderBuilder::default()
}
/// Register Nova handlers for leader-to-leader communication.
///
/// This must be called after construction to enable distributed onboarding.
///
/// Two callbacks are handed to the `VeloLeaderService`:
/// - a responder spawner, which starts a `ResponderSession` task for each
///   incoming `OnboardMessage::CreateSession`
/// - an optional metadata exporter, installed only when the leader has
///   workers or cached worker metadata
///
/// # Errors
/// Propagates any error from `VeloLeaderService::register_handlers`.
pub fn register_handlers(&self) -> Result<()> {
    let instance_id = self.messenger.instance_id();
    let g2_manager = self.g2_manager.clone();
    let g3_manager = self.g3_manager.clone();
    let parallel_worker = self.parallel_worker.clone();
    let transport = self.transport.clone();
    let sessions = self.sessions.clone();
    let spawn_responder = move |msg: OnboardMessage| -> Result<()> {
        if let OnboardMessage::CreateSession {
            requester,
            session_id,
            sequence_hashes,
        } = msg
        {
            let (tx, rx) = mpsc::channel(100);
            sessions.insert(session_id, tx);
            let session = ResponderSession::new(
                session_id,
                instance_id,
                requester,
                g2_manager.clone(),
                g3_manager.clone(),
                parallel_worker.clone(),
                transport.clone(),
            );
            let sessions_for_task = sessions.clone();
            tokio::spawn(async move {
                if let Err(e) = session.run(rx, sequence_hashes).await {
                    tracing::warn!(error = %e, "ResponderSession error");
                }
                // The receiver is gone once `run` returns, so the sender kept in
                // the map can no longer deliver anything. Drop the entry so
                // finished responder sessions do not accumulate.
                sessions_for_task.remove(&session_id);
            });
            Ok(())
        } else {
            anyhow::bail!("spawn_responder called with non-CreateSession message")
        }
    };
    // Create export_metadata callback if we have workers or cached metadata
    let export_metadata_callback: Option<ExportMetadataCallback> =
        if !self.workers.is_empty() || self.cached_worker_metadata.is_some() {
            let workers = self.workers.clone();
            let cached_metadata = self.cached_worker_metadata.clone();
            Some(Arc::new(move || {
                // Each invocation gets its own copies so the returned future
                // owns everything it needs.
                let workers = workers.clone();
                let cached_metadata = cached_metadata.clone();
                Box::pin(async move {
                    // Return cached metadata if available
                    if let Some(cached) = cached_metadata {
                        return Ok(cached);
                    }
                    // Otherwise, query workers one after another
                    let mut metadata = Vec::with_capacity(workers.len());
                    for worker in &workers {
                        let serialized = worker.export_metadata()?.await?;
                        metadata.push(serialized);
                    }
                    Ok(metadata)
                })
            }))
        } else {
            None
        };
    let mut service = VeloLeaderService::new(self.messenger.clone(), self.sessions.clone())
        .with_spawn_responder(spawn_responder)
        .with_session_sessions(self.session_sessions.clone());
    if let Some(callback) = export_metadata_callback {
        service = service.with_export_metadata(callback);
    }
    service.register_handlers()?;
    Ok(())
}
/// Record a session's state (held blocks and status channel) under its ID.
///
/// Blocks are kept alive via RAII until the session is removed from storage.
fn store_session_state(&self, state: SessionState) {
    let key = state.session_id;
    self.session_states.insert(key, state);
}
/// Forget everything the leader tracks for `session_id`, dropping any held blocks.
///
/// This is optional - sessions will naturally be cleaned up when the InstanceLeader
/// is dropped. Call this explicitly if you need to release blocks earlier.
///
/// Removes the entry from the session-state map and from both session
/// channel maps; IDs that are not present are ignored.
pub fn release_session(&self, session_id: SessionId) {
    drop(self.session_states.remove(&session_id));
    drop(self.sessions.remove(&session_id));
    drop(self.session_sessions.remove(&session_id));
}
// ========================================================================
// Inverted Control Pattern (Prefill-Decode) Methods
// ========================================================================
/// Create a controllable session for local blocks using default options.
///
/// This is the "Decode side" of the inverted control pattern:
/// 1. Search local G2 and G3 for matches
/// 2. Create a ControllableSession that holds the blocks
/// 3. Return session_id to be sent to Prefill out-of-band
///
/// By default, G3→G2 staging starts immediately (auto_stage=true).
///
/// Equivalent to calling `create_controllable_session_with_options` with
/// `ControllableSessionOptions::default()`.
pub fn create_controllable_session(
    &self,
    sequence_hashes: &[SequenceHash],
) -> Result<ControllableSessionResult> {
    let default_options = ControllableSessionOptions::default();
    self.create_controllable_session_with_options(sequence_hashes, default_options)
}
/// Create a controllable session with custom options.
///
/// Use this when you need to control auto-staging behavior.
///
/// Searches G2 for `sequence_hashes`, then G3 (when configured) for the
/// hashes G2 did not match. The matched blocks are handed to a
/// `ServerSession` that runs on a spawned task; the session removes itself
/// from the session map when it finishes.
///
/// # Returns
/// The new session ID together with the number of blocks matched in G2 and G3.
pub fn create_controllable_session_with_options(
    &self,
    sequence_hashes: &[SequenceHash],
    options: ControllableSessionOptions,
) -> Result<ControllableSessionResult> {
    let session_id = SessionId::from(Uuid::new_v4());
    // Local search only
    let matched_g2_blocks = self.g2_manager.match_blocks(sequence_hashes);
    // Index the G2 hits once so the filter below is a set lookup per hash
    // instead of a scan over every matched block.
    let g2_hits: std::collections::HashSet<SequenceHash> = matched_g2_blocks
        .iter()
        .map(|block| block.sequence_hash())
        .collect();
    // Find remaining hashes not in G2
    let remaining_hashes: Vec<SequenceHash> = sequence_hashes
        .iter()
        .filter(|hash| !g2_hits.contains(*hash))
        .copied()
        .collect();
    // Search G3 for remaining hashes
    let matched_g3_blocks = if let Some(ref g3_manager) = self.g3_manager {
        g3_manager.match_blocks(&remaining_hashes)
    } else {
        Vec::new()
    };
    let local_g2_count = matched_g2_blocks.len();
    let local_g3_count = matched_g3_blocks.len();
    // Create session channel using unified SessionMessage protocol
    let (tx, rx) = session_message_channel(100);
    self.session_sessions.insert(session_id, tx);
    // Collect G2 layout handles from workers for round-robin block allocation
    let worker_g2_handles: Vec<LayoutHandle> = self
        .parallel_worker
        .as_ref()
        .map(|pw| pw.workers().iter().filter_map(|w| w.g2_handle()).collect())
        .unwrap_or_default();
    let endpoint = super::session::SessionEndpoint::new(
        session_id,
        self.messenger.instance_id(),
        self.transport.clone(),
        rx,
    );
    let (cmd_tx, cmd_rx) = mpsc::channel(16);
    let session = ServerSession::new_with_staging(
        endpoint,
        BlockHolder::new(matched_g2_blocks),
        BlockHolder::new(matched_g3_blocks),
        worker_g2_handles,
        self.g2_manager.clone(),
        self.parallel_worker.clone(),
        cmd_rx,
        ServerSessionOptions {
            auto_stage: options.auto_stage,
        },
    );
    // Keep handle alive to prevent cmd channel from closing
    let handle = ServerSessionHandle::new(session_id, self.messenger.instance_id(), cmd_tx);
    // Spawn session task
    let session_sessions = self.session_sessions.clone();
    tokio::spawn(async move {
        let _handle = handle; // move handle into task to keep cmd channel open
        if let Err(e) = session.run().await {
            tracing::warn!(error = %e, "ServerSession error");
        }
        // Clean up when session completes
        session_sessions.remove(&session_id);
    });
    Ok(ControllableSessionResult {
        session_id,
        local_g2_count,
        local_g3_count,
    })
}
// ========================================================================
// Unified Session Protocol
// ========================================================================
/// Attach to a remote session.
/// Returns a `SessionHandle` that uses `SessionMessage` for communication.
///
/// A local message channel is registered for `session_id` and a receiver
/// task is spawned to feed incoming messages into the handle's state. An
/// `Attach` message (as `ControlRole::Controller`) is then sent to the
/// remote instance.
///
/// # Arguments
/// * `remote_instance` - The instance hosting the session
/// * `session_id` - The session to attach to
///
/// # Errors
/// Returns the transport error when the attach message cannot be sent. In
/// that case the local registration is removed again, which also lets the
/// receiver task finish.
///
/// # Example
/// ```ignore
/// let handle = leader.attach_session(remote_id, session_id).await?;
/// let state = handle.wait_for_ready().await?;
/// handle.trigger_staging().await?;
/// ```
pub async fn attach_session(
    &self,
    remote_instance: InstanceId,
    session_id: SessionId,
) -> Result<SessionHandle> {
    // Create local channel for receiving state updates
    let (state_tx, state_rx) = session_handle_state_channel();
    // Register handler for this session's messages
    let (msg_tx, msg_rx) = session_message_channel(100);
    self.session_sessions.insert(session_id, msg_tx);
    // Spawn receiver task to update state
    tokio::spawn(Self::run_session_receiver(msg_rx, state_tx));
    // Send attach message using new protocol
    let msg = SessionMessage::Attach {
        peer: self.messenger.instance_id(),
        session_id,
        as_role: ControlRole::Controller,
    };
    if let Err(e) = self.transport.send_session(remote_instance, msg).await {
        // Undo the registration: without this the sender would stay in the
        // map and the receiver task would wait on it indefinitely.
        self.session_sessions.remove(&session_id);
        return Err(e.into());
    }
    let mut handle = SessionHandle::new(
        session_id,
        remote_instance,
        self.messenger.instance_id(),
        self.transport.clone(),
        state_rx,
    );
    // Add RDMA support if parallel worker is configured
    if let Some(parallel_worker) = &self.parallel_worker {
        handle = handle.with_rdma_support(parallel_worker.clone());
    }
    Ok(handle)
}
// ========================================================================
// Endpoint Session Creation (Server-Side)
// ========================================================================
/// Create an endpoint session that a remote peer can attach to.
///
/// This searches local G2 for blocks matching the given sequence hashes
/// and creates a session that exposes them for remote RDMA pull.
///
/// Returns `(session_id, handle)` where:
/// - `session_id` - Send to remote peer for attachment
/// - `handle` - Use to control the session (send layer notifications, close)
///
/// # Errors
/// Fails when no worker provides a G2 layout handle (including when no
/// parallel worker is configured).
///
/// # Example
/// ```ignore
/// // Create session for sequence hashes
/// let (session_id, handle) = leader.create_endpoint_session(&hashes)?;
///
/// // Send session_id to remote peer out-of-band
/// // Remote attaches via: remote_leader.attach_session(local_id, session_id)
///
/// // For layerwise transfer, notify when layers are ready
/// handle.notify_layers_ready(0..1).await?;
/// ```
pub fn create_endpoint_session(
    &self,
    sequence_hashes: &[SequenceHash],
) -> Result<(SessionId, ServerSessionHandle)> {
    let session_id = SessionId::from(uuid::Uuid::new_v4());

    // Local search
    let g2_hits = self.g2_manager.match_blocks(sequence_hashes);

    // Collect layout handles from workers
    // Note: For single-worker setups, all blocks use the same handle
    // For multi-worker (SPMD), each block gets the handle from its assigned worker
    let handles_by_worker: Vec<LayoutHandle> = match self.parallel_worker.as_ref() {
        Some(pw) => pw.workers().iter().filter_map(|w| w.g2_handle()).collect(),
        None => Vec::default(),
    };

    // For now, use the first worker's handle for all blocks (single-worker assumption)
    // TODO: For SPMD, map blocks to worker handles based on block assignment
    let Some(shared_handle) = handles_by_worker.first().copied() else {
        return Err(anyhow::anyhow!(
            "No G2 layout handle available from workers"
        ));
    };
    let layout_handles: Vec<LayoutHandle> = vec![shared_handle; g2_hits.len()];

    // Sequence hashes of the blocks that actually matched
    let matched_hashes: Vec<SequenceHash> =
        g2_hits.iter().map(|block| block.sequence_hash()).collect();

    // Register the session's message channel
    let (msg_tx, msg_rx) = session_message_channel(100);
    self.session_sessions.insert(session_id, msg_tx);

    // Build the session around a holder that owns the matched blocks
    let (session, handle) = create_server_session(
        session_id,
        self.messenger.instance_id(),
        BlockHolder::new(g2_hits),
        layout_handles,
        matched_hashes,
        self.transport.clone(),
        msg_rx,
    );

    // Run the session in the background and unregister it once it is done
    let registered_sessions = self.session_sessions.clone();
    tokio::spawn(async move {
        if let Err(e) = session.run().await {
            tracing::warn!(error = %e, "ServerSession error");
        }
        registered_sessions.remove(&session_id);
    });

    Ok((session_id, handle))
}
/// Create an endpoint session for specific pre-allocated blocks.
///
/// Unlike `create_endpoint_session`, this doesn't search - it uses the
/// provided blocks directly. Useful when the caller already has blocks
/// to expose (e.g., after prefill computation).
///
/// # Arguments
/// * `blocks` - Blocks to expose for RDMA pull
/// * `sequence_hashes` - Sequence hashes for the blocks (must match block count)
/// * `layout_handles` - Layout handles for the blocks (must match block count)
///
/// # Example
/// ```ignore
/// // After prefill computation, expose blocks for Decode to pull
/// let (session_id, handle) = leader.create_endpoint_session_for_blocks(
/// prefill_blocks,
/// &hashes,
/// &layout_handles,
/// )?;
/// ```
pub fn create_endpoint_session_for_blocks(
&self,
blocks: BlockHolder<G2>,
sequence_hashes: &[SequenceHash],
layout_handles: &[LayoutHandle],
) -> Result<(SessionId, ServerSessionHandle)> {
let session_id = SessionId::from(uuid::Uuid::new_v4());
// Create the session channel
let (msg_tx, msg_rx) = session_message_channel(100);
self.session_sessions.insert(session_id, msg_tx);
// Create the session and handle
let (session, handle) = create_server_session(
session_id,
self.messenger.instance_id(),
blocks,
layout_handles.to_vec(),
sequence_hashes.to_vec(),
self.transport.clone(),
msg_rx,
);
// Spawn the session task
let session_sessions = self.session_sessions.clone();
tokio::spawn(async move {
if let Err(e) = session.run().await {
tracing::warn!(error = %e, "ServerSession error");
}
// Clean up when session completes
session_sessions.remove(&session_id);
});
Ok((session_id, handle))
}
/// Internal: pump incoming `SessionMessage`s for one session into its handle state.
///
/// Runs until the channel closes, or until an `Error` or `Close` message
/// arrives. Message handling:
/// - `StateResponse` replaces the tracked state
/// - `BlocksStaged` records newly staged blocks
/// - `Error` logs a warning and marks the session failed (terminal)
/// - `Close` moves the session to `SessionPhase::Complete` (terminal)
/// - anything else is ignored
async fn run_session_receiver(
    mut rx: mpsc::Receiver<SessionMessage>,
    state_tx: super::session::SessionHandleStateTx,
) {
    loop {
        let Some(incoming) = rx.recv().await else {
            // Every sender is gone; nothing more can arrive.
            return;
        };
        let terminal = match incoming {
            SessionMessage::StateResponse { state, .. } => {
                state_tx.update(state);
                false
            }
            SessionMessage::BlocksStaged {
                staged_blocks,
                remaining,
                layer_range,
                ..
            } => {
                state_tx.add_staged_blocks(staged_blocks, remaining, layer_range);
                false
            }
            SessionMessage::Error { message, .. } => {
                tracing::warn!(%message, "Session error");
                state_tx.set_failed();
                true
            }
            SessionMessage::Close { .. } => {
                state_tx.set_phase(SessionPhase::Complete);
                true
            }
            // Control commands are sent by the controller, not received here.
            _ => false,
        };
        if terminal {
            return;
        }
    }
}
/// Shared handle to the session message-sender map (for Nova handler registration).
///
/// The returned `Arc` points at the same map the leader uses.
#[expect(dead_code)]
pub(crate) fn session_sessions(&self) -> Arc<DashMap<SessionId, SessionMessageTx>> {
    Arc::clone(&self.session_sessions)
}
// ========================================================================
// RDMA Metadata Management
// These methods handle layout metadata export/import for remote RDMA transfers.
// ========================================================================
/// Report whether metadata for a remote instance has been loaded.
///
/// Returns true if `import_remote_metadata` has been successfully called
/// for the given instance. Always false when no parallel worker is configured.
pub fn has_remote_metadata(&self, instance: InstanceId) -> bool {
    match self.parallel_worker.as_ref() {
        Some(pw) => pw.has_remote_metadata(instance),
        None => false,
    }
}
/// Get the number of workers attached to this leader.
///
/// This is the length of the leader's worker list; cached worker metadata
/// does not contribute to the count.
pub fn worker_count(&self) -> usize {
self.workers.len()
}
/// Export layout metadata for every worker.
///
/// Returns a `Vec<SerializedLayout>` where each element corresponds to a worker
/// in rank order. This metadata can be sent to remote instances to enable
/// RDMA transfers.
///
/// When metadata was cached at build time, a copy of the cache is returned
/// and the workers are not contacted. Otherwise workers are queried one
/// after another, in list order.
///
/// # Returns
/// Vector of serialized layouts, one per worker
///
/// # Errors
/// Fails on the first worker whose export request or result is an error.
pub async fn export_worker_metadata(&self) -> Result<Vec<SerializedLayout>> {
    if let Some(cached) = self.cached_worker_metadata.as_ref() {
        return Ok(cached.to_vec());
    }

    let mut layouts = Vec::with_capacity(self.workers.len());
    for worker in self.workers.iter() {
        let pending = worker.export_metadata()?;
        layouts.push(pending.await?);
    }
    Ok(layouts)
}
/// Import layout metadata exported by a remote instance's workers.
///
/// This imports layout metadata from a remote instance, enabling RDMA transfers
/// to pull data from that instance. Metadata is imported rank-by-rank:
/// - local worker 0 imports remote worker 0's metadata
/// - local worker 1 imports remote worker 1's metadata
/// - etc.
///
/// # Arguments
/// * `remote_instance` - The instance ID of the remote leader
/// * `metadata` - Vector of SerializedLayout from remote workers (one per worker)
///
/// # Errors
/// Returns an error if:
/// - No parallel worker configured
/// - Metadata was already imported for this instance
/// - Worker count mismatch between local and remote
/// - Individual worker metadata import fails
pub async fn import_remote_metadata(
    &self,
    remote_instance: InstanceId,
    metadata: Vec<SerializedLayout>,
) -> Result<()> {
    let Some(pw) = self.parallel_worker.as_ref() else {
        return Err(anyhow::anyhow!("No parallel worker configured"));
    };

    // Importing the same instance twice is rejected.
    if pw.has_remote_metadata(remote_instance) {
        return Err(anyhow::anyhow!(
            "Metadata already imported for instance {}",
            remote_instance
        ));
    }

    // Connect to remote - this imports metadata and stores handle mappings
    let connecting = pw.connect_remote(remote_instance, metadata)?;
    connecting.await?;
    Ok(())
}
// ========================================================================
// Private Worker Mirror Methods
// These methods execute operations across all workers and aggregate results.
// ========================================================================
/// Run a local transfer on all workers and return the aggregated notification.
///
/// Delegates to the parallel_worker which fans out to all workers and
/// aggregates their notifications into a single composite notification.
///
/// # Errors
/// Fails when no parallel worker is configured, or with whatever error the
/// parallel worker reports.
#[allow(dead_code)]
pub(crate) fn execute_local_transfer(
    &self,
    src: LogicalLayoutHandle,
    dst: LogicalLayoutHandle,
    src_block_ids: Vec<BlockId>,
    dst_block_ids: Vec<BlockId>,
    options: TransferOptions,
) -> Result<TransferCompleteNotification> {
    let Some(pw) = self.parallel_worker.as_ref() else {
        return Err(anyhow::anyhow!("No parallel worker configured"));
    };
    pw.execute_local_transfer(
        src,
        dst,
        Arc::from(src_block_ids),
        Arc::from(dst_block_ids),
        options,
    )
}
/// Run a remote onboard on all workers and return the aggregated notification.
///
/// Delegates to the parallel_worker which fans out to all workers and
/// aggregates their notifications into a single composite notification.
///
/// # Errors
/// Fails when no parallel worker is configured, or with whatever error the
/// parallel worker reports.
#[allow(dead_code)]
pub(crate) fn execute_remote_onboard(
    &self,
    src: RemoteDescriptor,
    dst: LogicalLayoutHandle,
    dst_block_ids: Vec<BlockId>,
    options: TransferOptions,
) -> Result<TransferCompleteNotification> {
    let Some(pw) = self.parallel_worker.as_ref() else {
        return Err(anyhow::anyhow!("No parallel worker configured"));
    };
    pw.execute_remote_onboard(src, dst, Arc::from(dst_block_ids), options)
}
/// Run a remote offload on all workers and return the aggregated notification.
///
/// Delegates to the parallel_worker which fans out to all workers and
/// aggregates their notifications into a single composite notification.
///
/// # Errors
/// Fails when no parallel worker is configured, or with whatever error the
/// parallel worker reports.
#[allow(dead_code)]
pub(crate) fn execute_remote_offload(
    &self,
    src: LogicalLayoutHandle,
    dst: RemoteDescriptor,
    src_block_ids: Vec<BlockId>,
    options: TransferOptions,
) -> Result<TransferCompleteNotification> {
    let Some(pw) = self.parallel_worker.as_ref() else {
        return Err(anyhow::anyhow!("No parallel worker configured"));
    };
    // Note the argument order: the block IDs come before the remote destination.
    pw.execute_remote_offload(src, Arc::from(src_block_ids), dst, options)
}
}
impl Leader for InstanceLeader {
fn find_matches_with_options(
&self,
sequence_hashes: &[SequenceHash],
options: FindMatchesOptions,
) -> Result<FindMatchesResult> {
// Search G2 (host memory) for matches
// Uses match_blocks which stops at first miss (implements "first hole" policy).
// This ensures we only find contiguous blocks from the start of the sequence.
// For distributed search, remote instances use scan_matches for broad coverage,
// then first-hole filtering is applied in InitiatorSession after aggregation.
// todo: add explicit timing tracing here
// let start_time = Instant::now();
let matched_g2_blocks = self.g2_manager.match_blocks(sequence_hashes);
//let g2_search_time = Instant::now().duration_since(start_time);
// Search G3 (disk) for remaining hashes if G3 is available
let remaining_hashes: Vec<_> = sequence_hashes
.iter()
.filter(|h| !matched_g2_blocks.iter().any(|b| b.sequence_hash() == **h))
.copied()
.collect();
let matched_g3_blocks = if let Some(ref g3_manager) = self.g3_manager {
// Uses match_blocks on remaining hashes (those not found in G2).
// Since G2 already applied first-hole policy, G3 search continues from where G2 stopped.
g3_manager.match_blocks(&remaining_hashes)
} else {
Vec::new()
};
// Determine if we can return immediately (Ready) or need async session
// Ready if:
// - g3 blocks is empty
// - AND NOT (search_remote AND has_remote_leaders)
// - AND NOT (search_remote AND has_object_client)
//
// AsyncSession (is_ready=false) if:
// - g3 is not empty, or
// - search_remote is true AND (has_remote_leaders OR has_object_client)
let has_remote_leaders = !self.remote_leaders.read().unwrap().is_empty();
let has_object_client = self.object_client.is_some();
let needs_remote_search =
options.search_remote && (has_remote_leaders || has_object_client);
let is_ready = matched_g3_blocks.is_empty() && !needs_remote_search;
if is_ready {
// No session needed - blocks owned directly by ReadyResult (RAII)
return Ok(FindMatchesResult::Ready(ReadyResult::new(
matched_g2_blocks,
)));
}
// AsyncSession path: G3 blocks found or remote search enabled
let session_id = SessionId::from(Uuid::new_v4());
let local_g2_count = matched_g2_blocks.len();
let local_g3_count = matched_g3_blocks.len();
// AsyncSession: staging locally and/or remote searching
let (status_tx, status_rx) = watch::channel(OnboardingStatus::Searching);
let all_g2_blocks = Arc::new(Mutex::new(None));
// Store session state to keep blocks alive
let state = SessionState {
session_id,
matched_g2_blocks,
matched_g3_blocks,
status_tx: status_tx.clone(),
};
self.store_session_state(state);
// If no remote search, handle local-only staging
if !options.search_remote {
// Local-only staging (Prepare or Full mode)
// TODO: Implement local G3→G2 staging
let total_matched = local_g2_count + local_g3_count;
status_tx
.send(OnboardingStatus::Complete {
matched_blocks: total_matched,
})
.ok();
return Ok(FindMatchesResult::AsyncSession(AsyncSessionResult::new(
session_id,
status_rx,
all_g2_blocks,
None, // No session handle for local-only staging (yet)
)));
}
// Remote search path
let (tx, rx) = mpsc::channel(100);
self.sessions.insert(session_id, tx);
// Create control channel for Hold/Prepare modes
let (session_handle, control_rx) = if matches!(
options.staging_mode,
StagingMode::Hold | StagingMode::Prepare
) {
let (control_tx, control_rx) = mpsc::channel(10);
let handle = LegacySessionHandle::new(session_id, options.staging_mode, control_tx);
(Some(handle), Some(control_rx))
} else {
(None, None)
};
let session = InitiatorSession::new(
session_id,
self.messenger.instance_id(),
options.staging_mode,
self.g2_manager.clone(),
self.g3_manager.clone(),
self.parallel_worker.clone(),
self.transport.clone(),
status_tx.clone(),
all_g2_blocks.clone(),
control_rx.unwrap_or_else(|| {
let (_, rx) = mpsc::channel(1);
rx
}),
self.object_client.clone(),
);
let remote_leaders = self.remote_leaders.read().unwrap().clone();
let sequence_hashes = sequence_hashes.to_vec();
let handle = self.messenger.runtime();
handle.spawn(async move {
if let Err(e) = session.run(rx, remote_leaders, sequence_hashes).await {
tracing::warn!(error = %e, "InitiatorSession error");
// Try to update status to indicate error
status_tx
.send(OnboardingStatus::Complete { matched_blocks: 0 })
.ok();
}
});
Ok(FindMatchesResult::AsyncSession(AsyncSessionResult::new(
session_id,
status_rx,
all_g2_blocks,
session_handle,
)))
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment