feat: adding kvbm-engine (#6773)

Signed-off-by: Ryan Olson <rolson@nvidia.com>

feat: adding kvbm-engine (#6773)
Signed-off-by: Ryan Olson <rolson@nvidia.com>
008683d6 · Ryan Olson · GitHub · cf79c4fc · 008683d6 · 008683d6
Unverified Commit 008683d6 authored Apr 08, 2026 by Ryan Olson Committed by GitHub Apr 08, 2026
20 changed files
--- a/lib/kvbm-engine/bin/bench_engine.rs
+++ b/lib/kvbm-engine/bin/bench_engine.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! KVBM transfer bandwidth benchmark with full Leader+Worker architecture.
+//!
+//! Uses production-fidelity InstanceLeader, VeloWorkerService/Client, SpmdParallelWorkers,
+//! and optionally OffloadEngine pipelines. Each worker runs on a NUMA-pinned thread with
+//! its own tokio runtime and NixlAgent.
+//!
+//! # Usage
+//! ```bash
+//! # Direct transfer benchmark:
+//! cargo run -p kvbm-engine --features bench --bin bench_engine -- \
+//!     --devices 0 --page-sizes 32,64 --concurrency 1,2 --iterations 10 --skip-disk --skip-gds
+//!
+//! # With offload pipeline:
+//! cargo run -p kvbm-engine --features bench --bin bench_engine -- \
+//!     --devices 0 --page-sizes 64 --concurrency 1 --iterations 10 --skip-disk --skip-gds \
+//!     --offload --offload-batch-sizes 8,32 --offload-concurrency 1,2
+//!
+//! # Multi-GPU:
+//! cargo run -p kvbm-engine --features bench --bin bench_engine -- \
+//!     --devices 0,1 --page-sizes 128 --concurrency 1,2,4 --iterations 50
+//! ```
+
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use anyhow::{Result, ensure};
+use clap::Parser;
+use figment::Figment;
+use figment::providers::{Env, Format, Serialized, Toml};
+use serde::{Deserialize, Serialize};
+
+use kvbm_engine::{
+    BlockId, G1, G2, G3, LogicalLayoutHandle,
+    leader::InstanceLeader,
+    offload::{ExternalBlock, OffloadEngine, PipelineBuilder, PresenceFilter, SourceBlocks},
+    testing::{
+        TestManagerBuilder, TestRegistryBuilder, create_messenger_tcp,
+        managers::populate_manager_with_blocks, token_blocks,
+    },
+    worker::{DirectWorker, Worker, WorkerTransfers},
+};
+use kvbm_logical::blocks::BlockRegistry;
+use kvbm_logical::manager::BlockManager;
+use kvbm_physical::layout::{LayoutConfig, PhysicalLayout};
+use kvbm_physical::transfer::{NixlAgent, TransferManager, TransferOptions};
+
+// ─── CLI ───────────────────────────────────────────────────────────────────────
+
+// Command-line surface of the benchmark, parsed by clap's derive macro.
+//
+// The `///` comments on the fields below are picked up by clap as `--help` text,
+// so rewording them changes what the program prints. List-valued options are
+// comma-separated (`value_delimiter = ','`) and are swept by the benchmark phases.
+#[derive(Parser)]
+#[command(
+    name = "bench_engine",
+    about = "KVBM transfer bandwidth benchmark (leader+worker architecture)"
+)]
+struct Cli {
+    /// GPU device IDs (comma-separated)
+    #[arg(long, value_delimiter = ',', default_value = "0")]
+    devices: Vec<u32>,
+
+    /// Tokens-per-block values to sweep
+    #[arg(long, value_delimiter = ',', default_values_t = vec![32, 64, 128, 256])]
+    page_sizes: Vec<usize>,
+
+    /// Concurrency levels to sweep
+    #[arg(long, value_delimiter = ',', default_values_t = vec![1, 2, 4, 8])]
+    concurrency: Vec<usize>,
+
+    /// Blocks per transfer batch
+    #[arg(long, default_value_t = 8)]
+    blocks_per_batch: usize,
+
+    /// Total blocks per pool (must be >= max_concurrency * blocks_per_batch * 2)
+    #[arg(long, default_value_t = 128)]
+    num_blocks: usize,
+
+    /// Number of KV-cache layers
+    #[arg(long, default_value_t = 24)]
+    num_layers: usize,
+
+    /// Inner dimension (hidden_dim / tp_size)
+    #[arg(long, default_value_t = 4096)]
+    inner_dim: usize,
+
+    /// Bounce buffer block counts to sweep (tail blocks of G2 used as bounce for staged G1↔G3)
+    #[arg(long, value_delimiter = ',', default_values_t = vec![2, 4, 8])]
+    bounce_blocks: Vec<usize>,
+
+    /// Warmup iterations
+    #[arg(long, default_value_t = 5)]
+    warmup: usize,
+
+    /// Measurement iterations per test
+    #[arg(long, default_value_t = 50)]
+    iterations: usize,
+
+    /// Disk path for G3 layouts (default: tempdir)
+    #[arg(long)]
+    disk_path: Option<PathBuf>,
+
+    /// Skip G3/disk tests
+    #[arg(long)]
+    skip_disk: bool,
+
+    /// Skip GDS tests
+    #[arg(long)]
+    skip_gds: bool,
+
+    /// Run only isolated (phase 1) tests
+    #[arg(long)]
+    isolated_only: bool,
+
+    /// Run only bidirectional (phase 2) tests
+    #[arg(long)]
+    bidir_only: bool,
+
+    /// Enable offload pipeline benchmarks (phase 3)
+    #[arg(long)]
+    offload: bool,
+
+    /// Offload pipeline batch sizes to sweep
+    #[arg(long, value_delimiter = ',', default_values_t = vec![8, 16, 32, 64])]
+    offload_batch_sizes: Vec<usize>,
+
+    /// Max concurrent transfers for offload pipeline
+    #[arg(long, value_delimiter = ',', default_values_t = vec![1, 2, 4])]
+    offload_concurrency: Vec<usize>,
+
+    /// Base directory for output (default: current directory)
+    #[arg(long, short)]
+    output: Option<PathBuf>,
+
+    // NOTE: `config` has no counterpart in `BenchConfig`; the `From<Cli>`
+    // conversion does not carry it over.
+    /// Optional TOML config file (overridden by CLI args)
+    #[arg(long)]
+    config: Option<PathBuf>,
+}
+
+// ─── Config ────────────────────────────────────────────────────────────────────
+
+/// Resolved benchmark settings used by every benchmark phase.
+///
+/// Mirrors the fields of [`Cli`] one-for-one, except for `Cli::config`, which is
+/// not represented here. The serde derives let the struct be fed into and
+/// extracted from a `Figment`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct BenchConfig {
+    /// GPU device IDs; one worker thread is spawned per entry.
+    devices: Vec<u32>,
+    /// Tokens-per-block values to sweep.
+    page_sizes: Vec<usize>,
+    /// Concurrency levels to sweep.
+    concurrency: Vec<usize>,
+    /// Number of blocks moved per transfer batch.
+    blocks_per_batch: usize,
+    /// Number of blocks allocated in each layout / block manager.
+    num_blocks: usize,
+    /// Number of KV-cache layers in each layout.
+    num_layers: usize,
+    /// Inner dimension of each layout.
+    inner_dim: usize,
+    /// Bounce-buffer block counts to sweep.
+    bounce_blocks: Vec<usize>,
+    /// Untimed iterations run before measuring.
+    warmup: usize,
+    /// Timed iterations per test.
+    iterations: usize,
+    /// Optional location for the disk-backed (G3) layout.
+    disk_path: Option<PathBuf>,
+    /// When set, no G3 layout is created and disk tests are skipped.
+    skip_disk: bool,
+    /// When set, the GDS backend is not added and direct G1↔G3 tests are skipped.
+    skip_gds: bool,
+    /// When set, the bidirectional phase is not run.
+    isolated_only: bool,
+    /// When set, the isolated phase is not run.
+    bidir_only: bool,
+    /// Enables construction of the offload engine and the offload phase.
+    offload: bool,
+    /// Offload pipeline batch sizes to sweep.
+    offload_batch_sizes: Vec<usize>,
+    /// Offload pipeline concurrency levels to sweep.
+    offload_concurrency: Vec<usize>,
+    /// Optional base directory for output.
+    output: Option<PathBuf>,
+}
+
+impl From<Cli> for BenchConfig {
+    /// Moves every benchmark setting out of the parsed command line.
+    ///
+    /// `Cli::config` has no field in `BenchConfig` and is dropped here.
+    fn from(cli: Cli) -> Self {
+        let Cli {
+            devices,
+            page_sizes,
+            concurrency,
+            blocks_per_batch,
+            num_blocks,
+            num_layers,
+            inner_dim,
+            bounce_blocks,
+            warmup,
+            iterations,
+            disk_path,
+            skip_disk,
+            skip_gds,
+            isolated_only,
+            bidir_only,
+            offload,
+            offload_batch_sizes,
+            offload_concurrency,
+            output,
+            config: _,
+        } = cli;
+
+        Self {
+            devices,
+            page_sizes,
+            concurrency,
+            blocks_per_batch,
+            num_blocks,
+            num_layers,
+            inner_dim,
+            bounce_blocks,
+            warmup,
+            iterations,
+            disk_path,
+            skip_disk,
+            skip_gds,
+            isolated_only,
+            bidir_only,
+            offload,
+            offload_batch_sizes,
+            offload_concurrency,
+            output,
+        }
+    }
+}
+
+/// Resolves the effective benchmark configuration.
+///
+/// Sources are layered from lowest to highest precedence:
+/// 1. clap's built-in defaults,
+/// 2. the TOML file given by `--config`, or by the `KVBM_BENCH_CONFIG`
+///    environment variable when the flag is absent,
+/// 3. `KVBM_BENCH_*` environment variables,
+/// 4. command-line values that differ from clap's defaults.
+///
+/// Only CLI values that differ from the defaults are overlaid in step 4.
+/// Overlaying the whole parsed struct would also re-apply every default and
+/// make the file and environment layers unable to change anything. The
+/// trade-off: a flag explicitly passed with its default value does not override
+/// a file or environment setting.
+///
+/// # Errors
+/// Returns an error if the defaults cannot be parsed, a layer cannot be
+/// serialized, or the merged configuration cannot be extracted.
+fn build_config(cli: Cli) -> Result<BenchConfig> {
+    // `--config` wins; the environment variable is the fallback.
+    let config_path: Option<PathBuf> = cli
+        .config
+        .clone()
+        .or_else(|| std::env::var_os("KVBM_BENCH_CONFIG").map(PathBuf::from));
+
+    // Parsing with no arguments yields exactly the clap defaults.
+    let defaults = BenchConfig::from(Cli::try_parse_from(["bench_engine"])?);
+    let requested = BenchConfig::from(cli);
+
+    // Keep only the keys whose CLI value departs from the default.
+    let default_values = figment::value::Value::serialize(&defaults)?
+        .into_dict()
+        .unwrap_or_default();
+    let cli_overrides: figment::value::Dict = figment::value::Value::serialize(&requested)?
+        .into_dict()
+        .unwrap_or_default()
+        .into_iter()
+        .filter(|(key, value)| default_values.get(key) != Some(value))
+        .collect();
+
+    let mut figment = Figment::new().merge(Serialized::defaults(&defaults));
+
+    if let Some(path) = config_path {
+        figment = figment.merge(Toml::file(path));
+    }
+
+    let figment = figment
+        .merge(Env::prefixed("KVBM_BENCH_"))
+        .merge(Serialized::defaults(cli_overrides)); // explicit CLI values win
+
+    Ok(figment.extract()?)
+}
+
+// ─── Results ───────────────────────────────────────────────────────────────────
+
+/// Summary statistics over a set of per-iteration latencies, in microseconds.
+#[derive(Debug, Clone, Serialize)]
+struct LatencyStats {
+    /// Smallest observed latency.
+    min_us: f64,
+    /// Largest observed latency.
+    max_us: f64,
+    /// Arithmetic mean of all observed latencies.
+    mean_us: f64,
+    /// Median (element at index `n / 2` of the sorted samples).
+    p50_us: f64,
+    /// 95th percentile (element at index `floor(n * 0.95)` of the sorted samples).
+    p95_us: f64,
+    /// 99th percentile (element at index `floor(n * 0.99)` of the sorted samples).
+    p99_us: f64,
+}
+
+impl LatencyStats {
+    /// Summarizes a set of latency samples.
+    ///
+    /// Percentiles use the element at index `floor(n * q)` of the sorted
+    /// samples (clamped to the last element); the median uses index `n / 2`.
+    ///
+    /// An empty sample set (for example when zero iterations were requested)
+    /// yields all-zero statistics instead of panicking on an out-of-bounds
+    /// index.
+    fn from_durations(mut durations: Vec<Duration>) -> Self {
+        if durations.is_empty() {
+            return Self {
+                min_us: 0.0,
+                max_us: 0.0,
+                mean_us: 0.0,
+                p50_us: 0.0,
+                p95_us: 0.0,
+                p99_us: 0.0,
+            };
+        }
+
+        // Durations have no notion of "equal but distinguishable", so the
+        // cheaper unstable sort gives the same result.
+        durations.sort_unstable();
+        let n = durations.len();
+        let last = n - 1;
+        let total: Duration = durations.iter().sum();
+
+        let micros_at = |index: usize| durations[index.min(last)].as_secs_f64() * 1e6;
+        let quantile_index = |q: f64| (n as f64 * q) as usize;
+
+        Self {
+            min_us: micros_at(0),
+            max_us: micros_at(last),
+            mean_us: total.as_secs_f64() * 1e6 / n as f64,
+            p50_us: micros_at(n / 2),
+            p95_us: micros_at(quantile_index(0.95)),
+            p99_us: micros_at(quantile_index(0.99)),
+        }
+    }
+}
+
+/// One row of benchmark output: a single test at a single parameter combination.
+#[derive(Debug, Clone, Serialize)]
+struct BenchResult {
+    /// Test identifier, e.g. `g1_to_g2`.
+    test: String,
+    /// Device the result is reported under.
+    device_id: u32,
+    /// Tokens per block used for this run.
+    page_size: usize,
+    /// Blocks per transfer batch used for this run.
+    blocks_per_batch: usize,
+    /// Concurrency level used for this run.
+    concurrency: usize,
+    /// Bounce-buffer block count, when applicable; omitted from the output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    bounce_blocks: Option<usize>,
+    /// Bytes moved by one timed iteration.
+    bytes_per_iter: usize,
+    /// Number of timed iterations configured for the run.
+    iterations: usize,
+    /// Latency summary in microseconds.
+    latency_us: LatencyStats,
+    /// Bandwidth derived from `bytes_per_iter` and the mean latency, in GB/s.
+    bandwidth_gbs: f64,
+    /// `bandwidth_gbs` multiplied by the number of configured devices.
+    aggregate_bandwidth_gbs: f64,
+}
+
+/// Size in bytes of one KV-cache block for the given `page_size`.
+///
+/// The two constant factors match the layout built for the workers in this
+/// file: an outer dimension of 2 (K and V) and 2 bytes per element (fp16).
+fn compute_bytes_per_block(config: &BenchConfig, page_size: usize) -> usize {
+    const OUTER_DIM: usize = 2;
+    const DTYPE_WIDTH_BYTES: usize = 2;
+
+    config.num_layers * OUTER_DIM * page_size * config.inner_dim * DTYPE_WIDTH_BYTES
+}
+
+/// Turns raw per-iteration latencies into a [`BenchResult`] row.
+///
+/// * `test` – test identifier stored in the row.
+/// * `device_id` – device the row is reported under.
+/// * `page_size`, `concurrency`, `bounce_blocks` – parameters of the run.
+/// * `config` – supplies `blocks_per_batch`, `iterations`, the layout
+///   dimensions used to size a block, and the device count.
+/// * `latencies` – one duration per timed iteration.
+///
+/// Bandwidth is `bytes_per_iter / mean latency`. When the mean latency is zero
+/// (no samples), bandwidth is reported as `0.0` rather than infinity or NaN.
+fn make_result(
+    test: &str,
+    device_id: u32,
+    page_size: usize,
+    concurrency: usize,
+    bounce_blocks: Option<usize>,
+    config: &BenchConfig,
+    latencies: Vec<Duration>,
+) -> BenchResult {
+    let bytes_per_block = compute_bytes_per_block(config, page_size);
+    let bytes_per_iter = bytes_per_block * config.blocks_per_batch * concurrency;
+    let stats = LatencyStats::from_durations(latencies);
+
+    // mean_us * 1e3 is the mean in nanoseconds; bytes / ns == GB/s.
+    let bandwidth_gbs = if stats.mean_us > 0.0 {
+        bytes_per_iter as f64 / (stats.mean_us * 1e3)
+    } else {
+        0.0
+    };
+    let num_devices = config.devices.len();
+    let aggregate_bandwidth_gbs = bandwidth_gbs * num_devices as f64;
+
+    BenchResult {
+        test: test.to_string(),
+        device_id,
+        page_size,
+        blocks_per_batch: config.blocks_per_batch,
+        concurrency,
+        bounce_blocks,
+        bytes_per_iter,
+        iterations: config.iterations,
+        latency_us: stats,
+        bandwidth_gbs,
+        aggregate_bandwidth_gbs,
+    }
+}
+
+/// Writes a one-line human-readable summary of `r` to stderr.
+///
+/// The ` bounce=N` fragment appears only when the result carries a bounce
+/// block count.
+fn print_result_stderr(r: &BenchResult) {
+    let bounce_suffix = match r.bounce_blocks {
+        Some(b) => format!(" bounce={b}"),
+        None => String::new(),
+    };
+    let latency = &r.latency_us;
+
+    eprintln!(
+        "[GPU {}] {} | page={} conc={}{} | {:.1} GB/s (per-dev) {:.1} GB/s (agg) | p50={:.0}us p99={:.0}us",
+        r.device_id,
+        r.test,
+        r.page_size,
+        r.concurrency,
+        bounce_suffix,
+        r.bandwidth_gbs,
+        r.aggregate_bandwidth_gbs,
+        latency.p50_us,
+        latency.p99_us,
+    );
+}
+
+// ─── Worker Thread Infrastructure ──────────────────────────────────────────────
+
+/// Everything the leader side holds for one spawned worker thread.
+struct WorkerHandle {
+    /// The worker built on the worker thread and sent back once setup succeeded.
+    worker: Arc<DirectWorker>,
+    /// Join handle of the OS thread that owns the worker's tokio runtime.
+    join_handle: std::thread::JoinHandle<()>,
+    /// Sending half of the shutdown signal; the worker thread waits on the
+    /// receiving half and exits once it resolves.
+    shutdown_tx: tokio::sync::oneshot::Sender<()>,
+}
+
+/// Spawn a long-lived worker thread for a single GPU device.
+///
+/// The worker thread:
+/// 1. Pins to the device's CPU set, or failing that its NUMA node
+/// 2. Creates its own tokio runtime (2 worker threads)
+/// 3. Creates NixlAgent, TransferManager
+/// 4. Creates G1/G2 (and, unless disk is skipped, G3) PhysicalLayouts and registers them
+/// 5. Builds DirectWorker and sends Arc back to the caller
+/// 6. Waits on the shutdown signal (keeps the runtime alive for transfers)
+///
+/// This function blocks until the worker reports success or failure.
+///
+/// # Errors
+/// Returns an error when the thread cannot be spawned, the tokio runtime
+/// cannot be built, any setup step on the worker thread fails, or the thread
+/// exits without reporting. Thread-spawn and runtime-build failures are
+/// reported as errors rather than panics so the caller can decide how to react.
+fn spawn_worker_thread(
+    device_id: u32,
+    page_size: usize,
+    config: &BenchConfig,
+) -> Result<WorkerHandle> {
+    let (ready_tx, ready_rx) = std::sync::mpsc::channel();
+    let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+
+    // Copy what the thread needs so the closure does not borrow `config`.
+    let num_blocks = config.num_blocks;
+    let num_layers = config.num_layers;
+    let inner_dim = config.inner_dim;
+    let skip_disk = config.skip_disk;
+    let skip_gds = config.skip_gds;
+    let disk_path = config.disk_path.clone();
+
+    let join_handle = std::thread::Builder::new()
+        .name(format!("bench-gpu-{device_id}"))
+        .spawn(move || {
+            // Pin to device's NUMA node
+            if let Some(cpus) = dynamo_memory::numa::get_device_cpu_set(device_id) {
+                eprintln!(
+                    "[GPU {device_id}] Worker pinned to CPUs: {}",
+                    format_cpu_set(&cpus)
+                );
+                pin_thread_to_cpus(&cpus);
+            } else if let Some(node) = dynamo_memory::numa::get_device_numa_node(device_id) {
+                eprintln!("[GPU {device_id}] Worker pinned to NUMA node {node}");
+                // Best effort: a failed pin only affects locality, not correctness.
+                let _ = dynamo_memory::numa::pin_thread_to_numa_node(node);
+            } else {
+                eprintln!("[GPU {device_id}] No NUMA pinning (node unknown)");
+            }
+
+            // Build tokio runtime on this NUMA-pinned thread
+            let rt = match tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(2)
+                .enable_all()
+                .thread_name(format!("bench-gpu-{device_id}-tokio"))
+                .build()
+            {
+                Ok(rt) => rt,
+                Err(e) => {
+                    ready_tx
+                        .send(Err(anyhow::anyhow!("failed to build tokio runtime: {e}")))
+                        .ok();
+                    return;
+                }
+            };
+
+            let result = rt.block_on(async {
+                // Create a local EventManager for this worker's transfer notifications
+                let event_system = Arc::new(velo::EventManager::local());
+
+                // Create NixlAgent with available backends
+                let agent_name = format!("bench-gpu-{device_id}");
+                let mut agent = NixlAgent::new(&agent_name)?;
+                if !skip_disk && agent.add_backend("POSIX").is_err() {
+                    eprintln!("[GPU {device_id}] POSIX backend unavailable");
+                }
+                if !skip_gds && !skip_disk && agent.add_backend("GDS_MT").is_err() {
+                    eprintln!("[GPU {device_id}] GDS_MT backend unavailable");
+                }
+
+                // Create TransferManager
+                let manager = TransferManager::builder()
+                    .event_system(event_system)
+                    .nixl_agent(agent.clone())
+                    .cuda_device_id(device_id as usize)
+                    .build()?;
+
+                // Build layout config
+                let layout_config = LayoutConfig::builder()
+                    .num_blocks(num_blocks)
+                    .num_layers(num_layers)
+                    .outer_dim(2) // K + V
+                    .page_size(page_size)
+                    .inner_dim(inner_dim)
+                    .dtype_width_bytes(2) // fp16
+                    .build()?;
+
+                // Allocate G1 (GPU device memory)
+                let g1 = PhysicalLayout::builder(agent.clone())
+                    .with_config(layout_config.clone())
+                    .fully_contiguous()
+                    .allocate_device(device_id)
+                    .build()?;
+                let g1_handle = manager.register_layout(g1)?;
+
+                // Allocate G2 (pinned host memory)
+                let g2 = PhysicalLayout::builder(agent.clone())
+                    .with_config(layout_config.clone())
+                    .fully_contiguous()
+                    .allocate_pinned(Some(device_id))
+                    .build()?;
+                let g2_handle = manager.register_layout(g2)?;
+
+                // Allocate G3 (disk) if enabled
+                let g3_handle = if !skip_disk {
+                    let g3 = PhysicalLayout::builder(agent.clone())
+                        .with_config(layout_config)
+                        .fully_contiguous()
+                        .allocate_disk(disk_path)
+                        .build()?;
+                    Some(manager.register_layout(g3)?)
+                } else {
+                    None
+                };
+
+                // Build DirectWorker (PhysicalWorker)
+                let mut worker_builder = DirectWorker::builder()
+                    .manager(manager)
+                    .g1_handle(g1_handle)
+                    .g2_handle(g2_handle);
+                if let Some(g3) = g3_handle {
+                    worker_builder = worker_builder.g3_handle(g3);
+                }
+                let worker = Arc::new(worker_builder.build()?);
+
+                ready_tx.send(Ok(worker.clone())).ok();
+
+                // Keep runtime alive so TransferManager notification threads stay running.
+                // Either a sent value or a dropped sender ends the wait.
+                let _ = shutdown_rx.await;
+
+                Ok::<(), anyhow::Error>(())
+            });
+
+            // Every `?` above runs before the ready message is sent, so an error
+            // here means setup failed and the caller is still waiting.
+            if let Err(e) = result {
+                ready_tx.send(Err(e)).ok();
+            }
+        })
+        .map_err(|e| {
+            anyhow::anyhow!("failed to spawn worker thread for GPU {device_id}: {e}")
+        })?;
+
+    // Wait for worker to be ready
+    let worker = ready_rx
+        .recv()
+        .map_err(|_| anyhow::anyhow!("Worker thread died before sending ready signal"))??;
+
+    Ok(WorkerHandle {
+        worker,
+        join_handle,
+        shutdown_tx,
+    })
+}
+
+// ─── BenchInstance: Full Leader+Worker Setup ───────────────────────────────────
+
+/// A fully wired leader + workers setup for one `page_size`, plus the state the
+/// benchmark phases need.
+struct BenchInstance {
+    /// Leader through which transfers are issued.
+    leader: InstanceLeader,
+    // Not read in the visible code — presumably retained to keep it alive for
+    // the instance's lifetime; TODO confirm.
+    #[allow(dead_code)]
+    registry: BlockRegistry,
+    /// Block manager for the G2 tier.
+    g2_manager: Arc<BlockManager<G2>>,
+    // Present only when disk tests are enabled. Not read in the visible code —
+    // presumably retained for the same reason as `registry`; TODO confirm.
+    #[allow(dead_code)]
+    g3_manager: Option<Arc<BlockManager<G3>>>,
+    /// Built only when the offload phase is enabled in the configuration.
+    offload_engine: Option<OffloadEngine>,
+    /// One handle per spawned worker thread, in `config.devices` order.
+    worker_handles: Vec<WorkerHandle>,
+    /// Configuration this instance was built from.
+    config: BenchConfig,
+    /// Tokens per block used by every layout and manager in this instance.
+    page_size: usize,
+}
+
+impl BenchInstance {
+    /// Builds the complete leader + worker topology for one `page_size`.
+    ///
+    /// Spawns one worker thread per configured device, creates the leader's
+    /// messenger, registry and G2 (optionally G3) block managers, wires the
+    /// workers into an `InstanceLeader`, and — when offload is enabled —
+    /// assembles an `OffloadEngine` with a G1→G2 pipeline and, if disk is
+    /// enabled, a G2→G3 pipeline.
+    ///
+    /// Fails on the first step that returns an error.
+    async fn new(config: BenchConfig, page_size: usize) -> Result<Self> {
+        let num_devices = config.devices.len();
+        eprintln!(
+            "Setting up BenchInstance: page_size={page_size}, {} device(s)",
+            num_devices
+        );
+
+        // One worker thread per device, started in configuration order.
+        let worker_handles = config
+            .devices
+            .iter()
+            .map(|&device_id| -> Result<WorkerHandle> {
+                let handle = spawn_worker_thread(device_id, page_size, &config)?;
+                eprintln!("[GPU {device_id}] Worker ready");
+                Ok(handle)
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        // The leader talks to the workers through trait objects.
+        let mut worker_refs: Vec<Arc<dyn Worker>> = Vec::with_capacity(worker_handles.len());
+        for handle in &worker_handles {
+            worker_refs.push(handle.worker.clone() as Arc<dyn Worker>);
+        }
+
+        // Messenger required by the leader builder.
+        let leader_messenger = create_messenger_tcp().await?;
+
+        // Registry shared by every block manager.
+        let registry = TestRegistryBuilder::new().build();
+        let g2_manager = Arc::new(
+            TestManagerBuilder::<G2>::new()
+                .block_count(config.num_blocks)
+                .block_size(page_size)
+                .registry(registry.clone())
+                .build(),
+        );
+        let g3_manager = (!config.skip_disk).then(|| {
+            Arc::new(
+                TestManagerBuilder::<G3>::new()
+                    .block_count(config.num_blocks)
+                    .block_size(page_size)
+                    .registry(registry.clone())
+                    .build(),
+            )
+        });
+
+        // Leader wired directly to the in-process workers.
+        let base_leader_builder = InstanceLeader::builder()
+            .messenger(leader_messenger)
+            .registry(registry.clone())
+            .g2_manager(g2_manager.clone())
+            .workers(worker_refs);
+        let leader_builder = match &g3_manager {
+            Some(g3m) => base_leader_builder.g3_manager(g3m.clone()),
+            None => base_leader_builder,
+        };
+        let leader = leader_builder.build()?;
+
+        // Offload engine is optional.
+        let offload_engine = if !config.offload {
+            None
+        } else {
+            let mut engine_builder = OffloadEngine::builder(Arc::new(leader.clone()))
+                .with_registry(Arc::new(registry.clone()))
+                .with_g2_manager(g2_manager.clone())
+                .with_runtime(tokio::runtime::Handle::current());
+
+            if let Some(g3m) = g3_manager.as_ref() {
+                engine_builder = engine_builder.with_g3_manager(g3m.clone());
+            }
+
+            // G1→G2 pipeline, filtered by presence in the registry.
+            let g1_g2_policy = Arc::new(PresenceFilter::<G1, G2>::new(Arc::new(registry.clone())));
+            let g1_to_g2_config = PipelineBuilder::<G1, G2>::new()
+                .policy(g1_g2_policy)
+                .batch_size(64)
+                .max_concurrent_transfers(4)
+                .build();
+            engine_builder = engine_builder.with_g1_to_g2_pipeline(g1_to_g2_config);
+
+            // G2→G3 pipeline only exists alongside a G3 manager.
+            if g3_manager.is_some() {
+                let g2_g3_policy =
+                    Arc::new(PresenceFilter::<G2, G3>::new(Arc::new(registry.clone())));
+                let g2_to_g3_config = PipelineBuilder::<G2, G3>::new()
+                    .policy(g2_g3_policy)
+                    .batch_size(64)
+                    .max_concurrent_transfers(4)
+                    .build();
+                engine_builder = engine_builder.with_g2_to_g3_pipeline(g2_to_g3_config);
+            }
+
+            Some(engine_builder.build()?)
+        };
+
+        Ok(Self {
+            leader,
+            registry,
+            g2_manager,
+            g3_manager,
+            offload_engine,
+            worker_handles,
+            config,
+            page_size,
+        })
+    }
+
+    /// Runs each enabled benchmark phase in order and concatenates their rows.
+    ///
+    /// Phase 1 is skipped with `bidir_only`, phase 2 with `isolated_only`, and
+    /// phase 3 runs only when offload is enabled and an offload engine exists.
+    async fn run_benchmarks(&self) -> Result<Vec<BenchResult>> {
+        let cfg = &self.config;
+        let page_size = self.page_size;
+        let mut rows = Vec::new();
+
+        let run_isolated = !cfg.bidir_only;
+        if run_isolated {
+            eprintln!(
+                "=== Phase 1: Isolated Transfers (page_size={}) ===",
+                page_size
+            );
+            let mut phase_rows = self.bench_isolated_transfers().await?;
+            rows.append(&mut phase_rows);
+        }
+
+        let run_bidir = !cfg.isolated_only;
+        if run_bidir {
+            eprintln!(
+                "=== Phase 2: Bidirectional Contention (page_size={}) ===",
+                page_size
+            );
+            let mut phase_rows = self.bench_bidir_transfers().await?;
+            rows.append(&mut phase_rows);
+        }
+
+        let run_offload = cfg.offload && self.offload_engine.is_some();
+        if run_offload {
+            eprintln!(
+                "=== Phase 3: Offload Pipeline (page_size={}) ===",
+                page_size
+            );
+            let mut phase_rows = self.bench_offload_pipeline().await?;
+            rows.append(&mut phase_rows);
+        }
+
+        Ok(rows)
+    }
+
+    // ─── Phase 1: Isolated Transfers ───────────────────────────────────────
+
+    /// Phase 1: measures each transfer direction on its own.
+    ///
+    /// For every configured concurrency level the block range
+    /// `[0, concurrency * blocks_per_batch)` is transferred G1→G2 and G2→G1,
+    /// and, unless disk is skipped, G2→G3 and G3→G2. Afterwards, unless disk
+    /// or GDS is skipped, direct G1→G3 and G3→G1 transfers are measured; a
+    /// failure there is logged and does not abort the phase.
+    ///
+    /// Concurrency levels whose block range does not fit in `num_blocks` are
+    /// skipped with a message, matching the bidirectional phase.
+    ///
+    /// # Errors
+    /// Fails when no device is configured, no parallel worker is available, or
+    /// any non-GDS transfer fails.
+    async fn bench_isolated_transfers(&self) -> Result<Vec<BenchResult>> {
+        let mut results = Vec::new();
+        // Report results under the first device.
+        let device_id = self
+            .config
+            .devices
+            .first()
+            .copied()
+            .ok_or_else(|| anyhow::anyhow!("No devices configured"))?;
+        let parallel_worker = self
+            .leader
+            .parallel_worker()
+            .ok_or_else(|| anyhow::anyhow!("No parallel worker available"))?;
+
+        let bpb = self.config.blocks_per_batch;
+        let disk_enabled = !self.config.skip_disk;
+        let gds_enabled = disk_enabled && !self.config.skip_gds;
+
+        // Resolve the block range for each usable concurrency level once.
+        let mut batches: Vec<(usize, Arc<[BlockId]>)> = Vec::new();
+        for &conc in &self.config.concurrency {
+            let blocks_needed = conc * bpb;
+            if blocks_needed > self.config.num_blocks {
+                eprintln!(
+                    "Skipping isolated page_size={} conc={conc}: need {blocks_needed} blocks but only have {}",
+                    self.page_size, self.config.num_blocks
+                );
+                continue;
+            }
+            let block_ids: Arc<[BlockId]> =
+                Arc::from((0..blocks_needed).map(|i| i as BlockId).collect::<Vec<_>>());
+            batches.push((conc, block_ids));
+        }
+
+        for (conc, block_ids) in &batches {
+            // G1→G2 (D2H offload) and G2→G1 (H2D onboard) always run.
+            let mut directions = vec![
+                ("g1_to_g2", LogicalLayoutHandle::G1, LogicalLayoutHandle::G2),
+                ("g2_to_g1", LogicalLayoutHandle::G2, LogicalLayoutHandle::G1),
+            ];
+            // G2↔G3 tests (if disk enabled)
+            if disk_enabled {
+                directions.push(("g2_to_g3", LogicalLayoutHandle::G2, LogicalLayoutHandle::G3));
+                directions.push(("g3_to_g2", LogicalLayoutHandle::G3, LogicalLayoutHandle::G2));
+            }
+
+            for (test, src, dst) in directions {
+                let latencies = self
+                    .bench_transfer(
+                        &*parallel_worker,
+                        src,
+                        dst,
+                        block_ids.clone(),
+                        block_ids.clone(),
+                    )
+                    .await?;
+                let r = make_result(
+                    test,
+                    device_id,
+                    self.page_size,
+                    *conc,
+                    None,
+                    &self.config,
+                    latencies,
+                );
+                print_result_stderr(&r);
+                results.push(r);
+            }
+        }
+
+        // GDS direct tests (G1↔G3 without bounce). Run after all staged tests
+        // so the result order is unchanged.
+        if gds_enabled {
+            for (conc, block_ids) in &batches {
+                let directions = [
+                    (
+                        "g1_to_g3_gds",
+                        "g1_to_g3",
+                        LogicalLayoutHandle::G1,
+                        LogicalLayoutHandle::G3,
+                    ),
+                    (
+                        "g3_to_g1_gds",
+                        "g3_to_g1",
+                        LogicalLayoutHandle::G3,
+                        LogicalLayoutHandle::G1,
+                    ),
+                ];
+
+                for (test, label, src, dst) in directions {
+                    match self
+                        .bench_transfer(
+                            &*parallel_worker,
+                            src,
+                            dst,
+                            block_ids.clone(),
+                            block_ids.clone(),
+                        )
+                        .await
+                    {
+                        Ok(latencies) => {
+                            let r = make_result(
+                                test,
+                                device_id,
+                                self.page_size,
+                                *conc,
+                                None,
+                                &self.config,
+                                latencies,
+                            );
+                            print_result_stderr(&r);
+                            results.push(r);
+                        }
+                        Err(e) => {
+                            eprintln!("GDS {label} failed (GDS may not be available): {e}");
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok(results)
+    }
+
+    // ─── Phase 2: Bidirectional Contention ─────────────────────────────────
+
+    /// Phase 2: measures G1→G2 and G2→G1 transfers issued at the same time.
+    ///
+    /// For each concurrency level up to `MAX_BIDIR_CONCURRENCY`, the D2H
+    /// transfer uses blocks `[0, c*bpb)` and the H2D transfer uses
+    /// `[c*bpb, 2*c*bpb)`. Levels that need more blocks than `num_blocks` are
+    /// skipped with a message.
+    ///
+    /// Both completions are awaited together and each direction's latency is
+    /// sampled the moment its own notification resolves. Awaiting them one
+    /// after the other would record the second direction's latency as the
+    /// later of the two completions.
+    ///
+    /// # Errors
+    /// Fails when no device is configured, no parallel worker is available, or
+    /// any transfer fails.
+    async fn bench_bidir_transfers(&self) -> Result<Vec<BenchResult>> {
+        /// Highest concurrency level exercised by the bidirectional phase.
+        const MAX_BIDIR_CONCURRENCY: usize = 4;
+
+        /// Drives both futures together and returns each output paired with
+        /// the time elapsed since `start` when that future completed.
+        async fn await_both_timed<A, B>(
+            start: Instant,
+            first: A,
+            second: B,
+        ) -> ((A::Output, Duration), (B::Output, Duration))
+        where
+            A: std::future::IntoFuture,
+            B: std::future::IntoFuture,
+        {
+            use std::future::{Future, IntoFuture};
+            use std::task::Poll;
+
+            let mut first = std::pin::pin!(first.into_future());
+            let mut second = std::pin::pin!(second.into_future());
+            let mut first_done = None;
+            let mut second_done = None;
+
+            std::future::poll_fn(|cx| {
+                // A completed future must not be polled again.
+                if first_done.is_none() {
+                    if let Poll::Ready(out) = first.as_mut().poll(cx) {
+                        first_done = Some((out, start.elapsed()));
+                    }
+                }
+                if second_done.is_none() {
+                    if let Poll::Ready(out) = second.as_mut().poll(cx) {
+                        second_done = Some((out, start.elapsed()));
+                    }
+                }
+                if first_done.is_some() && second_done.is_some() {
+                    Poll::Ready(())
+                } else {
+                    Poll::Pending
+                }
+            })
+            .await;
+
+            match (first_done, second_done) {
+                (Some(a), Some(b)) => (a, b),
+                _ => unreachable!("poll_fn only completes once both futures are ready"),
+            }
+        }
+
+        let mut results = Vec::new();
+        let device_id = self
+            .config
+            .devices
+            .first()
+            .copied()
+            .ok_or_else(|| anyhow::anyhow!("No devices configured"))?;
+        let parallel_worker = self
+            .leader
+            .parallel_worker()
+            .ok_or_else(|| anyhow::anyhow!("No parallel worker available"))?;
+
+        let bidir_concurrencies: Vec<usize> = self
+            .config
+            .concurrency
+            .iter()
+            .copied()
+            .filter(|&c| c <= MAX_BIDIR_CONCURRENCY)
+            .collect();
+
+        for &conc in &bidir_concurrencies {
+            let bpb = self.config.blocks_per_batch;
+            let total_blocks_needed = 2 * conc * bpb;
+
+            if total_blocks_needed > self.config.num_blocks {
+                eprintln!(
+                    "Skipping bidir page_size={} conc={conc}: need {total_blocks_needed} blocks but only have {}",
+                    self.page_size, self.config.num_blocks
+                );
+                continue;
+            }
+
+            // D2H block range: [0..conc*bpb)
+            let d2h_ids: Arc<[BlockId]> =
+                Arc::from((0..conc * bpb).map(|i| i as BlockId).collect::<Vec<_>>());
+            // H2D block range: [conc*bpb..2*conc*bpb)
+            let h2d_ids: Arc<[BlockId]> = Arc::from(
+                (conc * bpb..2 * conc * bpb)
+                    .map(|i| i as BlockId)
+                    .collect::<Vec<_>>(),
+            );
+
+            // Warmup
+            for _ in 0..self.config.warmup {
+                let d2h_notif = parallel_worker.execute_local_transfer(
+                    LogicalLayoutHandle::G1,
+                    LogicalLayoutHandle::G2,
+                    d2h_ids.clone(),
+                    d2h_ids.clone(),
+                    TransferOptions::default(),
+                )?;
+                let h2d_notif = parallel_worker.execute_local_transfer(
+                    LogicalLayoutHandle::G2,
+                    LogicalLayoutHandle::G1,
+                    h2d_ids.clone(),
+                    h2d_ids.clone(),
+                    TransferOptions::default(),
+                )?;
+                d2h_notif.await?;
+                h2d_notif.await?;
+            }
+
+            // Measure
+            let mut d2h_latencies = Vec::with_capacity(self.config.iterations);
+            let mut h2d_latencies = Vec::with_capacity(self.config.iterations);
+
+            for _ in 0..self.config.iterations {
+                let start = Instant::now();
+
+                let d2h_notif = parallel_worker.execute_local_transfer(
+                    LogicalLayoutHandle::G1,
+                    LogicalLayoutHandle::G2,
+                    d2h_ids.clone(),
+                    d2h_ids.clone(),
+                    TransferOptions::default(),
+                )?;
+                let h2d_notif = parallel_worker.execute_local_transfer(
+                    LogicalLayoutHandle::G2,
+                    LogicalLayoutHandle::G1,
+                    h2d_ids.clone(),
+                    h2d_ids.clone(),
+                    TransferOptions::default(),
+                )?;
+
+                let ((d2h_outcome, d2h_elapsed), (h2d_outcome, h2d_elapsed)) =
+                    await_both_timed(start, d2h_notif, h2d_notif).await;
+                d2h_outcome?;
+                h2d_outcome?;
+
+                d2h_latencies.push(d2h_elapsed);
+                h2d_latencies.push(h2d_elapsed);
+            }
+
+            let r = make_result(
+                "bidir_g1_to_g2",
+                device_id,
+                self.page_size,
+                conc,
+                None,
+                &self.config,
+                d2h_latencies,
+            );
+            print_result_stderr(&r);
+            results.push(r);
+
+            let r = make_result(
+                "bidir_g2_to_g1",
+                device_id,
+                self.page_size,
+                conc,
+                None,
+                &self.config,
+                h2d_latencies,
+            );
+            print_result_stderr(&r);
+            results.push(r);
+        }
+
+        Ok(results)
+    }
+
+    // ─── Phase 3: Offload Pipeline ─────────────────────────────────────────
+
+    /// Benchmarks the offload engine pipelines and returns one `BenchResult` per
+    /// (batch size, concurrency) combination that was run.
+    ///
+    /// Two pipelines are exercised:
+    /// 1. G1→G2, fed with `ExternalBlock`s built from block ids `0..batch_size`
+    ///    paired with the sequence hashes registered in the G2 manager below.
+    /// 2. G2→G3, fed with the blocks matched in the G2 manager; only run when
+    ///    disk tests are not skipped and the engine reports a G2→G3 pipeline.
+    ///
+    /// # Errors
+    /// Returns an error if no `OffloadEngine` is configured, or if populating the
+    /// G2 manager, enqueueing a batch, or waiting on a transfer handle fails.
+    ///
+    /// NOTE(review): `conc` is only written to the log line and to the result's
+    /// `concurrency` field; every iteration enqueues a single batch and waits for
+    /// it before starting the next, so the concurrency sweep does not appear to
+    /// change the measured workload. TODO confirm whether concurrent enqueues
+    /// were intended.
+    async fn bench_offload_pipeline(&self) -> Result<Vec<BenchResult>> {
+        let mut results = Vec::new();
+        // Results are attributed to the first configured device.
+        let device_id = self.config.devices[0];
+        let engine = self
+            .offload_engine
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("OffloadEngine not configured"))?;
+
+        // Populate G2 manager with test blocks so the registry has entries
+        let token_seq =
+            token_blocks::create_token_sequence(self.config.num_blocks, self.page_size, 0);
+        let seq_hashes = populate_manager_with_blocks(&self.g2_manager, token_seq.blocks())?;
+
+        for &batch_size in &self.config.offload_batch_sizes {
+            // A batch cannot reference more blocks than the pool holds.
+            if batch_size > self.config.num_blocks {
+                eprintln!(
+                    "Skipping offload batch_size={batch_size}: exceeds num_blocks={}",
+                    self.config.num_blocks
+                );
+                continue;
+            }
+
+            for &conc in &self.config.offload_concurrency {
+                eprintln!("Offload G1→G2 pipeline: batch_size={batch_size} concurrency={conc}");
+
+                // Warmup
+                for _ in 0..self.config.warmup {
+                    // Indexing `seq_hashes[i]` assumes at least `batch_size` hashes were
+                    // returned by `populate_manager_with_blocks` — TODO confirm.
+                    let blocks: Vec<ExternalBlock<G1>> = (0..batch_size)
+                        .map(|i| ExternalBlock::new(i as BlockId, seq_hashes[i]))
+                        .collect();
+                    let mut handle = engine.enqueue_g1_to_g2(SourceBlocks::External(blocks))?;
+                    handle.wait().await?;
+                }
+
+                // Measure
+                let mut latencies = Vec::with_capacity(self.config.iterations);
+                for _ in 0..self.config.iterations {
+                    // The block list is built outside the timed region.
+                    let blocks: Vec<ExternalBlock<G1>> = (0..batch_size)
+                        .map(|i| ExternalBlock::new(i as BlockId, seq_hashes[i]))
+                        .collect();
+
+                    // Timed region: enqueue plus wait for completion.
+                    let start = Instant::now();
+                    let mut handle = engine.enqueue_g1_to_g2(SourceBlocks::External(blocks))?;
+                    handle.wait().await?;
+                    latencies.push(start.elapsed());
+                }
+
+                let bytes_per_block = compute_bytes_per_block(&self.config, self.page_size);
+                let bytes_per_iter = bytes_per_block * batch_size;
+                let stats = LatencyStats::from_durations(latencies);
+                // bytes / (mean_us * 1e3) is bytes per nanosecond, i.e. GB/s
+                // (assuming `mean_us` is in microseconds, as its name suggests).
+                let bandwidth_gbs = bytes_per_iter as f64 / (stats.mean_us * 1e3);
+                let num_devices = self.config.devices.len();
+
+                let r = BenchResult {
+                    test: "offload_g1_to_g2_pipeline".to_string(),
+                    device_id,
+                    page_size: self.page_size,
+                    blocks_per_batch: batch_size,
+                    concurrency: conc,
+                    bounce_blocks: None,
+                    bytes_per_iter,
+                    iterations: self.config.iterations,
+                    latency_us: stats,
+                    bandwidth_gbs,
+                    // Aggregate figure is the single-device bandwidth scaled by device count.
+                    aggregate_bandwidth_gbs: bandwidth_gbs * num_devices as f64,
+                };
+                print_result_stderr(&r);
+                results.push(r);
+            }
+        }
+
+        // G2→G3 pipeline if disk enabled
+        if !self.config.skip_disk && engine.has_g2_to_g3() {
+            for &batch_size in &self.config.offload_batch_sizes {
+                // Oversized batches are skipped silently here; the G1→G2 loop above
+                // already printed a message for them.
+                if batch_size > self.config.num_blocks {
+                    continue;
+                }
+
+                for &conc in &self.config.offload_concurrency {
+                    eprintln!("Offload G2→G3 pipeline: batch_size={batch_size} concurrency={conc}");
+
+                    // Get immutable blocks from g2_manager for SourceBlocks::Strong
+                    let matched = self.g2_manager.match_blocks(&seq_hashes[..batch_size]);
+
+                    // Warmup
+                    for _ in 0..self.config.warmup {
+                        // The same matched blocks are cloned for every enqueue.
+                        let mut handle =
+                            engine.enqueue_g2_to_g3(SourceBlocks::Strong(matched.clone()))?;
+                        handle.wait().await?;
+                    }
+
+                    // Measure
+                    let mut latencies = Vec::with_capacity(self.config.iterations);
+                    for _ in 0..self.config.iterations {
+                        // Timed region also covers `matched.clone()`.
+                        let start = Instant::now();
+                        let mut handle =
+                            engine.enqueue_g2_to_g3(SourceBlocks::Strong(matched.clone()))?;
+                        handle.wait().await?;
+                        latencies.push(start.elapsed());
+                    }
+
+                    let bytes_per_block = compute_bytes_per_block(&self.config, self.page_size);
+                    let bytes_per_iter = bytes_per_block * batch_size;
+                    let stats = LatencyStats::from_durations(latencies);
+                    // Same GB/s conversion as the G1→G2 section above.
+                    let bandwidth_gbs = bytes_per_iter as f64 / (stats.mean_us * 1e3);
+                    let num_devices = self.config.devices.len();
+
+                    let r = BenchResult {
+                        test: "offload_g2_to_g3_pipeline".to_string(),
+                        device_id,
+                        page_size: self.page_size,
+                        blocks_per_batch: batch_size,
+                        concurrency: conc,
+                        bounce_blocks: None,
+                        bytes_per_iter,
+                        iterations: self.config.iterations,
+                        latency_us: stats,
+                        bandwidth_gbs,
+                        aggregate_bandwidth_gbs: bandwidth_gbs * num_devices as f64,
+                    };
+                    print_result_stderr(&r);
+                    results.push(r);
+                }
+            }
+        }
+
+        Ok(results)
+    }
+
+    // ─── Transfer Helpers ──────────────────────────────────────────────────
+
+    /// Measures one transfer direction through the SPMD parallel worker using
+    /// default `TransferOptions`.
+    ///
+    /// Convenience wrapper around `bench_transfer_with_options`; returns the
+    /// per-iteration latencies that call collects, or its error.
+    async fn bench_transfer(
+        &self,
+        parallel_worker: &dyn WorkerTransfers,
+        src: LogicalLayoutHandle,
+        dst: LogicalLayoutHandle,
+        src_block_ids: Arc<[BlockId]>,
+        dst_block_ids: Arc<[BlockId]>,
+    ) -> Result<Vec<Duration>> {
+        let default_options = TransferOptions::default();
+        let latencies = self
+            .bench_transfer_with_options(
+                parallel_worker,
+                src,
+                dst,
+                src_block_ids,
+                dst_block_ids,
+                default_options,
+            )
+            .await?;
+        Ok(latencies)
+    }
+
+    /// Runs `config.warmup` untimed transfers followed by `config.iterations`
+    /// timed ones, all with the supplied `TransferOptions` (e.g. a bounce buffer).
+    ///
+    /// Each timed sample spans from just before the transfer is submitted until
+    /// its completion notification resolves. The first submit or completion
+    /// error aborts the run and is returned to the caller.
+    async fn bench_transfer_with_options(
+        &self,
+        parallel_worker: &dyn WorkerTransfers,
+        src: LogicalLayoutHandle,
+        dst: LogicalLayoutHandle,
+        src_block_ids: Arc<[BlockId]>,
+        dst_block_ids: Arc<[BlockId]>,
+        options: TransferOptions,
+    ) -> Result<Vec<Duration>> {
+        let warmup_rounds = self.config.warmup;
+        let timed_rounds = self.config.iterations;
+
+        // Untimed passes: results are awaited but never recorded.
+        for _ in 0..warmup_rounds {
+            parallel_worker
+                .execute_local_transfer(
+                    src,
+                    dst,
+                    Arc::clone(&src_block_ids),
+                    Arc::clone(&dst_block_ids),
+                    options.clone(),
+                )?
+                .await?;
+        }
+
+        // Timed passes: one latency sample per completed transfer.
+        let mut samples = Vec::with_capacity(timed_rounds);
+        for _ in 0..timed_rounds {
+            let started = Instant::now();
+            let completion = parallel_worker.execute_local_transfer(
+                src,
+                dst,
+                Arc::clone(&src_block_ids),
+                Arc::clone(&dst_block_ids),
+                options.clone(),
+            )?;
+            completion.await?;
+            let elapsed = started.elapsed();
+            samples.push(elapsed);
+        }
+
+        Ok(samples)
+    }
+
+    /// Shuts down every worker and waits for each worker thread to exit.
+    ///
+    /// Consumes the instance. Workers are stopped one at a time: the shutdown
+    /// signal is sent, then the thread is joined before moving to the next one.
+    /// Shutdown is best effort and never fails, but a worker whose thread could
+    /// not be joined cleanly is reported on stderr instead of being ignored.
+    fn shutdown(self) {
+        for handle in self.worker_handles {
+            // A failed send presumably means the worker already dropped its receiver
+            // (it exited on its own), so there is nothing left to signal.
+            handle.shutdown_tx.send(()).ok();
+
+            // `join` returns an error when the worker thread panicked (assuming a
+            // `std::thread::JoinHandle`); surface that rather than discarding it.
+            if handle.join_handle.join().is_err() {
+                eprintln!("Warning: a benchmark worker thread panicked before it could be joined");
+            }
+        }
+    }
+}
+
+// ─── NUMA Pinning Helpers ──────────────────────────────────────────────────────
+
+/// Restricts the calling thread to the given CPU ids (best effort).
+///
+/// Pinning never aborts the benchmark:
+/// - an empty list (or one with no representable id) leaves the current
+///   affinity untouched instead of issuing a syscall that can only fail;
+/// - ids that do not fit in a `cpu_set_t` are skipped with a warning rather
+///   than being passed to `CPU_SET`;
+/// - a failing `sched_setaffinity` call is reported on stderr.
+fn pin_thread_to_cpus(cpus: &[usize]) {
+    let max_cpus = libc::CPU_SETSIZE as usize;
+
+    // SAFETY: `cpu_set_t` is a plain bitmask, so the all-zero pattern is a valid
+    // (empty) value.
+    let mut cpu_set: libc::cpu_set_t = unsafe { std::mem::zeroed() };
+
+    let mut selected = 0usize;
+    for &cpu in cpus {
+        if cpu >= max_cpus {
+            eprintln!("Warning: CPU {cpu} exceeds CPU_SETSIZE ({max_cpus}); not pinning to it");
+            continue;
+        }
+        // SAFETY: `cpu` was bounds-checked against CPU_SETSIZE above and `cpu_set`
+        // is a valid, exclusively borrowed set.
+        unsafe { libc::CPU_SET(cpu, &mut cpu_set) };
+        selected += 1;
+    }
+
+    // An empty mask is rejected by the kernel; keep the inherited affinity.
+    if selected == 0 {
+        return;
+    }
+
+    // SAFETY: pid 0 targets the calling thread, and the pointer/size pair
+    // describes the local `cpu_set`, which outlives the call.
+    let rc = unsafe {
+        libc::sched_setaffinity(0, std::mem::size_of::<libc::cpu_set_t>(), &cpu_set)
+    };
+    if rc != 0 {
+        eprintln!(
+            "Warning: failed to pin thread to CPUs {cpus:?}: {}",
+            std::io::Error::last_os_error()
+        );
+    }
+}
+
+/// Renders a CPU id list as a compact, comma-separated range string.
+///
+/// Runs of consecutive ids collapse into `first-last`, and isolated ids are
+/// printed on their own: `[0, 1, 2, 3, 8, 9, 10]` becomes `"0-3,8-10"`. The
+/// input is processed in the order given (it is neither sorted nor
+/// de-duplicated), and an empty slice yields an empty string.
+fn format_cpu_set(cpus: &[usize]) -> String {
+    let Some((&first, rest)) = cpus.split_first() else {
+        return String::new();
+    };
+
+    // Formats one closed run, using the short form for single-element runs.
+    let render = |lo: usize, hi: usize| {
+        if lo == hi {
+            format!("{lo}")
+        } else {
+            format!("{lo}-{hi}")
+        }
+    };
+
+    let mut pieces = Vec::new();
+    let (mut run_lo, mut run_hi) = (first, first);
+    for &cpu in rest {
+        // Extend the current run while ids stay consecutive.
+        if cpu == run_hi + 1 {
+            run_hi = cpu;
+            continue;
+        }
+        // Gap found: flush the finished run and start a new one at `cpu`.
+        pieces.push(render(run_lo, run_hi));
+        run_lo = cpu;
+        run_hi = cpu;
+    }
+    // Flush the run that was still open when the input ended.
+    pieces.push(render(run_lo, run_hi));
+    pieces.join(",")
+}
+
+// ─── Validation ────────────────────────────────────────────────────────────────
+
+/// Checks a `BenchConfig` for internal consistency before any worker is built.
+///
+/// # Errors
+/// Returns an error when:
+/// - `devices`, `page_sizes`, or `concurrency` is empty, or `iterations` is 0;
+/// - the block requirement overflows, or `num_blocks` cannot hold the largest
+///   transfer range plus the largest bounce-buffer pool;
+/// - disk tests are enabled and neither `disk_path` nor its parent exists;
+/// - offload is enabled without batch sizes or concurrency levels.
+fn validate_config(config: &BenchConfig) -> Result<()> {
+    // Check the sweep lists first: the sizing check below substitutes defaults
+    // for empty lists and would otherwise report a misleading block-count error.
+    ensure!(
+        !config.devices.is_empty(),
+        "must specify at least one device"
+    );
+    ensure!(
+        !config.page_sizes.is_empty(),
+        "must specify at least one page_size"
+    );
+    ensure!(
+        !config.concurrency.is_empty(),
+        "must specify at least one concurrency level"
+    );
+    ensure!(config.iterations > 0, "iterations must be > 0");
+
+    let max_conc = config.concurrency.iter().max().copied().unwrap_or(1);
+    let max_bounce = config.bounce_blocks.iter().max().copied().unwrap_or(0);
+
+    // For bidir tests we need 2x the blocks (separate ranges for each direction)
+    let multiplier = if config.isolated_only { 1 } else { 2 };
+
+    // Bounce blocks come from the tail of G2, so they must not overlap with
+    // the transfer block range [0..transfer_blocks). Checked arithmetic keeps an
+    // oversized request from wrapping around and slipping past the comparison.
+    let min_blocks = max_conc
+        .checked_mul(config.blocks_per_batch)
+        .and_then(|blocks| blocks.checked_mul(multiplier))
+        .and_then(|blocks| blocks.checked_add(max_bounce))
+        .ok_or_else(|| {
+            anyhow::anyhow!(
+                "max_concurrency ({}) * blocks_per_batch ({}) * {} + max_bounce ({}) overflows",
+                max_conc,
+                config.blocks_per_batch,
+                multiplier,
+                max_bounce,
+            )
+        })?;
+
+    ensure!(
+        config.num_blocks >= min_blocks,
+        "num_blocks ({}) must be >= max_concurrency ({}) * blocks_per_batch ({}) * {} + max_bounce ({}) = {}",
+        config.num_blocks,
+        max_conc,
+        config.blocks_per_batch,
+        multiplier,
+        max_bounce,
+        min_blocks,
+    );
+
+    // Validate disk path if G3 tests enabled
+    if let Some(ref path) = config.disk_path
+        && !config.skip_disk
+    {
+        // `Path::parent` yields an empty path for a bare relative name such as
+        // `cache`. That empty path denotes the current directory, but `exists()`
+        // reports it as missing, so treat it as present explicitly.
+        let parent_exists = path
+            .parent()
+            .is_some_and(|p| p.as_os_str().is_empty() || p.exists());
+        ensure!(
+            path.exists() || parent_exists,
+            "disk path {} does not exist",
+            path.display()
+        );
+    }
+
+    // Validate offload config
+    if config.offload {
+        ensure!(
+            !config.offload_batch_sizes.is_empty(),
+            "offload enabled but no batch sizes specified"
+        );
+        ensure!(
+            !config.offload_concurrency.is_empty(),
+            "offload enabled but no concurrency levels specified"
+        );
+    }
+
+    Ok(())
+}
+
+// ─── Main ──────────────────────────────────────────────────────────────────────
+
+/// Entry point: parses and validates the configuration, runs the benchmark
+/// suite once per page size, and writes the results plus the HTML viewer into
+/// a timestamped output directory.
+///
+/// # Errors
+/// Returns an error if the configuration is invalid, the runtime or a
+/// benchmark instance cannot be built, a benchmark fails, a result cannot be
+/// serialized, or the output files cannot be written.
+fn main() -> Result<()> {
+    // Initialize tracing for debug output
+    tracing_subscriber_init();
+
+    let cli = Cli::parse();
+    let config = build_config(cli)?;
+    validate_config(&config)?;
+
+    eprintln!("KVBM Engine Benchmark (Leader+Worker Architecture)");
+    eprintln!("  Devices: {:?}", config.devices);
+    eprintln!("  Page sizes: {:?}", config.page_sizes);
+    eprintln!("  Concurrency: {:?}", config.concurrency);
+    eprintln!("  Blocks per batch: {}", config.blocks_per_batch);
+    eprintln!("  Total blocks per pool: {}", config.num_blocks);
+    eprintln!(
+        "  Layers: {}, Inner dim: {}",
+        config.num_layers, config.inner_dim
+    );
+    eprintln!(
+        "  Warmup: {}, Iterations: {}",
+        config.warmup, config.iterations
+    );
+    eprintln!(
+        "  Disk: {}",
+        if config.skip_disk {
+            "disabled"
+        } else {
+            "enabled"
+        }
+    );
+    eprintln!(
+        "  GDS: {}",
+        if config.skip_gds {
+            "disabled"
+        } else {
+            "enabled"
+        }
+    );
+    if config.offload {
+        eprintln!("  Offload: enabled");
+        eprintln!("    Batch sizes: {:?}", config.offload_batch_sizes);
+        eprintln!("    Concurrency: {:?}", config.offload_concurrency);
+    }
+    eprintln!();
+
+    // Build a main-thread tokio runtime for the leader
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(4)
+        .enable_all()
+        .thread_name("bench-leader-tokio")
+        .build()?;
+
+    let all_results = rt.block_on(async {
+        let mut all_results: Vec<BenchResult> = Vec::new();
+
+        // Page-size sweep: rebuild full worker stack per page_size
+        // (mirrors production where model config determines page_size at startup)
+        for &page_size in &config.page_sizes {
+            eprintln!("\n{}", "=".repeat(72));
+            eprintln!("Page size: {page_size}");
+            eprintln!("{}", "=".repeat(72));
+
+            let instance = BenchInstance::new(config.clone(), page_size).await?;
+            // Stop the worker threads before propagating a benchmark failure so
+            // a failed run does not skip the orderly shutdown.
+            let outcome = instance.run_benchmarks().await;
+            instance.shutdown();
+            all_results.extend(outcome?);
+        }
+
+        Ok::<_, anyhow::Error>(all_results)
+    })?;
+
+    // Build timestamped output directory YYMMDD-HH:MM:SS-bench-engine/, placed
+    // under `config.output` when given and under the current directory otherwise.
+    let now = chrono::Local::now();
+    let dir_name = now.format("%y%m%d-%H:%M:%S-bench-engine").to_string();
+    let out_dir = if let Some(ref base) = config.output {
+        base.join(&dir_name)
+    } else {
+        PathBuf::from(&dir_name)
+    };
+    std::fs::create_dir_all(&out_dir)?;
+
+    // Write JSON Lines results; a serialization failure is returned as an error
+    // instead of panicking after the whole benchmark has already run.
+    let json_lines = all_results
+        .iter()
+        .map(|r| serde_json::to_string(r))
+        .collect::<std::result::Result<Vec<_>, serde_json::Error>>()?;
+    let json_output: String = json_lines.join("\n");
+
+    let jsonl_path = out_dir.join(format!("{dir_name}.jsonl"));
+    std::fs::write(&jsonl_path, &json_output)?;
+
+    // Copy the viewer HTML into the output directory
+    let viewer_html = include_str!("../scripts/bench_viewer.html");
+    let viewer_path = out_dir.join(format!("{dir_name}.html"));
+    std::fs::write(&viewer_path, viewer_html)?;
+
+    eprintln!(
+        "\nBenchmark complete. {} results collected.",
+        all_results.len()
+    );
+    eprintln!("Results directory: {}", out_dir.display());
+    eprintln!("  {}", jsonl_path.display());
+    eprintln!("  {}", viewer_path.display());
+    Ok(())
+}
+
+/// Installs the global `tracing` subscriber, writing to stderr and filtering
+/// according to `RUST_LOG`.
+///
+/// When `RUST_LOG` cannot be read (typically because it is unset) it is set to
+/// `error` first, so the filter built from the environment defaults to errors
+/// only.
+fn tracing_subscriber_init() {
+    let rust_log_missing = std::env::var("RUST_LOG").is_err();
+    if rust_log_missing {
+        // SAFETY: Called at program start before any threads are spawned.
+        unsafe { std::env::set_var("RUST_LOG", "error") };
+    }
+
+    let env_filter = tracing_subscriber::EnvFilter::from_default_env();
+    tracing_subscriber::fmt()
+        .with_env_filter(env_filter)
+        .with_writer(std::io::stderr)
+        .init();
+}
--- a/lib/kvbm-engine/docs/architecture.md
+++ b/lib/kvbm-engine/docs/architecture.md
+# kvbm-engine
+
+`kvbm-engine` provides distributed coordination primitives for KV Block Management (KVBM).
+It implements a tiered storage model where KV cache blocks flow between GPU memory, host
+DRAM, local disk, and object storage. The crate coordinates leaders (which own block
+metadata and make placement decisions) with workers (which execute data transfers via
+RDMA, NVMe, or object storage APIs).
+
+## Storage Tier Model
+
+| Tier | Medium | Latency | Capacity | Description |
+|------|--------|---------|----------|-------------|
+| G1 | GPU HBM | ~ns | Smallest | Active KV cache used by attention kernels |
+| G2 | Pinned DRAM | ~us | Medium | Staging area for RDMA transfers and tier promotion |
+| G3 | NVMe/SSD | ~ms | Large | Persistent warm-block storage |
+| G4 | S3/MinIO | ~100ms | Unlimited | Cold/archival object storage |
+
+## Architecture
+
+```text
+                    +-----------------+
+                    | InstanceLeader  |
+                    |  (find_matches, |
+                    |   BlockAccessor)|
+                    +--------+--------+
+                             |
+               +-------------+-------------+
+               |                           |
+      +--------v---------+        +--------v---------+
+      | CoordinatedWorker|        | CoordinatedWorker|
+      |   (rank 0)       |        |   (rank 1)       |
+      +--------+---------+        +--------+---------+
+               |                           |
+      +--------v---------+        +--------v---------+
+      | PhysicalWorker   |        | PhysicalWorker   |
+      | (TransferManager)|        | (TransferManager)|
+      +------------------+        +------------------+
+```
+
+The leader drives workers through the `ParallelWorkers` trait (`SpmdParallelWorkers`
+for SPMD execution). For onboarding, the leader creates sessions that progress through
+stages: search, hold, prepare (G3->G2), and pull (remote G2->local G2 via RDMA).
+
+## Modules
+
+| Module | Purpose |
+|--------|---------|
+| `leader` | Block coordination: matching, onboarding sessions, policy-based scanning |
+| `worker` | Transfer execution: local, RDMA, and object storage data movement |
+| `object` | G4 storage: S3/MinIO client for cold-tier block persistence |
+| `offload` | Tier demotion pipeline: batched G1->G2, G2->G3, and G2->G4 offloading |
+| `runtime` | Shared infrastructure: `KvbmRuntime`, tokio handle, NIXL agent |
+| `pubsub` | Event pub/sub: block-level notifications for cross-instance coordination |
+| `collectives` | NCCL collectives for multi-GPU synchronization (feature-gated) |
+| `testing` | Test utilities: mock workers, in-memory block managers (feature-gated) |
+
+## Feature Flags
+
+| Flag | Dependencies | Description |
+|------|-------------|-------------|
+| `default` | `["s3"]` | Default features |
+| `s3` | `aws-sdk-s3`, `aws-config`, `rayon`, `tokio-rayon`, `chrono` | S3/MinIO object storage support |
+| `collectives` | `nixl-sys`, `nccl` | NIXL + NCCL multi-GPU collectives |
+| `nccl` | `cudarc` | NCCL support via cudarc |
+| `testing-nccl` | `collectives` | Enable collectives for tests |
+| `nats` | `async-nats`, `flume` | NATS-based pub/sub transport |
+| `testing` | `kvbm-logical/testing`, `kvbm-physical/testing` | Test utilities and mock infrastructure |
+| `nvtx` | `kvbm-config/nvtx` | NVIDIA Tools Extension profiling markers |
+
+## Quick Start
+
+```rust,ignore
+use kvbm_engine::{KvbmRuntime, leader::InstanceLeader};
+
+// Build runtime from environment
+let runtime = KvbmRuntime::from_env_leader().await?;
+
+// Create a leader instance
+let leader = InstanceLeader::new(/* ... */);
+
+// Search for cached blocks
+let result = leader.find_matches(&sequence_hashes)?;
+```
--- a/lib/kvbm-engine/docs/leader.md
+++ b/lib/kvbm-engine/docs/leader.md
+# Leader Module
+
+The leader module implements block coordination for a single KVBM instance. It owns
+block metadata (via `BlockManager<G2>` and `BlockManager<G3>`), resolves cache lookups,
+and orchestrates multi-stage onboarding sessions that move blocks between storage tiers
+and across instances.
+
+## Leader Trait
+
+The `Leader` trait defines the core coordination interface:
+
+```rust,ignore
+pub trait Leader: Send + Sync {
+    fn find_matches(&self, sequence_hashes: &[SequenceHash]) -> Result<FindMatchesResult>;
+    fn find_matches_with_options(
+        &self, sequence_hashes: &[SequenceHash], options: FindMatchesOptions,
+    ) -> Result<FindMatchesResult>;
+}
+```
+
+`find_matches` searches for blocks matching the given sequence hashes and returns
+either an immediate result or an async session depending on the staging mode and
+search scope.
+
+## InstanceLeader
+
+`InstanceLeader` is the primary implementation of `Leader`. It holds:
+- `BlockManager<G2>` and optional `BlockManager<G3>` for local block registries
+- A `ParallelWorkers` instance for driving transfer execution
+- Session state for active onboarding operations
+- Remote leader connections for cross-instance coordination
+
+## FindMatchesResult
+
+The result of `find_matches` is one of two variants:
+
+- **`Ready`** -- Returned when `search_remote == false` AND `staging_mode == Hold`.
+  Blocks are held in place via RAII without creating a session. The `ReadyResult`
+  directly owns `Vec<ImmutableBlock<G2>>`.
+
+- **`AsyncSession`** -- Returned when remote search or staging is required. Contains
+  a `SessionId`, a `watch::Receiver<OnboardingStatus>` for progress tracking, and
+  an optional `SessionHandle` for deferred control.
+
+## StagingMode
+
+Controls how matched blocks are staged and when the session completes:
+
+| Mode | Behavior | Session Lifetime |
+|------|----------|-----------------|
+| `Hold` | Blocks remain in their current tiers (G2/G3) on original instances | Stays alive for deferred operations |
+| `Prepare` | G3->G2 staging on all instances; no RDMA pulls | Stays alive after staging completes |
+| `Full` | G3->G2 everywhere, then RDMA pull remote G2->local G2 | Completes when all blocks are in local G2 |
+
+The progression `Hold -> Prepare -> Full` can be driven incrementally via
+`SessionHandle::prepare()` and `SessionHandle::pull()`.
+
+## OnboardingStatus State Machine
+
+```text
+Searching
+    |
+    +---> Holding { local_g2, local_g3, remote_g2, remote_g3, pending_g4, ... }
+    |         |
+    |         +---> (prepare) ---> Preparing { matched, staging_local, staging_remote }
+    |                                  |
+    +---> Preparing ------------------>+
+    |                                  |
+    |                            Prepared { local_g2, remote_g2 }
+    |                                  |
+    |                                  +---> (pull) ---> Staging { matched, ..., pulling }
+    |                                                        |
+    +---> Staging ------------------------------------------>+
+                                                             |
+                                                        Complete { matched_blocks }
+```
+
+Each status variant carries counters for progress tracking and cost analysis.
+`Holding` includes G4 load tracking (`pending_g4`, `loaded_g4`, `failed_g4`).
+
+## SessionHandle
+
+`SessionHandle` provides deferred control over `Hold` and `Prepare` sessions:
+
+- `prepare()` -- Trigger G3->G2 staging (Hold -> Prepare transition)
+- `pull()` -- Trigger RDMA pull of remote G2->local G2 (Prepare -> Complete)
+- `cancel()` -- Cancel session and release all held blocks
+
+Not available for `StagingMode::Full` (which runs to completion automatically).
+
+## BlockAccessor
+
+`BlockAccessor` provides a stateless, `Send + Sync` interface for policy-based
+block scanning. Each `find()` call independently searches G2 then G3, acquiring
+blocks via RAII. The companion `PolicyContext` adds result collection via
+`yield_item()` for streaming scan results back to the caller.
--- a/lib/kvbm-engine/docs/object.md
+++ b/lib/kvbm-engine/docs/object.md
+# Object Storage Module
+
+The object module provides traits and implementations for storing KV cache
+blocks in object storage systems (S3, MinIO). This corresponds to the G4
+(object store) tier in the storage hierarchy.
+
+## ObjectBlockOps Trait
+
+The primary trait for block-level object storage operations:
+
+| Method | Purpose |
+|--------|---------|
+| `has_blocks(keys)` | Check existence and size of blocks |
+| `put_blocks(keys, src_layout, block_ids)` | Upload blocks using logical layout handle |
+| `get_blocks(keys, dst_layout, block_ids)` | Download blocks using logical layout handle |
+| `put_blocks_with_layout(keys, layout, block_ids)` | Upload using resolved physical layout |
+| `get_blocks_with_layout(keys, layout, block_ids)` | Download using resolved physical layout |
+
+### Logical vs Physical Layout
+
+The trait offers two APIs for put/get:
+
+- **Logical** (`put_blocks` / `get_blocks`): Takes a `LogicalLayoutHandle` (G1, G2, G3).
+  Workers resolve this to their own physical layout internally. Used by the leader
+  (which doesn't have physical layouts) and by `CoordinatedWorker`.
+- **Physical** (`put_blocks_with_layout` / `get_blocks_with_layout`): Takes a resolved
+  `PhysicalLayout` directly. Used by `PhysicalWorker` after resolving its handles, and
+  by `S3ObjectBlockClient` which performs the actual I/O.
+
+## Key Formatting
+
+Keys map `SequenceHash` values to object storage paths:
+
+- **`DefaultKeyFormatter`**: Uses the hash's Display representation
+  (e.g., `0:abc123`). Suitable for single-worker scenarios.
+- **`RankPrefixedKeyFormatter`**: Prefixes with worker rank
+  (e.g., `0/0:abc123`). Required for SPMD workers where multiple workers
+  store the same logical block with different physical data.
+
+The `create_key_formatter(rank)` factory returns the appropriate formatter.
+
+## ObjectLockManager
+
+Distributed locking protocol for coordinated offloads to prevent duplicate
+uploads:
+
+```text
+has_meta(hash)
+  → true  → skip (already offloaded)
+  → false → try_acquire_lock(hash)
+              → true  → transfer → create_meta(hash) → release_lock(hash)
+              → false → skip (another instance owns it)
+```
+
+Uses conditional PUT (`If-None-Match: *`) for lock acquisition with deadline-based
+expiry for stale lock recovery.
+
+## S3 Implementation
+
+The `s3` submodule (feature-gated behind `s3`) provides:
+
+- **`S3ObjectBlockClient`**: Implements `ObjectBlockOps` for S3-compatible storage.
+  Supports concurrent uploads/downloads via a `rayon` thread pool and
+  contiguous-memory fast paths for aligned block data.
+- **`S3LockManager`**: Implements `ObjectLockManager` using S3 conditional writes.
+
+## Factory Functions
+
+- **`create_object_client(config, rank)`**: Creates an `Arc<dyn ObjectBlockOps>`
+  from configuration. Selects the backend (S3 or future alternatives) based on
+  `ObjectClientConfig`.
+- **`create_lock_manager(config, instance_id)`**: Creates an
+  `Arc<dyn ObjectLockManager>` for distributed lock coordination.
--- a/lib/kvbm-engine/docs/offload-developer.md
+++ b/lib/kvbm-engine/docs/offload-developer.md
+# Offload Module Developer Guide
+
+This document provides implementation details for developers working on the offload pipeline. For high-level concepts and policy statements, see [offload.md](offload.md).
+
+## Container-Based Architecture
+
+### OffloadContainer
+
+The container is the fundamental unit that flows through the pipeline:
+
+```rust,ignore
+struct OffloadContainer<T: BlockMetadata> {
+    /// Source blocks to transfer
+    blocks: Vec<SourceBlock<T>>,
+    /// Precondition event - Some before PreconditionAwaiter, None after
+    precondition: Option<EventHandle>,
+    /// Cancellation token (cloned from TransferHandle)
+    cancel_token: CancellationToken,
+}
+
+impl<T: BlockMetadata> OffloadContainer<T> {
+    /// Check if this container has been cancelled
+    fn is_cancelled(&self) -> bool {
+        self.cancel_token.is_requested()
+    }
+
+    /// Upgrade all blocks from Weak → Strong
+    /// Returns None if any block was evicted
+    fn upgrade(self) -> Option<UpgradedContainer<T>> {
+        // Implementation upgrades each SourceBlock
+    }
+}
+```
+
+### OffloadBatch
+
+Batches group multiple containers for efficient transfer:
+
+```rust,ignore
+struct OffloadBatch<T: BlockMetadata> {
+    containers: Vec<OffloadContainer<T>>,
+}
+
+impl<T: BlockMetadata> OffloadBatch<T> {
+    /// Total blocks across all containers
+    fn total_blocks(&self) -> usize {
+        self.containers.iter().map(|c| c.blocks.len()).sum()
+    }
+
+    /// Remove cancelled containers, return count removed
+    fn sweep_cancelled(&mut self) -> usize {
+        let before = self.containers.len();
+        self.containers.retain(|c| !c.is_cancelled());
+        before - self.containers.len()
+    }
+
+    /// Check if batch is empty
+    fn is_empty(&self) -> bool {
+        self.containers.is_empty()
+    }
+}
+```
+
+### Data Transformations Per Stage
+
+| Stage | Input | Output | Transform |
+|-------|-------|--------|-----------|
+| Enqueue | `Vec<SourceBlock<T>>` | `OffloadContainer<T>` | Wrap with token + precondition |
+| PolicyEvaluator | `OffloadContainer<T>` | `OffloadContainer<T>` | Filter `blocks` vec |
+| PreconditionAwaiter | `OffloadContainer<T>` | `OffloadContainer<T>` | Await event, set `precondition = None` |
+| Batcher | `OffloadContainer<T>` | `OffloadBatch<T>` | Group by total block count |
+| TransferExecutor | `OffloadBatch<T>` | `Vec<ImmutableBlock<T>>` | Sweep → Upgrade → Flat map |
+
+---
+
+## Token-Based Cancellation
+
+### Token Lifecycle
+
+1. **Creation**: At enqueue, create a `CancellationToken` pair
+2. **Distribution**: Handle gets the token, container gets a clone
+3. **Propagation**: Token travels with container through pipeline
+4. **Termination**: Token is consumed at upgrade (commitment point)
+
+```rust,ignore
+// At enqueue
+let (cancel_token, cancel_updater) = CancellationToken::new();
+
+// Give to handle
+let handle = TransferHandle { cancel_token: cancel_token.clone(), ... };
+
+// Give to container
+let container = OffloadContainer {
+    blocks,
+    precondition: Some(event),
+    cancel_token: cancel_token.clone(),
+};
+```
+
+### CancellationToken API
+
+```rust,ignore
+impl CancellationToken {
+    /// Request cancellation (called by handle)
+    fn request(&self);
+
+    /// Check if cancellation requested
+    fn is_requested(&self) -> bool;
+
+    /// Await cancellation request (for select!)
+    async fn wait_requested(&self);
+
+    /// Await confirmation that all blocks released
+    fn wait_confirmed(&self) -> CancelConfirmation;
+}
+```
+
+### PreconditionAwaiter Select Pattern
+
+The awaiter uses `select!` to handle both event completion and cancellation:
+
+```rust,ignore
+async fn process(&self, mut container: OffloadContainer<T>) {
+    // Fast path: event already satisfied
+    if let Some(ref event) = container.precondition {
+        if event.is_done() {
+            container.precondition = None;
+            self.output_queue.push(container);
+            return;
+        }
+    }
+
+    // Slow path: select on event OR cancellation
+    if let Some(event) = container.precondition.take() {
+        tokio::select! {
+            _ = event.wait() => {
+                // Event satisfied, propagate
+                self.output_queue.push(container);
+            }
+            _ = container.cancel_token.wait_requested() => {
+                // Cancelled while waiting - drop container
+                tracing::debug!("Container cancelled during precondition wait");
+                // container dropped here
+            }
+        }
+    } else {
+        // No precondition, pass through
+        self.output_queue.push(container);
+    }
+}
+```
+
+### CancellableQueue Sweep Mechanics
+
+The queue supports active cancellation via sweeping:
+
+```rust,ignore
+impl<T: HasCancellationToken> CancellableQueue<T> {
+    /// Push item, reject if already cancelled
+    fn push(&self, item: T) -> bool {
+        if item.cancel_token().is_requested() {
+            return false;
+        }
+        self.inner.push(item);
+        true
+    }
+
+    /// Pop, skipping cancelled items
+    fn pop_valid(&self) -> Option<T> {
+        loop {
+            match self.inner.pop() {
+                Some(item) if item.cancel_token().is_requested() => continue,
+                other => return other,
+            }
+        }
+    }
+
+    /// Remove all cancelled items
+    fn sweep(&self) -> usize {
+        let mut removed = 0;
+        let mut kept = Vec::new();
+
+        while let Some(item) = self.inner.pop() {
+            if item.cancel_token().is_requested() {
+                removed += 1;
+            } else {
+                kept.push(item);
+            }
+        }
+
+        for item in kept {
+            self.inner.push(item);
+        }
+        removed
+    }
+}
+```
+
+### Batch-Level Sweep
+
+For `CancellableQueue<OffloadBatch<T>>`, sweeping removes cancelled containers within batches:
+
+```rust,ignore
+fn sweep(&self) -> usize {
+    let mut removed_containers = 0;
+    let mut kept_batches = Vec::new();
+
+    while let Some(mut batch) = self.inner.pop() {
+        // Remove cancelled containers from this batch
+        removed_containers += batch.sweep_cancelled();
+
+        // Keep batch if it still has containers
+        if !batch.is_empty() {
+            kept_batches.push(batch);
+        }
+    }
+
+    for batch in kept_batches {
+        self.inner.push(batch);
+    }
+    removed_containers
+}
+```
+
+### Cancellation at Each Stage
+
+| Stage | Mechanism | Behavior |
+|-------|-----------|----------|
+| PolicyEvaluator | Token check | Check `is_requested()` between block evaluations |
+| PreconditionAwaiter | `select!` | Immediate drop if cancelled while waiting |
+| Batcher Queue | CancellableQueue | Sweep removes cancelled containers |
+| Executor Queue | CancellableQueue | Sweep removes cancelled containers from batches |
+| TransferExecutor | Final sweep | `batch.sweep_cancelled()` before upgrade |
+
+### Cancellation Boundary at Upgrade
+
+```text
+┌─────────────────────────────────────────────────────────────────────────┐
+│                        CANCELLABLE ZONE                                 │
+│                                                                         │
+│  Enqueue → PolicyEval → PrecondAwaiter → Batcher → ExecutorQueue        │
+│                                                                         │
+└─────────────────────────────────────────────────────────────────────────┘
+                                                            │
+                                                            ▼
+                                                 ┌───────────────────┐
+                                                 │  sweep_cancelled  │
+                                                 │  (last check)     │
+                                                 └───────────────────┘
+                                                            │
+                                                            ▼
+═══════════════════════════════════════════════════════════════════════════
+                              UPGRADE BOUNDARY
+═══════════════════════════════════════════════════════════════════════════
+                                                            │
+                                                            ▼
+┌─────────────────────────────────────────────────────────────────────────┐
+│                        COMMITTED ZONE                                   │
+│                                                                         │
+│  Upgrade → Flat Map → Transfer                                          │
+│                                                                         │
+└─────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## TransferExecutor Design
+
+### Sweep → Upgrade → Flat Map → Transfer
+
+```rust,ignore
+impl<T: BlockMetadata, D: TransferDestination> TransferExecutor<T, D> {
+    async fn run(self) {
+        while let Some(mut batch) = self.input_queue.pop() {
+            // 1. SWEEP: Last cancellation check
+            batch.sweep_cancelled();
+
+            if batch.is_empty() {
+                continue;
+            }
+
+            // 2. UPGRADE: Weak → Strong (commitment point)
+            let upgraded: Vec<UpgradedContainer<T>> = batch
+                .containers
+                .into_iter()
+                .filter_map(|c| c.upgrade())
+                .collect();
+
+            if upgraded.is_empty() {
+                continue;
+            }
+
+            // 3. FLAT MAP: Consolidate into single vec
+            let all_blocks: Vec<ImmutableBlock<T>> = upgraded
+                .into_iter()
+                .flat_map(|c| c.blocks)
+                .collect();
+
+            // 4. TRANSFER: Execute via destination
+            self.destination.execute_transfer(all_blocks).await;
+        }
+    }
+}
+```
+
+### Generic TransferDestination Trait
+
+```rust,ignore
+trait TransferDestination {
+    type Output;
+
+    async fn execute_transfer(
+        &self,
+        blocks: Vec<ImmutableBlock<T>>,
+        src_layout: LogicalLayoutHandle,
+    ) -> Result<Self::Output>;
+}
+```
+
+### Block Destination (G2, G3)
+
+For transfers to another `BlockManager`:
+
+```rust,ignore
+struct BlockDestination<Dst: BlockMetadata> {
+    leader: Arc<InstanceLeader>,
+    dst_manager: Arc<BlockManager<Dst>>,
+    src_layout: LogicalLayoutHandle,
+    dst_layout: LogicalLayoutHandle,
+}
+
+impl<Dst: BlockMetadata> TransferDestination for BlockDestination<Dst> {
+    type Output = Vec<ImmutableBlock<Dst>>;
+
+    async fn execute_transfer(&self, blocks: Vec<ImmutableBlock<_>>) -> Result<Self::Output> {
+        // 1. Allocate destination blocks
+        let dst_blocks = self.dst_manager.allocate_blocks(blocks.len())?;
+
+        // 2. Execute transfer via leader
+        let notification = self.leader.execute_local_transfer(
+            self.src_layout,
+            self.dst_layout,
+            src_block_ids,
+            dst_block_ids,
+        )?;
+        notification.await?;
+
+        // 3. Register destination blocks
+        let registered = dst_blocks.into_iter()
+            .zip(sequence_hashes)
+            .map(|(block, hash)| self.dst_manager.register_with_hash(block, hash))
+            .collect();
+
+        Ok(registered)
+    }
+}
+```
+
+### Object Destination (G4)
+
+For transfers to object storage:
+
+```rust,ignore
+struct ObjectDestination {
+    object_ops: Arc<dyn ObjectBlockOps>,
+    src_layout: LogicalLayoutHandle,
+    lock_manager: Option<Arc<dyn ObjectLockManager>>,
+}
+
+impl TransferDestination for ObjectDestination {
+    type Output = Vec<SequenceHash>;
+
+    async fn execute_transfer(&self, blocks: Vec<ImmutableBlock<_>>) -> Result<Self::Output> {
+        // 1. Extract keys and block IDs
+        let keys: Vec<SequenceHash> = blocks.iter().map(|b| b.sequence_hash()).collect();
+        let block_ids: Vec<BlockId> = blocks.iter().map(|b| b.block_id()).collect();
+
+        // 2. Execute object put
+        let results = self.object_ops.put_blocks(keys.clone(), self.src_layout, block_ids).await;
+
+        // 3. Handle lock management
+        if let Some(lock_manager) = &self.lock_manager {
+            for hash in &successful_hashes {
+                lock_manager.create_meta(*hash).await?;
+                lock_manager.release_lock(*hash).await?;
+            }
+        }
+
+        Ok(successful_hashes)
+    }
+}
+```
+
+---
+
+## Batcher Design
+
+### Grouping Containers
+
+The batcher accumulates containers and flushes when:
+- Total blocks reach `max_batch_size`
+- Flush interval expires and `min_batch_size` is met
+- All blocks for a transfer have been processed (sentinel flush)
+
+```rust,ignore
+struct Batcher<T: BlockMetadata> {
+    config: BatchConfig,
+    input_queue: Arc<CancellableQueue<OffloadContainer<T>>>,
+    output_queue: Arc<CancellableQueue<OffloadBatch<T>>>,
+    current_batch: OffloadBatch<T>,
+}
+
+impl<T: BlockMetadata> Batcher<T> {
+    async fn run(mut self) {
+        let mut flush_timer = tokio::time::interval(self.config.flush_interval);
+
+        loop {
+            tokio::select! {
+                _ = flush_timer.tick() => {
+                    self.try_flush().await;
+                }
+                Some(container) = self.input_queue.pop_valid() => {
+                    self.current_batch.containers.push(container);
+
+                    if self.current_batch.total_blocks() >= self.config.max_batch_size {
+                        self.flush().await;
+                    }
+                }
+            }
+        }
+    }
+
+    async fn try_flush(&mut self) {
+        if self.current_batch.total_blocks() >= self.config.min_batch_size {
+            self.flush().await;
+        }
+    }
+
+    async fn flush(&mut self) {
+        if self.current_batch.is_empty() {
+            return;
+        }
+
+        let batch = std::mem::replace(
+            &mut self.current_batch,
+            OffloadBatch { containers: Vec::new() },
+        );
+
+        self.output_queue.push(batch);
+    }
+}
+```
+
+### Preserving Per-Container Cancellability
+
+Each container retains its own `cancel_token`. When the batch is in the executor queue:
+
+1. **Sweep at queue level**: Removes cancelled containers from batches
+2. **Sweep at executor**: Final check before upgrade
+3. **Partial cancellation**: Some containers may be cancelled while others proceed
+
+---
+
+## Extension Rules
+
+### Adding a New Policy
+
+1. Implement the `OffloadPolicy` trait
+2. Add to pipeline configuration
+3. Policy must be fast or async-compatible
+
+```rust,ignore
+trait OffloadPolicy<T: BlockMetadata>: Send + Sync {
+    fn name(&self) -> &str;
+    fn evaluate(&self, ctx: &EvalContext<T>) -> impl Future<Output = Result<bool>>;
+}
+```
+
+### Adding a New Destination Type
+
+1. Implement `TransferDestination` trait
+2. Create a new pipeline variant or use generic executor
+3. Handle destination-specific registration/cleanup
+
+### Maintaining Cancellation Invariants
+
+When modifying the pipeline:
+
+1. **Never skip the upgrade boundary** - It's the commitment point
+2. **Always sweep before upgrade** - Last chance to cancel
+3. **Token must travel with container** - Don't strip it prematurely
+4. **Batches preserve container identity** - Until flat map
+
+---
+
+## Testing Guidance
+
+### Unit Tests
+
+- Test each stage in isolation
+- Mock `CancellationToken` for cancel scenarios
+- Verify sweep removes correct items
+
+### Integration Tests
+
+- Test full pipeline with cancel at each stage
+- Verify no orphaned blocks after cancellation
+- Test partial batch cancellation
+
+### Performance Tests
+
+- Measure overhead of cancellation checks
+- Benchmark sweep operation at scale
+- Profile upgrade → flat map → transfer path
+
+
+
+
+
--- a/lib/kvbm-engine/docs/offload.md
+++ b/lib/kvbm-engine/docs/offload.md
+# Offload Module
+
+The offload module manages the asynchronous transfer of KV cache blocks between storage tiers. It provides a pipeline-based architecture for evaluating, batching, and executing block transfers with full cancellation support.
+
+## Overview
+
+Offloading moves blocks from a source tier (e.g., GPU memory) to a destination tier (e.g., host memory, remote storage, or object storage). The pipeline ensures:
+
+- **Policy-based filtering**: Only blocks meeting criteria are transferred
+- **Batched execution**: Blocks are grouped for efficient transfer
+- **Cancellation support**: Transfers can be cancelled at any point before commitment
+- **Precondition synchronization**: Transfers wait for forward pass completion
+
+## Pipeline Architecture
+
+```text
+┌─────────────────┐     ┌─────────────────────┐     ┌─────────────────────┐     ┌──────────────────┐
+│ PolicyEvaluator │────►│ PreconditionAwaiter │────►│       Batcher       │────►│ TransferExecutor │
+└─────────────────┘     └─────────────────────┘     └─────────────────────┘     └──────────────────┘
+                                                             ▲                          ▲
+                                                             │                          │
+                                                    CancellableQueue          CancellableQueue
+                                                             │                          │
+                                                             └──────── CancelSweeper ───┘
+```
+
+### Stages
+
+| Stage | Purpose |
+|-------|---------|
+| **PolicyEvaluator** | Filters blocks based on configured policies (frequency, presence, etc.) |
+| **PreconditionAwaiter** | Waits for forward pass completion before proceeding |
+| **Batcher** | Groups containers into batches based on total block count |
+| **TransferExecutor** | Upgrades blocks and executes the actual transfer |
+
+## Container Data Model
+
+The fundamental unit flowing through the pipeline is an **OffloadContainer**:
+
+```rust,ignore
+struct OffloadContainer<T: BlockMetadata> {
+    /// The blocks to offload
+    blocks: Vec<SourceBlock<T>>,
+    /// Precondition event (forward pass completion)
+    precondition: Option<EventHandle>,
+    /// Cancellation token
+    cancel_token: CancellationToken,
+}
+```
+
+Containers are grouped into batches for efficient transfer:
+
+```rust,ignore
+struct OffloadBatch<T: BlockMetadata> {
+    /// Multiple containers, each independently cancellable
+    containers: Vec<OffloadContainer<T>>,
+}
+```
+
+## Design Principles
+
+### P1: Container is the Unit of Cancellation
+
+Individual blocks within a container are not independently cancellable. When a container is cancelled, all its blocks are cancelled together.
+
+### P2: Token Travels with Container
+
+Each container carries its own `CancellationToken`, cloned from the `TransferHandle` at enqueue time. The token travels with the container through all pipeline stages until upgrade.
+
+### P3: Upgrade is the Commitment Boundary
+
+The upgrade step (Weak → Strong) is the point of no return:
+
+- **Before upgrade**: Containers can be cancelled via sweep or token check
+- **After upgrade**: We own the blocks; cancellation no longer applies
+
+### P4: Sweep Before Upgrade
+
+The last cancellation check occurs immediately before upgrade. The `TransferExecutor` calls `batch.sweep_cancelled()` to remove cancelled containers before committing.
+
+### P5: Flat Map After Upgrade
+
+After upgrade, all blocks from all containers are consolidated into a single `Vec<ImmutableBlock<T>>` for efficient batch transfer. Per-container identity is lost at this point.
+
+### P6: PreconditionAwaiter Uses Select
+
+The precondition awaiter uses `select!` to wait on both the precondition event and the cancellation token. If cancellation is requested while waiting, the container is dropped immediately.
+
+## Configuration
+
+Pipeline behavior is controlled via `PipelineConfig`:
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `batch_config.max_batch_size` | 64 | Maximum blocks per batch |
+| `batch_config.min_batch_size` | 8 | Minimum blocks before flush |
+| `batch_config.flush_interval` | 10ms | Time before flushing partial batch |
+| `policy_timeout` | 100ms | Timeout for policy evaluation |
+| `sweep_interval` | 10ms | Interval for cancel sweeper |
+| `max_concurrent_transfers` | 1 | Concurrent transfer batches |
+
+## Usage
+
+### Enqueueing Blocks
+
+```rust,ignore
+let handle = pipeline.enqueue(source_blocks, precondition_event);
+
+// Track progress
+println!("Status: {:?}", handle.status());
+
+// Wait for completion
+let result = handle.wait().await?;
+```
+
+### Cancelling a Transfer
+
+```rust,ignore
+// Request cancellation and wait for confirmation
+handle.cancel().await;
+// All blocks are now released
+```
+
+## Related Documentation
+
+- [offload-developer.md](offload-developer.md) - Implementation details and extension rules
+
+
+
+
+
--- a/lib/kvbm-engine/docs/onboarding.md
+++ b/lib/kvbm-engine/docs/onboarding.md
+# Onboarding Guide
+
+Welcome to `kvbm-engine`. This document walks you through the core abstractions
+in the crate so you can orient yourself quickly and start contributing.
+
+`kvbm-engine` is the distributed coordination layer for KV cache block management
+(KVBM). It sits above `kvbm-physical` (which moves bytes) and `kvbm-logical`
+(which tracks block metadata), stitching them together into a system where
+**leaders** make decisions about blocks and **workers** execute data transfers
+across a tiered storage hierarchy:
+
+```text
+G1 (GPU HBM)  →  G2 (Pinned DRAM)  →  G3 (NVMe/SSD)  →  G4 (S3/MinIO)
+```
+
+The central design tension is between **logical** and **physical**. Leaders think
+in sequence hashes and block identities — they never touch raw memory. Workers
+think in layout handles, transfer managers, and DMA descriptors — they never make
+placement decisions. The engine holds these two worlds together.
+
+---
+
+## The Worker
+
+A worker is the physical side of the logical-physical dichotomy. The core
+implementation is `PhysicalWorker`, a thin coordination wrapper around
+`kvbm-physical`.
+
+A `PhysicalWorker` owns:
+
+- A **`TransferManager`** — the `kvbm-physical` engine that actually moves data
+  between memory regions via NIXL (RDMA/UCX), NVMe, or object storage APIs.
+- **Layout handles** for up to three tiers (`g1_handle`, `g2_handle`,
+  `g3_handle`) — these are physical memory region registrations that the transfer
+  manager uses to know *where* data lives on this process.
+- A map of **remote handles** — physical handles imported from peer workers,
+  enabling RDMA pulls.
+
+Workers implement two traits:
+
+**`WorkerTransfers`** defines the transfer operations:
+- `execute_local_transfer(src, dst, block_ids, ...)` — move blocks between tiers
+  within this worker (e.g. G2 → G1).
+- `execute_remote_onboard(remote_desc, dst, block_ids, ...)` — RDMA pull from a
+  remote worker into a local layout.
+- `execute_remote_offload(src, remote_desc, block_ids, ...)` — push local data
+  to a remote descriptor.
+- `connect_remote(instance_id, metadata)` — import a peer's NIXL metadata so we
+  can do RDMA to/from them.
+
+**`Worker`** extends `WorkerTransfers` with layout handle accessors and metadata
+import/export for RDMA setup.
+
+All transfer operations return a `TransferCompleteNotification` — an async handle
+you await to know when the data movement is done. This is how the system achieves
+overlap between transfer scheduling and transfer execution.
+
+---
+
+## Workers as Remote Services (Velo)
+
+In a multi-process deployment, each worker runs in its own process. Rather than
+calling `PhysicalWorker` methods directly, we wrap it as a Velo RPC service.
+
+**`VeloWorkerService`** takes a `PhysicalWorker` and registers handlers for
+every `WorkerTransfers` and `Worker` method (e.g. `kvbm.worker.local_transfer`,
+`kvbm.worker.remote_onboard`, etc.). The service lives in the worker process.
+
+**`VeloWorkerClient`** implements the same `Worker` trait but serializes each
+call into a Velo message, sends it to the remote service, and returns a
+`TransferCompleteNotification` backed by a completion event.
+
+The key insight: **from the leader's perspective, local and remote workers are
+interchangeable.** Both implement `Worker`. The leader never knows (or cares)
+whether it is talking to an in-process `PhysicalWorker` or a `VeloWorkerClient`
+that crosses a process boundary.
+
+```text
+Leader process                          Worker process
+┌───────────────────┐                   ┌───────────────────┐
+│  InstanceLeader   │                   │                   │
+│        │          │                   │                   │
+│  CoordinatedWorker│                   │                   │
+│        │          │                   │                   │
+│  VeloWorkerClient │ ── Velo RPC ──▶  │ VeloWorkerService │
+│                   │                   │        │          │
+│                   │                   │  PhysicalWorker   │
+│                   │                   │  (TransferManager)│
+└───────────────────┘                   └───────────────────┘
+```
+
+There is one more wrapper to mention: **`CoordinatedWorker`**. This lives in the
+leader process and adds coordination state on top of a `Worker` (local or
+remote). It tracks the leader's view of which layout handles map to which
+remote instances and ranks. When the leader says "pull blocks from Instance B,
+rank 0", the `CoordinatedWorker` resolves the correct physical handle and
+delegates to the inner `Worker`.
+
+---
+
+## Worker Groups
+
+Workers can be organized into groups that present a single-worker interface to
+the leader. The `ParallelWorkers` trait is the group-level analog of `Worker`.
+
+### Tensor Parallel (SPMD)
+
+`SpmdParallelWorkers` is the default group implementation. It broadcasts every
+operation to all N workers in parallel — the SPMD (Single Program, Multiple Data)
+model.
+
+In a typical tensor-parallel deployment, each GPU holds its own shard of every
+KV cache block. When the leader says "transfer blocks [1, 2, 3] from G2 to G1",
+the SPMD group fans this out to every rank. Each rank executes the same transfer
+on its own shard. Results are aggregated before returning to the leader.
+
+```text
+Leader: "transfer blocks 1,2,3 from G2 → G1"
+         │
+   SpmdParallelWorkers
+         │
+    ┌────┼────┐
+    ▼    ▼    ▼
+  Rank0 Rank1 Rank2    (each transfers its own shard)
+```
+
+### Replicated Data (MLA)
+
+For Multi-head Latent Attention (MLA), KV data is replicated rather than sharded.
+The `ReplicatedDataWorker` (feature-gated behind `collectives`) implements a
+different strategy:
+
+- **Rank 0** is the only worker with G2 and G3 storage. It performs all
+  tier-to-tier transfers (G3 → G2 → G1).
+- **Ranks 1..N** only have G1. They receive data from rank 0 via NCCL
+  `broadcast`.
+
+This means the leader can still say "onboard these blocks" and the group handles
+the asymmetry internally — rank 0 does the heavy lifting, then broadcasts to
+everyone else.
+
+### The Power of the Abstraction
+
+These two strategies — symmetric sharding and replicated broadcast — are very
+different physically, but the leader drives both through the same
+`ParallelWorkers` / `WorkerTransfers` interface. This is the core value of worker
+groups: **different parallelism strategies behind a uniform API**.
+
+The abstraction is admittedly incomplete — more parallelism patterns will need
+more group implementations — but it is sufficient for the two use cases presented
+and demonstrates the pattern for extending it.
+
+---
+
+## The Leader
+
+The leader is the logical counterpart to the worker. `InstanceLeader` owns the
+logical view of all block data, regardless of how it is physically distributed
+across workers and tiers.
+
+An `InstanceLeader` holds:
+
+- A **`BlockRegistry`** for deduplication — tracks which sequence hashes have
+  been seen.
+- A **`BlockManager<G2>`** (required) and optional **`BlockManager<G3>`** — the
+  logical block stores for host DRAM and disk.
+- A list of **workers** (via `CoordinatedWorker`) and an optional
+  **`SpmdParallelWorkers`** group.
+- A map of **sessions** for distributed onboarding (more on this below).
+- Optional **remote leader** references for cross-instance coordination.
+
+### find_matches
+
+The core entry point is `find_matches(sequence_hashes)`. Given a list of
+sequence hashes, the leader determines which blocks already exist and where:
+
+1. Search the local G2 `BlockManager` for matches.
+2. Search the local G3 `BlockManager` for any remaining hashes.
+3. Optionally search remote leaders via distributed sessions.
+
+The result is either:
+- **`Ready`** — all requested blocks were found locally in G2; the caller gets
+  immediate RAII `BlockHolder`s.
+- **`AsyncSession`** — some blocks require staging (G3 → G2) or remote transfers;
+  the caller gets a session handle with a status watch channel.
+
+### BlockHolder (RAII Ownership)
+
+`BlockHolder<T>` (where T is `G2` or `G3`) is an RAII guard that holds blocks
+during a session. While held, those blocks cannot be evicted. When the holder is
+dropped, blocks are released. This prevents leaks even if session handling
+panics.
+
+### Block Scanning
+
+`InstanceLeader` also exposes `scan_with_policy` — a flexible iteration
+mechanism where the caller provides a closure that searches for blocks using a
+`BlockAccessor` (which wraps both G2 and G3 managers) and yields results through
+a `PolicyContext`. This enables custom scanning strategies (contiguous runs,
+LFU-sorted scans) without exposing block manager internals.
+
+---
+
+## Instances
+
+An **Instance** is the deployment unit: one leader plus its workers.
+
+```text
+┌─ Instance (TP=2) ──────────────────────────┐
+│                                             │
+│   InstanceLeader                            │
+│       │                                     │
+│   SpmdParallelWorkers                       │
+│       ├── Worker (rank 0, GPU 0)            │
+│       └── Worker (rank 1, GPU 1)            │
+│                                             │
+└─────────────────────────────────────────────┘
+```
+
+In a single-GPU setup, the instance is simply one leader and one worker.
+In tensor-parallel, it is one leader driving an SPMD group.
+
+The leader drives; the workers execute. The leader never touches bytes; the
+workers never make placement decisions.
+
+---
+
+## Transfer Classification
+
+Transfers fall into three classes based on scope:
+
+### Local (intra-worker, intra-instance)
+
+Tier-to-tier transfers within a single worker: G1 ↔ G2, G2 ↔ G3, etc.
+
+This is the bread and butter of a tensor-parallel deployment. Each worker
+independently moves its own shard between tiers. The SPMD group broadcasts the
+same logical operation to all ranks, and each rank executes it on its own
+physical layouts.
+
+### Intra (inter-worker, intra-instance)
+
+Transfers between workers within the same instance. The motivating example is the
+MLA/replicated data pattern: rank 0 performs a G3 → G2 → G1 transfer, then
+broadcasts its G1 data to all other ranks via NCCL. The data crosses worker
+boundaries but stays within the same instance.
+
+### Inter (inter-worker, inter-instance)
+
+Transfers between workers on different instances. This is **distributed KVBM** —
+the peer-to-peer model described in the next section.
+
+```text
+           ┌──────────────────────────────┐
+           │          Local               │
+           │    (intra-worker, intra-inst) │
+           │     G2 ←→ G1 on Rank 0      │
+           └──────────────────────────────┘
+
+  ┌──────────────────────────────────────────────┐
+  │              Intra                            │
+  │       (inter-worker, intra-inst)              │
+  │   Rank 0 ──NCCL bcast──▶ Rank 1..N           │
+  └──────────────────────────────────────────────┘
+
+  ┌──────────────────────────────────────────────────────────────┐
+  │                       Inter                                  │
+  │              (inter-worker, inter-inst)                       │
+  │   Instance A, Rank 0 ──RDMA──▶ Instance B, Rank 0           │
+  └──────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Distributed KVBM (Inter-Instance Transfers)
+
+Distributed KVBM is a peer-to-peer model where two or more instances coordinate
+block ownership through **sessions**, then trigger direct worker-to-worker
+transfers.
+
+### Sessions
+
+A session is a short-lived coordination protocol between two instances. There are
+two roles:
+
+- **`InitiatorSession`** — the requesting side (e.g. a Prefill instance that
+  needs blocks).
+- **`ResponderSession`** — the providing side (e.g. a Decode instance that has
+  blocks cached).
+
+Sessions progress through a state machine:
+
+```text
+Searching ──▶ Holding ──▶ Staging ──▶ Ready ──▶ Complete
+                                                  │
+                                             (or Failed)
+```
+
+- **Searching**: The initiator asks the responder to search its local block
+  managers.
+- **Holding**: The responder has found blocks and holds them via `BlockHolder` to
+  prevent eviction.
+- **Staging**: G3 → G2 promotion is in progress on the responder (if blocks were
+  on disk). NIXL descriptors are prepared for RDMA.
+- **Ready**: Blocks are in G2 on the responder and RDMA-ready.
+- **Complete**: The initiator has pulled all blocks. The session is torn down.
+
+### Worked Example: TP=2 Cross-Instance Transfer
+
+Suppose Instance A (Prefill, TP=2) wants KV blocks for sequence hashes
+`[hash_1, hash_2]` from Instance B (Decode, TP=2).
+
+```text
+Instance A (Prefill, TP=2)              Instance B (Decode, TP=2)
+┌──────────────────────┐                ┌──────────────────────┐
+│ Leader A             │                │ Leader B             │
+│  ├─ Worker A0 (GPU0) │                │  ├─ Worker B0 (GPU0) │
+│  └─ Worker A1 (GPU1) │                │  └─ Worker B1 (GPU1) │
+└──────────────────────┘                └──────────────────────┘
+```
+
+The flow:
+
+1. **Leader A creates a session** with Leader B, sending the sequence hashes
+   `[hash_1, hash_2]` it is looking for.
+
+2. **Leader B receives the request** (`ResponderSession`). It searches its G2
+   and G3 block managers for matches.
+
+3. **Leader B acquires ownership** of the matched blocks via `BlockHolder`,
+   preventing eviction during the transfer.
+
+4. **Leader B responds** with what it found: which hashes matched, their
+   tier locations, and NIXL descriptors that allow RDMA access to the G2 blocks.
+
+5. **Leader A instructs its workers to pull.** Since both instances use TP=2, the
+   mapping is 1:1 — rank 0 on A pulls from rank 0 on B, rank 1 on A pulls from
+   rank 1 on B. Each pull is a direct RDMA transfer between the worker processes
+   using NIXL.
+
+6. **Session completes.** Leader B releases its `BlockHolder`s. Leader A now has
+   the blocks in its own G2.
+
+The rank mapping is handled by `route_local_to_remote` in `LeaderState`, which
+supports asymmetric configurations too (e.g. TP=4 pulling from TP=2).
+
+### Transport
+
+Session messages travel over **Velo** (the project's RPC framework).
+`VeloLeaderService` registers handlers for `kvbm.leader.onboard`,
+`kvbm.leader.remote_session`, and `kvbm.leader.session` — these dispatch
+incoming messages to the appropriate per-session channels.
+
+For testing, `LocalTransport` provides direct in-process dispatch without
+network overhead.
+
+---
+
+## Objects vs Blocks
+
+Throughout the crate, you will encounter two distinct representations of
+KV cache data:
+
+### Blocks
+
+A **Block** is the fundamental unit within tiers G1–G3. It is identified by a
+`BlockId`, associated with a `SequenceHash`, and managed by a `BlockManager`.
+Blocks have physical backing (GPU HBM, pinned DRAM, or NVMe) and support
+direct memory transfers via NIXL. The `BlockManager` handles allocation,
+eviction, and frequency tracking. Blocks are the hot-path, low-latency
+representation.
+
+### Objects
+
+An **Object** is the G4 (S3/MinIO) representation. Objects are addressed by
+**key** (derived from a `SequenceHash` via a `KeyFormatter`), not by `BlockId`.
+The `ObjectBlockOps` trait defines the interface: `has_blocks`, `put_blocks`,
+`get_blocks`.
+
+Objects exist because S3 does not support the block-oriented, handle-based access
+pattern of the lower tiers. They provide unlimited-capacity cold storage at the
+cost of higher latency and a key-value access model.
+
+For SPMD deployments, the `RankPrefixedKeyFormatter` prefixes each object key
+with the worker rank (`{rank}/{hash}`), so each worker's shard is stored
+independently.
+
+The `ObjectLockManager` provides distributed locking for G4 writes using
+conditional S3 PUTs, preventing duplicate uploads across concurrent instances.
+
+---
+
+## Where to Go Next
+
+Now that you have the conceptual model, dive into the per-module documentation
+for implementation details:
+
+| Document | Covers |
+|----------|--------|
+| [architecture.md](architecture.md) | Tier model, module map, feature flags, quick start |
+| [leader.md](leader.md) | `Leader` trait, `InstanceLeader`, `FindMatchesResult`, staging modes |
+| [worker.md](worker.md) | `Worker` / `WorkerTransfers`, `PhysicalWorker`, `CoordinatedWorker`, Velo layer |
+| [worker-group.md](worker-group.md) | `SpmdParallelWorkers`, fan-out, rank-aware routing |
+| [session.md](session.md) | Session protocol, initiator/responder/controllable, message types, state machine |
+| [offload.md](offload.md) | Offload pipeline stages, policies, cancellation |
+| [object.md](object.md) | G4 storage, S3 client, lock manager |
+| [runtime.md](runtime.md) | `KvbmRuntime` construction and shared infrastructure |
+| [testing.md](testing.md) | Test utilities, multi-instance fixtures, RDMA transfer tests |
+
+To run the test suite:
+
+```bash
+cargo test -p kvbm-engine --features testing
+```
--- a/lib/kvbm-engine/docs/runtime.md
+++ b/lib/kvbm-engine/docs/runtime.md
+# Runtime
+
+`KvbmRuntime` composes the shared infrastructure for KVBM operations. It bundles
+the minimal set of components that all downstream managers and services need:
+
+- **Tokio runtime** -- async execution context (owned or borrowed handle)
+- **Messenger (Velo)** -- distributed RPC for leader/worker communication and peer discovery
+- **NixlAgent** -- RDMA/UCX data transfers (optional, disabled when NIXL config is absent)
+- **EventManager** -- worker coordination and transfer completion notifications (accessed via Messenger)
+
+## Construction
+
+Two quick constructors cover the common cases:
+
+```rust,ignore
+// Leader role (reads KVBM_* env vars + TOML files)
+let runtime = KvbmRuntime::from_env_leader().await?;
+
+// Worker role
+let runtime = KvbmRuntime::from_env_worker().await?;
+```
+
+For tests or custom setups, use the builder:
+
+```rust,ignore
+let config = KvbmConfig::from_env()?;
+let runtime = KvbmRuntime::builder(config)
+    .with_runtime_handle(Handle::current())   // inject existing tokio runtime
+    .with_messenger(messenger)                // inject pre-built Messenger
+    .with_nixl_agent(agent)                   // inject pre-built NixlAgent
+    .build_leader()
+    .await?;
+```
+
+`KvbmRuntimeBuilder::from_json(json)` is the primary entrypoint for vLLM's
+`kv_connector_extra_config` dict -- JSON values have highest priority, overriding
+env vars, TOML files, and defaults.
+
+## Component access
+
+| Method              | Returns                      | Notes                                 |
+|---------------------|------------------------------|---------------------------------------|
+| `handle()` / `tokio()` | `tokio::runtime::Handle`  | Borrowed or owned runtime handle      |
+| `messenger()`       | `&Arc<Messenger>`            | Velo RPC                              |
+| `nixl_agent()`      | `Option<&NixlAgent>`        | `None` when NIXL disabled in config   |
+| `event_system()`    | `Arc<velo::EventManager>`   | From Messenger, used for transfer notifications |
+| `config()`          | `&KvbmConfig`               | Full configuration snapshot            |
+
+## RuntimeHandle
+
+`RuntimeHandle` is an enum that abstracts over owned (`Arc<Runtime>`) and borrowed
+(`Handle`) tokio runtimes. The builder creates an owned runtime from config when none
+is injected.
--- a/lib/kvbm-engine/docs/session.md
+++ b/lib/kvbm-engine/docs/session.md
+# Session Module
+
+The session module manages distributed block transfer sessions between
+instances. Sessions coordinate the search, staging, and RDMA transfer of
+KV cache blocks between a requesting instance (Prefill) and a serving
+instance (Decode).
+
+## Protocol Overview
+
+### Onboard Protocol (InitiatorSession ↔ ResponderSession)
+
+Multi-peer search and staging using `OnboardMessage`:
+
+```text
+  Initiator (Prefill)              Responder (Decode)
+        │                                │
+        │──── CreateSession ────────────▶│
+        │                                │  search local G2/G3
+        │◀─── G2Results ────────────────│
+        │◀─── G3Results ────────────────│  (if G3 blocks found)
+        │◀─── SearchComplete ───────────│
+        │                                │
+        │──── HoldBlocks ───────────────▶│
+        │◀─── Acknowledged ─────────────│
+        │                                │
+        │──── StageBlocks ──────────────▶│  G3→G2 staging (optional)
+        │◀─── BlocksReady ──────────────│
+        │                                │
+        │     RDMA pull (remote G2→local G2)
+        │                                │
+        │──── CloseSession ─────────────▶│
+```
+
+When G4 (object storage) is configured, the initiator also runs a parallel
+G4 search via internal `G4Results`/`G4LoadComplete` messages (not sent over
+the network).
+
+### Unified Session Protocol (SessionHandle ↔ ServerSession)
+
+Point-to-point session using `SessionMessage`:
+
+```text
+  Controller (Prefill)             ServerSession (Decode)
+        │                                │
+        │──── Attach ───────────────────▶│
+        │◀─── StateResponse ────────────│  (current state snapshot)
+        │                                │
+        │──── TriggerStaging ───────────▶│  (if G3 blocks pending)
+        │◀─── BlocksStaged ─────────────│  (newly staged blocks)
+        │                                │
+        │     RDMA pull (remote G2→local G2)
+        │                                │
+        │──── BlocksPulled ─────────────▶│  (release pulled blocks)
+        │──── Detach ───────────────────▶│
+```
+
+Control can be transferred bidirectionally via `YieldControl`/`AcquireControl`.
+For layerwise transfer, `BlocksStaged` includes an optional `layer_range`.
+
+## Session Types
+
+| Session | Role | Protocol | Description |
+|---------|------|----------|-------------|
+| **ServerSession** | Holds blocks, exposes for pull | SessionMessage | Merged from former EndpointSession + ControllableSession |
+| **SessionHandle** | Client-side control | SessionMessage | Attach/detach, state queries, RDMA pulls |
+| **InitiatorSession** | Multi-peer search orchestrator | OnboardMessage | Created by InstanceLeader for distributed search |
+| **ResponderSession** | Responds to search requests | OnboardMessage | Searches local G2/G3, holds blocks, stages on request |
+
+### ServerSession
+
+Server-side session that holds blocks and exposes them for remote RDMA pull.
+Supports two modes:
+
+- **G2-only**: Blocks are already in G2 with pre-assigned layout handles
+  (`BlockMetadataMap::Direct`). `TriggerStaging` is a no-op. Created via
+  `ServerSession::new_g2_only()` or the `create_server_session()` factory.
+- **Staging**: G3 blocks need to be staged to G2. Layout handles are assigned
+  round-robin across workers (`BlockMetadataMap::RoundRobin`). Supports
+  `auto_stage` option via `ServerSessionOptions`. Created via
+  `ServerSession::new_with_staging()`.
+
+Created from `InstanceLeader` via:
+- `create_endpoint_session()` / `create_endpoint_session_for_blocks()` — G2-only
+- `create_controllable_session()` / `create_controllable_session_with_options()` — with staging
+
+**ServerSessionHandle** provides local control: `notify_layers_ready()` for
+layerwise transfer notifications, and `close()` for graceful shutdown.
+
+### InitiatorSession
+
+The requesting side. Sends `CreateSession` to one or more remote instances,
+collects results, applies first-responder-wins deduplication, and orchestrates
+staging and RDMA pulls. Supports three staging modes:
+- **Hold**: Find and hold blocks (G2+G3), no staging
+- **Prepare**: Stage G3→G2 everywhere, keep session alive
+- **Full**: Stage G3→G2 + RDMA pull remote G2→local G2, session completes
+
+Created by `InstanceLeader::find_matches_with_options()` when
+`search_remote == true`.
+
+### ResponderSession
+
+The serving side. Receives `CreateSession`, searches local block managers
+(G2 then G3 for remaining), holds matched blocks via `BlockHolder`, and
+responds with match results. Handles staging requests and keeps blocks
+alive until the session ends.
+
+## Core Building Blocks
+
+### BlockHolder
+
+RAII container for holding blocks during sessions. Tier-agnostic (`BlockHolder<G2>`,
+`BlockHolder<G3>`). Blocks are automatically released when the holder is dropped,
+preventing leaks even if session handling panics. Key operations: `retain()`,
+`release()`, `extend()`, `take_all()`.
+
+### SessionEndpoint
+
+Point-to-point session primitive with a state machine. Encapsulates:
+- Identity (`session_id`, `instance_id`)
+- State machine (`ControlRole` + `AttachmentState` + `SessionPhase`)
+- Message receive channel (`mpsc::Receiver<SessionMessage>`)
+- State publication via watch channel
+- Transport for sending messages to peer
+
+Used internally by `ServerSession`. Does NOT handle block holding or staging
+logic — those are the caller's responsibility.
+
+### SessionHandle
+
+Handle for controlling a remote session. Supports:
+- State observation: `current_state()`, `wait_for_ready()`, `wait_for_complete()`
+- Control commands: `trigger_staging()`, `mark_blocks_pulled()`, `detach()`
+- Bidirectional control: `yield_control()`, `acquire_control()`
+- RDMA transfers: `pull_blocks_rdma()`, `pull_blocks_rdma_with_options()`
+
+Used by the controller side (Prefill) to drive a remote `ServerSession` (Decode).
+
+### SessionHandleStateTx
+
+Sender side of the state observation channel. Used by the session receiver
+task to forward `StateResponse` and `BlocksStaged` messages into the
+watch channel that `SessionHandle` observes.
+
+### Staging
+
+Shared G3→G2 staging logic extracted into `staging::stage_g3_to_g2()`.
+Core kernel: allocate G2 destinations → execute local transfer (G3→G2) →
+register new blocks with source sequence hashes. Used by `InitiatorSession`,
+`ResponderSession`, and `ServerSession` to avoid code duplication.
+
+## Transport Layer
+
+`MessageTransport` is an enum with two variants:
+
+- **`VeloTransport`**: Uses Velo active messages for distributed
+  communication between instances.
+- **`LocalTransport`**: Direct channel dispatch for in-process testing
+  without network overhead.
+
+Methods:
+- `send()` — Send an `OnboardMessage` to a target instance
+- `send_session()` — Send a `SessionMessage` to a target instance
+- `request_metadata()` — RPC call to get remote worker layout metadata for RDMA
+
+## Message Types
+
+| Type | Direction | Purpose |
+|------|-----------|---------|
+| `OnboardMessage` | Initiator ↔ Responder | Block search, hold/drop, staging requests |
+| `SessionMessage` | Controller ↔ ServerSession | Attach/detach, control transfer, block ops, state sync |
+
+### OnboardMessage Variants
+
+| Variant | Sender | Description |
+|---------|--------|-------------|
+| `CreateSession` | Initiator | Start new session with sequence hashes |
+| `G2Results` | Responder | G2 search matches (sequence hashes + block IDs) |
+| `G3Results` | Responder | G3 search matches (sequence hashes only) |
+| `SearchComplete` | Responder | All local searching done |
+| `HoldBlocks` | Initiator | Which blocks to hold vs drop |
+| `Acknowledged` | Responder | Hold/drop processed |
+| `StageBlocks` | Initiator | G3 hashes to stage to G2 |
+| `BlocksReady` | Responder | Newly staged G2 blocks ready |
+| `ReleaseBlocks` | Initiator | Release specific blocks |
+| `CloseSession` | Initiator | Session complete, clean up |
+| `G4Results` | Internal | Object storage search results (not sent over network) |
+| `G4LoadComplete` | Internal | Object storage load results (not sent over network) |
+
+### SessionMessage Variants
+
+| Variant | Category | Description |
+|---------|----------|-------------|
+| `Attach` | Connection | Peer attaches with a control role |
+| `Detach` | Connection | Peer detaches gracefully |
+| `YieldControl` | Control | Sender yields controller role |
+| `AcquireControl` | Control | Sender acquires controller role |
+| `TriggerStaging` | Block ops | Request G3→G2 staging |
+| `HoldBlocks` | Block ops | Request blocks be held |
+| `ReleaseBlocks` | Block ops | Release specific blocks |
+| `BlocksPulled` | Block ops | Notify blocks were pulled via RDMA |
+| `StateResponse` | State sync | Full state snapshot (phase, role, blocks) |
+| `BlocksStaged` | State sync | Newly staged blocks (with optional layer range) |
+| `Close` | Lifecycle | Graceful session close |
+| `Error` | Lifecycle | Report error |
+
+## State Machine
+
+### SessionPhase
+
+Lifecycle of block operations. Staging is optional — blocks already in the
+target tier (G2) skip it:
+
+```text
+Searching → Holding ──────────────────── Ready → Complete
+                    └── Staging ────────┘
+                    └── Complete  (direct pull, no staging needed)
+                    └── Failed
+```
+
+### ControlRole
+
+Dynamic role in session relationship:
+- `Neutral` — Initial state, can transition either way
+- `Controller` — Issues commands to peer
+- `Controllee` — Executes commands from peer
+
+Supports bidirectional transfer via `YieldControl`/`AcquireControl`.
+
+### AttachmentState
+
+Peer connection state: `Unattached` (waiting) or `Attached { peer }` (connected).
+
+## Dispatch Functions
+
+- **`dispatch_onboard_message`**: Routes `OnboardMessage` to per-session task
+  channels by session ID. Used by the Velo onboard handler.
+- **`dispatch_session_message`**: Routes `SessionMessage` to per-session task
+  channels by session ID. Used by the Velo session handler.
+
+## File Structure
+
+```text
+session/
+├── mod.rs              # Module declarations, dispatch functions, re-exports
+├── blocks.rs           # BlockHolder<T> — RAII block container
+├── endpoint.rs         # SessionEndpoint — state machine primitive
+├── handle.rs           # SessionHandle + SessionHandleStateTx
+├── server_session.rs   # ServerSession + ServerSessionHandle + BlockMetadataMap
+├── staging.rs          # Shared stage_g3_to_g2() function
+├── state.rs            # SessionPhase, ControlRole, AttachmentState
+├── messages.rs         # OnboardMessage, SessionMessage, BlockInfo, etc.
+├── initiator.rs        # InitiatorSession (multi-peer orchestrator)
+├── responder.rs        # ResponderSession (search + hold + stage)
+└── transport.rs        # MessageTransport (Velo + Local)
+```
--- a/lib/kvbm-engine/docs/testing.md
+++ b/lib/kvbm-engine/docs/testing.md
+# Testing Module
+
+Test infrastructure for the kvbm-engine crate. Core block and token utilities
+are re-exported from `kvbm_logical::testing` and `kvbm_physical::testing`;
+this module adds engine-specific helpers for transport, sessions, offload
+pipelines, and multi-instance scenarios.
+
+## Test Helpers
+
+### TestManagerBuilder / TestRegistryBuilder
+
+Create test block managers and registries with synthetic physical layouts.
+`TestManagerBuilder` produces a `BlockManager<T>` backed by mock memory.
+`TestRegistryBuilder` produces a `BlockRegistry` pre-populated with hashes.
+
+Use `populate_manager_with_blocks` and `create_and_populate_manager` to
+quickly set up managers with pre-allocated blocks for testing.
+
+### MessengerPair
+
+Creates a pair of connected Velo `Messenger` instances for transport
+testing without a real network. Messages sent through one messenger are
+received by the other, enabling end-to-end session testing in a single
+process.
+
+```rust,ignore
+let (messenger_a, messenger_b) = create_messenger_pair_tcp().await?;
+```
+
+### TestSession
+
+Helper for testing distributed session protocols. Sets up the full session
+infrastructure (dispatch maps, transport, channels) for testing
+`InitiatorSession` / `ResponderSession` / `ServerSession` (formerly `ControllableSession`) interactions.
+
+### EventsPipelineFixture
+
+Test fixture for the offload pipeline. Provides pre-configured pipeline
+stages, event managers, and block managers for testing policy evaluation,
+batching, and transfer execution in isolation.
+
+### MultiInstancePopulator
+
+Sets up multi-instance distributed test scenarios with multiple leaders,
+workers, and block managers. Populates each instance with configurable
+block patterns for testing cross-instance onboarding.
+
+```rust,ignore
+let populated = MultiInstancePopulator::builder()
+    .instance_count(3)
+    .blocks_per_instance(100)
+    .build()?
+    .populate()
+    .await?;
+```
+
+### Physical Test Utilities
+
+`TestAgent` and `TestAgentBuilder` create mock `NixlAgent` instances for
+testing `TransferManager` without real RDMA hardware. `TransferChecksums`
+provides utilities for verifying transfer correctness.
+
+### Token Block Helpers
+
+The `token_blocks` module provides utilities for creating test blocks with
+known token sequences, useful for verifying search and match operations.
+
+## Writing a New Test
+
+1. Choose the appropriate fixture for your test scope:
+   - Single-instance transfer → `TestManagerBuilder` + `TestAgent`
+   - Session protocol → `TestSession` + `MessengerPair`
+   - Offload pipeline → `EventsPipelineFixture`
+   - Multi-instance → `MultiInstancePopulator`
+2. Build the fixture and populate with test data
+3. Exercise the code under test
+4. Assert on results and verify cleanup (blocks released, sessions closed)
--- a/lib/kvbm-engine/docs/worker-group.md
+++ b/lib/kvbm-engine/docs/worker-group.md
+# Worker Group Module
+
+The worker group module provides abstractions for driving multiple workers
+in parallel from a single leader.
+
+## ParallelWorkers Trait
+
+`ParallelWorkers` extends `WorkerTransfers + ObjectBlockOps` for cohorts of
+workers. It adds:
+
+- `export_metadata()` → `Vec<SerializedLayoutResponse>` (one per rank)
+- `import_metadata(Vec<SerializedLayout>)` → `Vec<ImportMetadataResponse>`
+- `worker_count()` → number of workers
+- `workers()` → slice of underlying `Arc<dyn Worker>`
+
+## SpmdParallelWorkers
+
+`SpmdParallelWorkers` implements the SPMD (Single Program, Multiple Data)
+execution model: the same operation is broadcast to every worker in parallel,
+and results are aggregated.
+
+### Fan-out Execution
+
+Every `WorkerTransfers` method (local transfer, remote onboard, remote
+offload) iterates over all workers and calls the same operation on each.
+Workers execute in parallel – each resolves the shared logical layout handle
+to its own physical layout.
+
+### Rank-aware Routing
+
+For `connect_remote`, each worker receives its rank-specific metadata slice.
+Remote handle mappings are stored as `(InstanceId, worker_idx,
+LogicalLayoutHandle) → LayoutHandle`, so `execute_remote_onboard_for_instance`
+can look up the correct remote handle for each worker by rank.
+
+### Event Aggregation
+
+Transfer completion notifications from individual workers are aggregated into
+a single `TransferCompleteNotification` via the event system. The aggregated
+notification fires only when all workers have completed.
+
+### ObjectBlockOps Aggregation
+
+- `has_blocks`: queries all workers, returns results from worker 0 (all workers
+  should agree under SPMD semantics).
+- `put_blocks` / `get_blocks`: executes on all workers in parallel. A key
+  succeeds only if **all** workers succeed for that key.
+
+### Construction
+
+```rust,ignore
+let parallel = SpmdParallelWorkers::new(
+    workers,        // Vec<Arc<dyn Worker>>, one per rank
+    event_manager,  // Arc<EventManager> for aggregation
+    runtime_handle, // tokio::runtime::Handle for spawning
+);
+```
--- a/lib/kvbm-engine/docs/worker.md
+++ b/lib/kvbm-engine/docs/worker.md
+# Worker Module
+
+The worker module defines execution primitives for data transfers between
+storage tiers. Workers own the physical resources (transfer managers, layout
+handles) needed to move blocks via RDMA, local copy, or object storage.
+
+## Trait Hierarchy
+
+```text
+WorkerTransfers          Worker
+  (execution)       (metadata + handles)
+       │                    │
+       └────────┬───────────┘
+                │
+         ObjectBlockOps
+          (G4 storage)
+```
+
+- **`WorkerTransfers`** – core execution trait. Provides `execute_local_transfer`,
+  `execute_remote_onboard`, `execute_remote_offload`, `connect_remote`, and
+  `execute_remote_onboard_for_instance`.
+- **`Worker`** – extends `WorkerTransfers + ObjectBlockOps`. Adds layout handle
+  accessors (`g1_handle`, `g2_handle`, `g3_handle`) and metadata import/export.
+
+## PhysicalWorker (aka DirectWorker)
+
+`PhysicalWorker` is the fundamental single-worker implementation. It directly
+owns a `TransferManager` and layout handles for executing data movement.
+
+### Builder
+
+```rust,ignore
+let worker = PhysicalWorker::builder()
+    .manager(transfer_manager)   // required
+    .g1_handle(g1)               // optional – GPU tier
+    .g2_handle(g2)               // optional – host tier
+    .g3_handle(g3)               // optional – disk tier
+    .rank(0)                     // optional – for SPMD key prefixing
+    .object_client(s3_client)    // optional – for G4 operations
+    .build()?;
+```
+
+| Field | Required | Purpose |
+|-------|----------|---------|
+| `manager` | yes | `TransferManager` for executing transfers |
+| `g1_handle` | no | GPU/HBM layout handle |
+| `g2_handle` | no | Host/pinned-DRAM layout handle |
+| `g3_handle` | no | Disk/NVMe layout handle |
+| `rank` | no | Worker rank for SPMD key prefixing |
+| `object_client` | no | G4 object storage client |
+
+`DirectWorker` is a compatibility alias for `PhysicalWorker`.
+
+### Execution State vs Coordination State
+
+PhysicalWorker maintains **execution state** – the handles and manager needed
+to actually perform transfers. This is distinct from **coordination state**
+which the leader tracks in `CoordinatedWorker`. When a leader wraps a
+PhysicalWorker in a CoordinatedWorker, handles exist in both places
+intentionally: PhysicalWorker needs them to call TransferManager, while
+CoordinatedWorker provides a uniform API for both local and remote workers.
+
+## CoordinatedWorker
+
+`CoordinatedWorker` is the leader's view of a worker. It wraps any `Worker`
+implementation and adds coordination state:
+
+- Local layout handles (populated via `apply_layout_response`)
+- Remote handle mappings for cross-leader RDMA transfers
+- Worker rank and host instance tracking
+
+This wrapper lets the leader use the same API regardless of whether the
+underlying worker is local (`PhysicalWorker`) or remote (`VeloWorkerClient`).
+
+## VeloWorkerClient / VeloWorkerService
+
+The Velo (RPC) layer enables remote worker execution:
+
+- **`VeloWorkerService`** – wraps a `PhysicalWorker` and exposes RPC handlers
+  for `execute_local_transfer`, `export_metadata`, `import_metadata`, etc.
+- **`VeloWorkerClient`** – implements `WorkerTransfers` by sending RPC
+  requests to a remote `VeloWorkerService`.
+
+Together they allow the leader to drive workers on remote nodes as if they
+were local.
--- a/lib/kvbm-engine/scripts/bench_viewer.html
+++ b/lib/kvbm-engine/scripts/bench_viewer.html
+<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>KVBM Bench Engine Results Viewer</title>
+<style>
+*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
+:root{
+  --bg:#f9f9f7;--card:#fff;--border:#e0e0e0;--text:#1a1a1a;--text2:#555;
+  --green:#76B900;--green-dark:#4A7A00;--blue:#0074DF;--blue-dark:#1565C0;
+  --orange:#EF9100;--orange-dark:#C67800;--red:#E52020;--red-dark:#C62828;
+  --nav:#1a1a1a;--radius:16px;--font:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;
+  --mono:'SF Mono','Cascadia Code',Consolas,monospace;
+}
+html{scroll-behavior:smooth;scroll-padding-top:56px}
+body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5}
+
+/* Nav */
+nav{position:sticky;top:0;z-index:100;background:var(--nav);display:flex;align-items:center;gap:4px;padding:0 16px;height:48px;overflow-x:auto;white-space:nowrap}
+nav a{color:#ccc;text-decoration:none;font-size:13px;padding:6px 10px;border-radius:6px;transition:color .15s,background .15s}
+nav a:hover{color:#fff;background:rgba(255,255,255,.08)}
+nav a.active{color:#fff;background:rgba(118,185,0,.25);border-bottom:2px solid var(--green)}
+nav .spacer{flex:1}
+nav select{background:#333;color:#fff;border:1px solid #555;border-radius:6px;padding:4px 8px;font-size:13px;font-family:var(--font)}
+nav .brand{color:#fff;font-weight:700;font-size:14px;margin-right:12px;letter-spacing:.3px}
+
+/* Header / data loading */
+.header{max-width:1200px;margin:0 auto;padding:24px 20px}
+.header h1{font-size:22px;font-weight:700;margin-bottom:4px}
+.header p{color:var(--text2);font-size:14px;margin-bottom:16px}
+.drop-zone{border:2px dashed var(--border);border-radius:var(--radius);padding:40px 20px;text-align:center;transition:border-color .2s,background .2s;cursor:pointer}
+.drop-zone.over{border-color:var(--green);background:rgba(118,185,0,.06)}
+.drop-zone h3{font-size:16px;margin-bottom:8px}
+.drop-zone p{color:var(--text2);font-size:13px}
+.load-row{display:flex;gap:12px;margin-top:12px;align-items:center;flex-wrap:wrap}
+.load-row input[type=file]{font-size:13px}
+.load-row textarea{flex:1;min-width:200px;height:60px;font-family:var(--mono);font-size:12px;padding:8px;border:1px solid var(--border);border-radius:8px;resize:vertical}
+.btn{display:inline-flex;align-items:center;gap:4px;padding:6px 14px;font-size:13px;font-family:var(--font);border:none;border-radius:8px;cursor:pointer;transition:background .15s}
+.btn-green{background:var(--green);color:#fff}.btn-green:hover{background:#6aa600}
+.btn-sm{padding:4px 10px;font-size:12px;border-radius:6px}
+.loaded-bar{display:flex;align-items:center;gap:12px;padding:10px 16px;background:var(--card);border:1px solid var(--border);border-radius:10px}
+.loaded-bar .fname{font-family:var(--mono);font-size:13px;color:var(--green)}
+.loaded-bar .count{font-size:13px;color:var(--text2)}
+
+/* Main */
+main{max-width:1200px;margin:0 auto;padding:0 20px 40px;display:none}
+section{margin-bottom:36px}
+section h2{font-size:18px;font-weight:700;margin-bottom:4px}
+section .desc{color:var(--text2);font-size:13px;margin-bottom:14px}
+.card{background:var(--card);border:1px solid var(--border);border-radius:var(--radius);padding:20px;transition:box-shadow .2s}
+.card:hover{box-shadow:0 4px 16px rgba(0,0,0,.06)}
+
+/* KPI cards */
+.kpi-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(220px,1fr));gap:14px}
+.kpi{border-left:3px solid var(--green);padding:14px 16px}
+.kpi.blue{border-left-color:var(--blue)}.kpi.amber{border-left-color:var(--orange)}
+.kpi .label{font-size:12px;color:var(--text2);text-transform:uppercase;letter-spacing:.5px}
+.kpi .value{font-size:26px;font-weight:700;margin:2px 0}
+.kpi .sub{font-size:12px;color:var(--text2)}
+
+/* Pills */
+.pills{display:flex;flex-wrap:wrap;gap:6px;margin-bottom:12px}
+.pill{padding:4px 12px;font-size:12px;border:1px solid var(--border);border-radius:20px;cursor:pointer;background:var(--card);transition:all .15s;font-family:var(--font)}
+.pill.active{background:var(--green);color:#fff;border-color:var(--green)}
+
+/* Chart container */
+.chart-wrap{overflow-x:auto}
+.chart-wrap svg{display:block}
+.chart-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(520px,1fr));gap:16px}
+
+/* Legend */
+.legend{display:flex;flex-wrap:wrap;gap:10px;margin-top:8px;font-size:12px}
+.legend-item{display:flex;align-items:center;gap:4px}
+.legend-swatch{width:14px;height:4px;border-radius:2px}
+
+/* Table */
+.table-wrap{overflow-x:auto;max-height:500px;overflow-y:auto}
+table{width:100%;border-collapse:collapse;font-size:13px}
+th,td{padding:6px 10px;text-align:left;border-bottom:1px solid var(--border);white-space:nowrap}
+th{position:sticky;top:0;background:var(--card);cursor:pointer;user-select:none;font-weight:600;font-size:12px;text-transform:uppercase;letter-spacing:.3px;color:var(--text2)}
+th:hover{color:var(--text)}
+th .sort-arrow{font-size:10px;margin-left:2px}
+td{font-family:var(--mono);font-size:12px}
+tr:hover td{background:rgba(118,185,0,.04)}
+
+/* Tooltip */
+#tooltip{position:fixed;z-index:200;pointer-events:none;background:rgba(26,26,26,.92);color:#fff;font-size:12px;font-family:var(--mono);padding:8px 12px;border-radius:8px;line-height:1.6;max-width:300px;display:none;backdrop-filter:blur(4px)}
+
+/* Footer */
+footer{text-align:center;padding:20px;font-size:12px;color:#999}
+
+/* Responsive */
+@media(max-width:600px){
+  .chart-grid{grid-template-columns:1fr}
+  .kpi-grid{grid-template-columns:repeat(auto-fill,minmax(160px,1fr))}
+}
+</style>
+</head>
+<body>
+
+<nav id="nav">
+  <span class="brand">KVBM Bench</span>
+  <a href="#summary" data-sec="summary">Summary</a>
+  <a href="#concurrency" data-sec="concurrency">Concurrency</a>
+  <a href="#pagesize" data-sec="pagesize">Page Size</a>
+  <a href="#bounce" data-sec="bounce">Bounce</a>
+  <a href="#g2g3" data-sec="g2g3">G2/G3</a>
+  <a href="#gds" data-sec="gds">GDS vs Staged</a>
+  <a href="#bidir" data-sec="bidir">Bidirectional</a>
+  <a href="#latency" data-sec="latency">Latency</a>
+  <a href="#rawdata" data-sec="rawdata">Data</a>
+  <span class="spacer"></span>
+  <select id="gpuFilter"><option value="all">All GPUs</option></select>
+</nav>
+
+<div class="header" id="headerZone">
+  <h1>KVBM Benchmark Results Viewer</h1>
+  <p>Load bench_engine JSON Lines output to visualize performance characteristics</p>
+  <div class="drop-zone" id="dropZone">
+    <h3>Drop .jsonl file here</h3>
+    <p>or use the controls below</p>
+  </div>
+  <div class="load-row">
+    <input type="file" id="filePicker" accept=".jsonl,.json,.txt">
+    <textarea id="pasteArea" placeholder="Paste JSON Lines here..."></textarea>
+    <button class="btn btn-green" id="pasteBtn">Load</button>
+  </div>
+  <div class="loaded-bar" id="loadedBar" style="display:none">
+    <span class="fname" id="loadedName">file.jsonl</span>
+    <span class="count" id="loadedCount">0 records</span>
+    <span class="spacer" style="flex:1"></span>
+    <button class="btn btn-sm btn-green" id="reloadBtn">Load Different</button>
+  </div>
+</div>
+
+<main id="main">
+  <section id="summary"><h2>Summary</h2><p class="desc">Key performance indicators across all tests</p><div class="kpi-grid" id="kpiGrid"></div></section>
+  <section id="concurrency"><h2>Concurrency Scaling</h2><p class="desc">Bandwidth vs concurrency &mdash; find the saturation point for each transfer type</p><div class="pills" id="concPills"></div><div class="chart-grid" id="concCharts"></div></section>
+  <section id="pagesize"><h2>Page Size Efficiency</h2><p class="desc">Bandwidth vs page_size &mdash; block size amortization</p><div class="pills" id="psPills"></div><div class="chart-wrap" id="psChart"></div></section>
+  <section id="bounce"><h2>Bounce Buffer Analysis</h2><p class="desc">Bandwidth vs bounce_blocks &mdash; double-buffering effectiveness</p><div class="chart-wrap" id="bounceChart"></div></section>
+  <section id="g2g3"><h2>G2/G3 Raw Bandwidth</h2><p class="desc">NVMe read vs write bandwidth at each concurrency level</p><div class="chart-wrap" id="g2g3Chart"></div></section>
+  <section id="gds"><h2>GDS vs Staged</h2><p class="desc">GPUDirect Storage bypass compared to best staged transfer</p><div class="chart-wrap" id="gdsChart"></div></section>
+  <section id="bidir"><h2>Bidirectional Contention</h2><p class="desc">Isolated vs contended bandwidth to measure contention degradation</p><div class="chart-wrap" id="bidirChart"></div></section>
+  <section id="latency"><h2>Latency Distribution</h2><p class="desc">Horizontal box plots showing min / p50 / p95 / p99 / max per test</p><div class="chart-wrap" id="latChart"></div></section>
+  <section id="rawdata"><h2>Raw Data</h2><p class="desc">All loaded records &mdash; click column headers to sort</p><div class="card"><div class="table-wrap" id="tableWrap"></div></div></section>
+</main>
+
+<div id="tooltip"></div>
+<footer>KVBM Bench Viewer &mdash; NVIDIA CORPORATION &amp; AFFILIATES</footer>
+
+<script>
+"use strict";
+
+/* ── Color map ── */
+const COLOR={
+  g1_to_g2:"#76B900",g2_to_g1:"#4A7A00",g2_to_g3:"#0074DF",g3_to_g2:"#1565C0",
+  g1_to_g3_staged:"#EF9100",g3_to_g1_staged:"#C67800",
+  g1_to_g3_gds:"#E52020",g3_to_g1_gds:"#C62828",
+  bidir_g1_to_g2:"#76B900",bidir_g2_to_g1:"#4A7A00"
+};
+const DASHED=new Set(["bidir_g1_to_g2","bidir_g2_to_g1"]);
+function testColor(t){return COLOR[t]||"#888"}
+
+/* ── State ── module-level state shared by the loader, the GPU filter and the renderers */
+let DATA=[]; // all records from the most recent loadData() call
+let FILTERED=[]; // subset of DATA chosen by applyFilter(); every render* function reads this
+let sortCol=null,sortAsc=true; // raw-table sort: column key (null = keep load order) and direction
+
+/* ── DOM helpers ── */
+const $=s=>document.querySelector(s);
+const $$=s=>[...document.querySelectorAll(s)];
+function el(tag,attrs,children){
+  const e=document.createElement(tag);
+  if(attrs)for(const[k,v]of Object.entries(attrs)){
+    if(k==="style"&&typeof v==="object")Object.assign(e.style,v);
+    else if(k.startsWith("on"))e.addEventListener(k.slice(2),v);
+    else e.setAttribute(k,v);
+  }
+  if(children){
+    if(typeof children==="string")e.textContent=children;
+    else if(Array.isArray(children))children.forEach(c=>{if(c)e.appendChild(typeof c==="string"?document.createTextNode(c):c)});
+    else e.appendChild(children);
+  }
+  return e;
+}
+
+/* ── SVG mini-lib ── */
+const SVG_NS="http://www.w3.org/2000/svg";
+function svgEl(tag,attrs){
+  const e=document.createElementNS(SVG_NS,tag);
+  if(attrs)for(const[k,v]of Object.entries(attrs))e.setAttribute(k,v);
+  return e;
+}
+function createSVG(w,h){
+  const s=svgEl("svg",{width:w,height:h,viewBox:`0 0 ${w} ${h}`});
+  s.style.overflow="visible";
+  return s;
+}
+function scaleLinear(domain,range){
+  const[d0,d1]=domain,[r0,r1]=range;
+  const m=(d1===d0)?0:(r1-r0)/(d1-d0);
+  function fn(v){return r0+m*(v-d0)}
+  fn.domain=domain;fn.range=range;
+  fn.ticks=function(n=6){
+    if(d0===d1)return[d0];
+    const step=niceStep((d1-d0)/n);
+    const start=Math.ceil(d0/step)*step;
+    const arr=[];
+    for(let v=start;v<=d1+step*0.001;v+=step)arr.push(+v.toPrecision(12));
+    return arr;
+  };
+  fn.invert=function(px){return d0===d1?d0:d0+(px-r0)/m};
+  return fn;
+}
+function niceStep(raw){
+  const mag=Math.pow(10,Math.floor(Math.log10(raw)));
+  const res=raw/mag;
+  if(res<=1)return mag;if(res<=2)return 2*mag;if(res<=5)return 5*mag;return 10*mag;
+}
+
+/* Margins for charts: pixel insets between the SVG edges and the plot area (room for axes and labels) */
+const M={top:28,right:20,bottom:44,left:60};
+
+function drawXAxis(svg,scale,y,label,fmt){
+  const ticks=scale.ticks(7);
+  const g=svgEl("g");
+  g.appendChild(svgEl("line",{x1:scale.range[0],x2:scale.range[1],y1:y,y2:y,stroke:"#ddd","stroke-width":1}));
+  ticks.forEach(t=>{
+    const x=scale(t);
+    g.appendChild(svgEl("line",{x1:x,x2:x,y1:y,y2:y+5,stroke:"#bbb","stroke-width":1}));
+    const txt=svgEl("text",{x,y:y+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
+    txt.textContent=fmt?fmt(t):fmtNum(t);
+    g.appendChild(txt);
+  });
+  if(label){
+    const lbl=svgEl("text",{x:(scale.range[0]+scale.range[1])/2,y:y+36,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"});
+    lbl.textContent=label;g.appendChild(lbl);
+  }
+  svg.appendChild(g);
+}
+function drawYAxis(svg,scale,x,label){
+  const ticks=scale.ticks(6);
+  const g=svgEl("g");
+  g.appendChild(svgEl("line",{x1:x,x2:x,y1:scale.range[0],y2:scale.range[1],stroke:"#ddd","stroke-width":1}));
+  ticks.forEach(t=>{
+    const y=scale(t);
+    g.appendChild(svgEl("line",{x1:x-5,x2:x,y1:y,y2:y,stroke:"#bbb","stroke-width":1}));
+    /* grid line */
+    g.appendChild(svgEl("line",{x1:x,x2:x+scale.range[0]-x+400,y1:y,y2:y,stroke:"#f0f0f0","stroke-width":1}));
+    const txt=svgEl("text",{x:x-8,y:y+4,"text-anchor":"end",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
+    txt.textContent=fmtNum(t);g.appendChild(txt);
+  });
+  if(label){
+    const lbl=svgEl("text",{x:0,y:0,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)",transform:`translate(${x-44},${(scale.range[0]+scale.range[1])/2}) rotate(-90)`});
+    lbl.textContent=label;g.appendChild(lbl);
+  }
+  svg.appendChild(g);
+}
+function drawGridY(svg,scaleY,x0,x1){
+  const ticks=scaleY.ticks(6);
+  ticks.forEach(t=>{
+    const y=scaleY(t);
+    svg.appendChild(svgEl("line",{x1:x0,x2:x1,y1:y,y2:y,stroke:"#f0f0f0","stroke-width":1}));
+  });
+}
+
+function drawLine(svg,points,color,dashed){
+  if(points.length===0)return;
+  const d=points.map((p,i)=>(i===0?"M":"L")+p[0]+","+p[1]).join(" ");
+  const attrs={d,fill:"none",stroke:color,"stroke-width":2,"stroke-linejoin":"round","stroke-linecap":"round"};
+  if(dashed)attrs["stroke-dasharray"]="6,4";
+  svg.appendChild(svgEl("path",attrs));
+  points.forEach(p=>{
+    const c=svgEl("circle",{cx:p[0],cy:p[1],r:4,fill:color,stroke:"#fff","stroke-width":1.5,style:"cursor:pointer"});
+    c._tip=p[2];
+    c.addEventListener("mouseenter",e=>showTip(e,p[2]));
+    c.addEventListener("mouseleave",hideTip);
+    svg.appendChild(c);
+  });
+}
+
+/* Tooltip */
+const tooltip=document.getElementById("tooltip");
+function showTip(e,html){
+  tooltip.innerHTML=html;tooltip.style.display="block";
+  positionTip(e);
+}
+function positionTip(e){
+  const tw=tooltip.offsetWidth,th=tooltip.offsetHeight;
+  let x=e.clientX+12,y=e.clientY-th-8;
+  if(x+tw>window.innerWidth)x=e.clientX-tw-12;
+  if(y<0)y=e.clientY+16;
+  tooltip.style.left=x+"px";tooltip.style.top=y+"px";
+}
+function hideTip(){tooltip.style.display="none"}
+document.addEventListener("mousemove",e=>{if(tooltip.style.display==="block")positionTip(e)});
+
+function fmtNum(v){
+  if(v>=1000)return v.toLocaleString(undefined,{maximumFractionDigits:1});
+  if(v===0)return"0";
+  if(Math.abs(v)<0.01)return v.toExponential(1);
+  return +v.toFixed(2)+"";
+}
+function fmtBW(v){return v.toFixed(2)+" GB/s"}
+function fmtUs(v){return v.toFixed(1)+" us"}
+function fmtLabel(t){return t.replace(/_/g," ")}
+
+/* ── Data loading ── */
+function parseJSONL(text){
+  const lines=text.trim().split("\n");
+  const out=[];
+  for(const line of lines){
+    const s=line.trim();
+    if(!s)continue;
+    try{out.push(JSON.parse(s))}catch(e){/* skip */}
+  }
+  return out;
+}
+
+function loadData(records,name){
+  DATA=records;
+  populateGPUFilter();
+  applyFilter();
+  $("#main").style.display="block";
+  /* collapse loader */
+  $("#dropZone").style.display="none";
+  $(".load-row").style.display="none";
+  $("#loadedBar").style.display="flex";
+  $("#loadedName").textContent=name||"pasted data";
+  $("#loadedCount").textContent=records.length+" records";
+}
+
+/* Drop zone */
+const dz=$("#dropZone");
+dz.addEventListener("dragover",e=>{e.preventDefault();dz.classList.add("over")});
+dz.addEventListener("dragleave",()=>dz.classList.remove("over"));
+dz.addEventListener("drop",e=>{
+  e.preventDefault();dz.classList.remove("over");
+  const f=e.dataTransfer.files[0];
+  if(f)readFile(f);
+});
+dz.addEventListener("click",()=>$("#filePicker").click());
+
+$("#filePicker").addEventListener("change",e=>{
+  const f=e.target.files[0];
+  if(f)readFile(f);
+});
+function readFile(f){
+  const r=new FileReader();
+  r.onload=()=>loadData(parseJSONL(r.result),f.name);
+  r.readAsText(f);
+}
+$("#pasteBtn").addEventListener("click",()=>{
+  const txt=$("#pasteArea").value;
+  if(txt.trim())loadData(parseJSONL(txt),"pasted");
+});
+$("#reloadBtn").addEventListener("click",()=>{
+  $("#dropZone").style.display="";$(".load-row").style.display="";
+  $("#loadedBar").style.display="none";
+});
+
+/* GPU filter */
+function populateGPUFilter(){
+  const sel=$("#gpuFilter");
+  sel.innerHTML='<option value="all">All GPUs</option>';
+  const ids=[...new Set(DATA.map(d=>d.device_id))].sort((a,b)=>a-b);
+  ids.forEach(id=>{
+    const o=document.createElement("option");o.value=id;o.textContent="GPU "+id;
+    sel.appendChild(o);
+  });
+}
+$("#gpuFilter").addEventListener("change",()=>applyFilter());
+
+function applyFilter(){
+  const sel=$("#gpuFilter").value;
+  FILTERED=sel==="all"?DATA.slice():DATA.filter(d=>String(d.device_id)===sel);
+  renderAll();
+}
+
+/* ── Render orchestrator ── */
+function renderAll(){
+  renderKPIs();
+  renderConcurrency();
+  renderPageSize();
+  renderBounce();
+  renderG2G3();
+  renderGDS();
+  renderBidir();
+  renderLatency();
+  renderTable();
+  hideEmptySections();
+}
+
+/* ── Helpers ── */
+function unique(arr,key){return[...new Set(arr.map(d=>d[key]))].sort((a,b)=>a-b)}
+function groupBy(arr,key){
+  const m={};
+  arr.forEach(d=>{const k=d[key];(m[k]=m[k]||[]).push(d)});
+  return m;
+}
+
+/* ── Section: KPIs ── */
+function renderKPIs(){
+  const g=$("#kpiGrid");g.innerHTML="";
+  if(!FILTERED.length)return;
+  const maxBW=Math.max(...FILTERED.map(d=>d.bandwidth_gbs||0));
+  const maxAgg=Math.max(...FILTERED.map(d=>d.aggregate_bandwidth_gbs||0));
+  const tests=[...new Set(FILTERED.map(d=>d.test))];
+  const devices=[...new Set(FILTERED.map(d=>d.device_id))];
+  const avgLat=FILTERED.reduce((s,d)=>s+(d.latency_us?.mean_us||0),0)/FILTERED.length;
+
+  const kpis=[
+    {label:"Peak Single-Stream BW",value:fmtBW(maxBW),sub:FILTERED.find(d=>d.bandwidth_gbs===maxBW)?.test,cls:""},
+    {label:"Peak Aggregate BW",value:fmtBW(maxAgg),sub:"across all streams",cls:""},
+    {label:"Tests",value:tests.length,sub:tests.slice(0,3).join(", ")+(tests.length>3?" ...":""),cls:"blue"},
+    {label:"GPUs",value:devices.length,sub:"device IDs: "+devices.join(", "),cls:"blue"},
+    {label:"Total Records",value:FILTERED.length,sub:"loaded",cls:"amber"},
+    {label:"Avg Latency",value:fmtUs(avgLat),sub:"mean across all tests",cls:"amber"},
+  ];
+  kpis.forEach(k=>{
+    const d=el("div",{class:"card kpi"+(k.cls?" "+k.cls:"")});
+    d.appendChild(el("div",{class:"label"},k.label));
+    d.appendChild(el("div",{class:"value"},String(k.value)));
+    d.appendChild(el("div",{class:"sub"},k.sub||""));
+    g.appendChild(d);
+  });
+}
+
+/* ── Section: Concurrency Scaling ── */
+let concPageFilter=null;
+function renderConcurrency(){
+  const wrap=$("#concCharts");wrap.innerHTML="";
+  const pillsEl=$("#concPills");pillsEl.innerHTML="";
+  const pages=unique(FILTERED,"page_size");
+  if(pages.length>1){
+    pages.forEach(ps=>{
+      const p=el("button",{class:"pill"+(concPageFilter===ps?" active":""),onclick:()=>{concPageFilter=concPageFilter===ps?null:ps;renderConcurrency()}},String(ps));
+      pillsEl.appendChild(p);
+    });
+  }
+  let data=FILTERED;
+  if(concPageFilter!==null)data=data.filter(d=>d.page_size===concPageFilter);
+
+  const byTest=groupBy(data,"test");
+  const tests=Object.keys(byTest);
+  if(!tests.length)return;
+
+  const W=540,H=300;
+  const svg=createSVG(W,H);
+  const xVals=unique(data,"concurrency");
+  const allBW=data.map(d=>d.aggregate_bandwidth_gbs||d.bandwidth_gbs||0);
+  const xS=scaleLinear([Math.min(...xVals),Math.max(...xVals)],[M.left,W-M.right]);
+  const yS=scaleLinear([0,Math.max(...allBW)*1.1],[H-M.bottom,M.top]);
+
+  drawGridY(svg,yS,M.left,W-M.right);
+  drawXAxis(svg,xS,H-M.bottom,"Concurrency");
+  drawYAxis(svg,yS,M.left,"BW (GB/s)");
+
+  tests.forEach(t=>{
+    const pts=byTest[t].slice().sort((a,b)=>a.concurrency-b.concurrency);
+    const mapped=pts.map(d=>[xS(d.concurrency),yS(d.aggregate_bandwidth_gbs||d.bandwidth_gbs),
+      `<b>${fmtLabel(t)}</b><br>conc: ${d.concurrency}<br>BW: ${fmtBW(d.aggregate_bandwidth_gbs||d.bandwidth_gbs)}<br>page: ${d.page_size}`]);
+    drawLine(svg,mapped,testColor(t),DASHED.has(t));
+  });
+
+  const card=el("div",{class:"card"});
+  card.appendChild(svg);
+  card.appendChild(makeLegend(tests));
+  wrap.appendChild(card);
+}
+
+function makeLegend(tests){
+  const lg=el("div",{class:"legend"});
+  tests.forEach(t=>{
+    const item=el("div",{class:"legend-item"});
+    const sw=el("span",{class:"legend-swatch",style:{background:testColor(t),width:"14px",height:DASHED.has(t)?"2px":"4px",borderTop:DASHED.has(t)?`2px dashed ${testColor(t)}`:"none",background:DASHED.has(t)?"transparent":testColor(t)}});
+    item.appendChild(sw);
+    item.appendChild(el("span",{},fmtLabel(t)));
+    lg.appendChild(item);
+  });
+  return lg;
+}
+
+/* ── Section: Page Size ── */
+let psConcFilter=null;
+function renderPageSize(){
+  const wrap=$("#psChart");wrap.innerHTML="";
+  const pillsEl=$("#psPills");pillsEl.innerHTML="";
+  const concs=unique(FILTERED,"concurrency");
+  if(concs.length>1){
+    concs.forEach(c=>{
+      const p=el("button",{class:"pill"+(psConcFilter===c?" active":""),onclick:()=>{psConcFilter=psConcFilter===c?null:c;renderPageSize()}},String(c));
+      pillsEl.appendChild(p);
+    });
+  }
+  let data=FILTERED;
+  if(psConcFilter!==null)data=data.filter(d=>d.concurrency===psConcFilter);
+
+  const byTest=groupBy(data,"test");
+  const tests=Object.keys(byTest);
+  const pages=unique(data,"page_size");
+  if(!tests.length||!pages.length)return;
+
+  const barW=20,groupGap=24,innerGap=2;
+  const groupWidth=tests.length*barW+(tests.length-1)*innerGap;
+  const W=M.left+M.right+pages.length*(groupWidth+groupGap);
+  const H=320;
+  const svg=createSVG(Math.max(W,400),H);
+
+  const allBW=data.map(d=>d.bandwidth_gbs||0);
+  const yS=scaleLinear([0,Math.max(...allBW)*1.15],[H-M.bottom,M.top]);
+  drawGridY(svg,yS,M.left,Math.max(W,400)-M.right);
+  drawYAxis(svg,yS,M.left,"BW (GB/s)");
+
+  pages.forEach((ps,pi)=>{
+    const gx=M.left+pi*(groupWidth+groupGap)+groupGap/2;
+    /* x-label */
+    const lbl=svgEl("text",{x:gx+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
+    lbl.textContent=ps;svg.appendChild(lbl);
+
+    tests.forEach((t,ti)=>{
+      const rec=byTest[t]?.find(d=>d.page_size===ps);
+      if(!rec)return;
+      const bw=rec.bandwidth_gbs||0;
+      const bx=gx+ti*(barW+innerGap);
+      const by=yS(bw);
+      const bh=yS(0)-by;
+      const rect=svgEl("rect",{x:bx,y:by,width:barW,height:Math.max(bh,1),rx:3,fill:testColor(t),style:"cursor:pointer"});
+      rect.addEventListener("mouseenter",e=>showTip(e,`<b>${fmtLabel(t)}</b><br>page_size: ${ps}<br>BW: ${fmtBW(bw)}`));
+      rect.addEventListener("mouseleave",hideTip);
+      svg.appendChild(rect);
+    });
+  });
+  /* x-axis label */
+  const xLbl=svgEl("text",{x:(M.left+Math.max(W,400)-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"});
+  xLbl.textContent="page_size";svg.appendChild(xLbl);
+
+  const card=el("div",{class:"card"});
+  card.appendChild(svg);card.appendChild(makeLegend(tests));
+  wrap.appendChild(card);
+}
+
+/* ── Section: Bounce Buffer ── */
+function renderBounce(){
+  const wrap=$("#bounceChart");wrap.innerHTML="";
+  const data=FILTERED.filter(d=>d.bounce_blocks!=null);
+  if(!data.length){return}
+
+  const byTest=groupBy(data,"test");
+  const tests=Object.keys(byTest);
+  const W=540,H=300;
+  const svg=createSVG(W,H);
+
+  const xVals=unique(data,"bounce_blocks");
+  const allBW=data.map(d=>d.bandwidth_gbs||0);
+  const xS=scaleLinear([Math.min(...xVals),Math.max(...xVals)],[M.left,W-M.right]);
+  const yS=scaleLinear([0,Math.max(...allBW)*1.1],[H-M.bottom,M.top]);
+
+  drawGridY(svg,yS,M.left,W-M.right);
+  drawXAxis(svg,xS,H-M.bottom,"bounce_blocks");
+  drawYAxis(svg,yS,M.left,"BW (GB/s)");
+
+  tests.forEach(t=>{
+    const pts=byTest[t].slice().sort((a,b)=>a.bounce_blocks-b.bounce_blocks);
+    const mapped=pts.map(d=>[xS(d.bounce_blocks),yS(d.bandwidth_gbs),
+      `<b>${fmtLabel(t)}</b><br>bounce: ${d.bounce_blocks}<br>BW: ${fmtBW(d.bandwidth_gbs)}`]);
+    drawLine(svg,mapped,testColor(t),DASHED.has(t));
+  });
+
+  const card=el("div",{class:"card"});
+  card.appendChild(svg);card.appendChild(makeLegend(tests));
+  wrap.appendChild(card);
+}
+
+/* ── Section: G2/G3 ──
+   Grouped bar chart of bandwidth_gbs for the g2_to_g3 and g3_to_g2 tests,
+   drawn into #g2g3Chart with one group of bars per concurrency value. */
+function renderG2G3(){
+  const wrap=$("#g2g3Chart");wrap.innerHTML=""; // clear the previous chart
+  const tests=["g2_to_g3","g3_to_g2"];
+  const data=FILTERED.filter(d=>tests.includes(d.test));
+  if(!data.length)return; // neither test present: leave the container empty
+
+  const concs=unique(data,"concurrency");
+  const byTest=groupBy(data,"test");
+  const barW=24,innerGap=4,groupGap=30; // bar width, gap between bars of a group, gap between groups (px)
+  const groupWidth=tests.length*barW+(tests.length-1)*innerGap;
+  const W=M.left+M.right+concs.length*(groupWidth+groupGap); // natural width; the SVG is at least 400px wide
+  const H=320;
+  const svg=createSVG(Math.max(W,400),H);
+
+  const allBW=data.map(d=>d.bandwidth_gbs||0); // missing bandwidth counts as 0
+  const yS=scaleLinear([0,Math.max(...allBW)*1.15],[H-M.bottom,M.top]); // 15% headroom above the tallest bar
+  drawGridY(svg,yS,M.left,Math.max(W,400)-M.right);
+  drawYAxis(svg,yS,M.left,"BW (GB/s)");
+
+  concs.forEach((c,ci)=>{
+    const gx=M.left+ci*(groupWidth+groupGap)+groupGap/2; // left edge of this concurrency group
+    const lbl=svgEl("text",{x:gx+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
+    lbl.textContent=c;svg.appendChild(lbl);
+
+    tests.forEach((t,ti)=>{
+      const rec=(byTest[t]||[]).find(d=>d.concurrency===c); // first match only; other records at the same concurrency (e.g. other page sizes) are not drawn
+      if(!rec)return; // no record for this test at this concurrency: leave a gap
+      const bw=rec.bandwidth_gbs||0;
+      const bx=gx+ti*(barW+innerGap);
+      const by=yS(bw);
+      const rect=svgEl("rect",{x:bx,y:by,width:barW,height:Math.max(yS(0)-by,1),rx:3,fill:testColor(t),style:"cursor:pointer"}); // at least 1px tall so the bar stays hoverable
+      rect.addEventListener("mouseenter",e=>showTip(e,`<b>${fmtLabel(t)}</b><br>conc: ${c}<br>BW: ${fmtBW(bw)}`));
+      rect.addEventListener("mouseleave",hideTip);
+      svg.appendChild(rect);
+    });
+  });
+  const xLbl=svgEl("text",{x:(M.left+Math.max(W,400)-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"}); // x-axis title, centred under the plot
+  xLbl.textContent="Concurrency";svg.appendChild(xLbl);
+
+  const card=el("div",{class:"card"});
+  card.appendChild(svg);card.appendChild(makeLegend(tests));
+  wrap.appendChild(card);
+}
+
+/* ── Section: GDS vs Staged ──
+   Grouped bar chart of bandwidth_gbs comparing the *_gds tests with the
+   *_staged tests, drawn into #gdsChart with one group per concurrency value. */
+function renderGDS(){
+  const wrap=$("#gdsChart");wrap.innerHTML=""; // clear the previous chart
+  const gdsTests=["g1_to_g3_gds","g3_to_g1_gds"];
+  const stagedTests=["g1_to_g3_staged","g3_to_g1_staged"];
+  const allTests=[...gdsTests,...stagedTests];
+  const data=FILTERED.filter(d=>allTests.includes(d.test));
+  if(!data.filter(d=>gdsTests.includes(d.test)).length)return; // without any GDS record there is nothing to compare, even if staged records exist
+
+  const concs=unique(data,"concurrency");
+  const byTest=groupBy(data,"test");
+  const present=allTests.filter(t=>byTest[t]?.length); // tests that actually have records, in allTests order
+  const barW=22,innerGap=3,groupGap=28; // bar width, gap between bars of a group, gap between groups (px)
+  const groupWidth=present.length*barW+(present.length-1)*innerGap;
+  const W=M.left+M.right+concs.length*(groupWidth+groupGap); // natural width; the SVG is at least 400px wide
+  const H=320;
+  const svg=createSVG(Math.max(W,400),H);
+
+  const allBW=data.map(d=>d.bandwidth_gbs||0); // missing bandwidth counts as 0
+  const yS=scaleLinear([0,Math.max(...allBW)*1.15],[H-M.bottom,M.top]); // 15% headroom above the tallest bar
+  drawGridY(svg,yS,M.left,Math.max(W,400)-M.right);
+  drawYAxis(svg,yS,M.left,"BW (GB/s)");
+
+  concs.forEach((c,ci)=>{
+    const gx=M.left+ci*(groupWidth+groupGap)+groupGap/2; // left edge of this concurrency group
+    const lbl=svgEl("text",{x:gx+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
+    lbl.textContent=c;svg.appendChild(lbl);
+
+    present.forEach((t,ti)=>{
+      const rec=(byTest[t]||[]).find(d=>d.concurrency===c); // first match only; other records at the same concurrency are not drawn
+      if(!rec)return; // no record for this test at this concurrency: leave a gap
+      const bw=rec.bandwidth_gbs||0;
+      const bx=gx+ti*(barW+innerGap);
+      const by=yS(bw);
+      const rect=svgEl("rect",{x:bx,y:by,width:barW,height:Math.max(yS(0)-by,1),rx:3,fill:testColor(t),style:"cursor:pointer"}); // at least 1px tall so the bar stays hoverable
+      rect.addEventListener("mouseenter",e=>showTip(e,`<b>${fmtLabel(t)}</b><br>conc: ${c}<br>BW: ${fmtBW(bw)}`));
+      rect.addEventListener("mouseleave",hideTip);
+      svg.appendChild(rect);
+    });
+  });
+  const xLbl=svgEl("text",{x:(M.left+Math.max(W,400)-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"}); // x-axis title, centred under the plot
+  xLbl.textContent="Concurrency";svg.appendChild(xLbl);
+
+  const card=el("div",{class:"card"});
+  card.appendChild(svg);card.appendChild(makeLegend(present));
+  wrap.appendChild(card);
+}
+
+/* ── Section: Bidirectional ──
+   Grouped bar chart of bandwidth_gbs comparing the isolated g1_to_g2 / g2_to_g1
+   tests with their bidir_* counterparts, drawn into #bidirChart with one group
+   per concurrency value. Bidirectional bars are translucent with a dashed outline. */
+function renderBidir(){
+  const wrap=$("#bidirChart");wrap.innerHTML=""; // clear the previous chart
+  const bidirTests=["bidir_g1_to_g2","bidir_g2_to_g1"];
+  const isoTests=["g1_to_g2","g2_to_g1"];
+  const data=FILTERED.filter(d=>[...bidirTests,...isoTests].includes(d.test));
+  if(!data.filter(d=>bidirTests.includes(d.test)).length)return; // no bidirectional record: nothing to compare, even if isolated records exist
+
+  const concs=unique(data,"concurrency");
+  const byTest=groupBy(data,"test");
+  const present=[...isoTests,...bidirTests].filter(t=>byTest[t]?.length); // tests with records; isolated ones first
+  const barW=22,innerGap=3,groupGap=28; // bar width, gap between bars of a group, gap between groups (px)
+  const groupWidth=present.length*barW+(present.length-1)*innerGap;
+  const W=M.left+M.right+concs.length*(groupWidth+groupGap); // natural width; the SVG is at least 400px wide
+  const H=320;
+  const svg=createSVG(Math.max(W,400),H);
+
+  const allBW=data.map(d=>d.bandwidth_gbs||0); // missing bandwidth counts as 0
+  const yS=scaleLinear([0,Math.max(...allBW)*1.15],[H-M.bottom,M.top]); // 15% headroom above the tallest bar
+  drawGridY(svg,yS,M.left,Math.max(W,400)-M.right);
+  drawYAxis(svg,yS,M.left,"BW (GB/s)");
+
+  concs.forEach((c,ci)=>{
+    const gx=M.left+ci*(groupWidth+groupGap)+groupGap/2; // left edge of this concurrency group
+    const lbl=svgEl("text",{x:gx+groupWidth/2,y:H-M.bottom+18,"text-anchor":"middle",fill:"#777","font-size":"11",style:"font-family:var(--font)"});
+    lbl.textContent=c;svg.appendChild(lbl);
+
+    present.forEach((t,ti)=>{
+      const rec=(byTest[t]||[]).find(d=>d.concurrency===c); // first match only; other records at the same concurrency are not drawn
+      if(!rec)return; // no record for this test at this concurrency: leave a gap
+      const bw=rec.bandwidth_gbs||0;
+      const bx=gx+ti*(barW+innerGap);
+      const by=yS(bw);
+      const hatch=DASHED.has(t); // true for the bidir_* tests
+      const rect=svgEl("rect",{x:bx,y:by,width:barW,height:Math.max(yS(0)-by,1),rx:3,fill:testColor(t),opacity:hatch?"0.6":"1",style:"cursor:pointer"}); // at least 1px tall so the bar stays hoverable
+      if(hatch){
+        rect.setAttribute("stroke",testColor(t));
+        rect.setAttribute("stroke-width","2");
+        rect.setAttribute("stroke-dasharray","4,3");
+      }
+      rect.addEventListener("mouseenter",e=>showTip(e,`<b>${fmtLabel(t)}</b><br>conc: ${c}<br>BW: ${fmtBW(bw)}`));
+      rect.addEventListener("mouseleave",hideTip);
+      svg.appendChild(rect);
+    });
+  });
+  const xLbl=svgEl("text",{x:(M.left+Math.max(W,400)-M.right)/2,y:H-4,"text-anchor":"middle",fill:"#999","font-size":"11",style:"font-family:var(--font)"}); // x-axis title, centred under the plot
+  xLbl.textContent="Concurrency";svg.appendChild(xLbl);
+
+  const card=el("div",{class:"card"});
+  card.appendChild(svg);card.appendChild(makeLegend(present));
+  wrap.appendChild(card);
+}
+
+/* ── Section: Latency Distribution ──
+   Horizontal box-style plot drawn into #latChart, one row per test, built from
+   the min/p50/p95/p99/max fields of latency_us. */
+function renderLatency(){
+  const wrap=$("#latChart");wrap.innerHTML=""; // clear the previous chart
+  const data=FILTERED.filter(d=>d.latency_us); // only records that carry latency stats
+  if(!data.length)return;
+
+  /* One row per test. Each row shows a single representative record: the one whose
+     mean_us is the median among that test's records (selected inside the loop below). */
+  const byTest=groupBy(data,"test");
+  const tests=Object.keys(byTest);
+  const rowH=36,padY=8; // row height and vertical gap between rows (px)
+  const W=700,H=M.top+tests.length*(rowH+padY)+M.bottom;
+  const svg=createSVG(W,H);
+
+  /* Find global latency range (over every record, not just the representatives) */
+  let gMin=Infinity,gMax=0; // NOTE(review): gMin is computed but never read; the x axis always starts at 0
+  tests.forEach(t=>{
+    byTest[t].forEach(d=>{
+      const l=d.latency_us;
+      if(l.min_us<gMin)gMin=l.min_us;
+      if(l.max_us>gMax)gMax=l.max_us;
+    });
+  });
+  const xS=scaleLinear([0,gMax*1.05],[M.left+100,W-M.right]); // 5% headroom; 100px reserved on the left for row labels
+
+  /* x-axis */
+  drawXAxis(svg,xS,H-M.bottom,"Latency (us)");
+
+  tests.forEach((t,i)=>{
+    /* Representative record: order this test's records by mean latency and take the middle one (the upper of the two for an even count) */
+    const recs=byTest[t].slice().sort((a,b)=>(a.latency_us.mean_us)-(b.latency_us.mean_us));
+    const mid=recs[Math.floor(recs.length/2)];
+    const l=mid.latency_us;
+    const cy=M.top+i*(rowH+padY)+rowH/2; // vertical centre of this row
+    const color=testColor(t);
+
+    /* label */
+    const lbl=svgEl("text",{x:M.left+94,y:cy+4,"text-anchor":"end",fill:color,"font-size":"12","font-weight":"600",style:"font-family:var(--font)"});
+    lbl.textContent=fmtLabel(t);svg.appendChild(lbl);
+
+    /* whisker: min to max */
+    svg.appendChild(svgEl("line",{x1:xS(l.min_us),x2:xS(l.max_us),y1:cy,y2:cy,stroke:color,"stroke-width":1}));
+    /* caps */
+    svg.appendChild(svgEl("line",{x1:xS(l.min_us),x2:xS(l.min_us),y1:cy-8,y2:cy+8,stroke:color,"stroke-width":1.5}));
+    svg.appendChild(svgEl("line",{x1:xS(l.max_us),x2:xS(l.max_us),y1:cy-8,y2:cy+8,stroke:color,"stroke-width":1.5}));
+
+    /* box: p50 to p95 (at least 2px wide so it stays visible) */
+    const bx=xS(l.p50_us),bx2=xS(l.p95_us);
+    const boxRect=svgEl("rect",{x:bx,y:cy-10,width:Math.max(bx2-bx,2),height:20,rx:4,fill:color,opacity:"0.25",stroke:color,"stroke-width":1.5});
+    svg.appendChild(boxRect);
+
+    /* p99 mark */
+    const p99x=xS(l.p99_us);
+    svg.appendChild(svgEl("line",{x1:p99x,x2:p99x,y1:cy-10,y2:cy+10,stroke:color,"stroke-width":2,"stroke-dasharray":"3,2"}));
+
+    /* median line */
+    const medX=xS(l.p50_us);
+    svg.appendChild(svgEl("line",{x1:medX,x2:medX,y1:cy-10,y2:cy+10,stroke:color,"stroke-width":2.5}));
+
+    /* hover target: transparent rect over the whole whisker, appended last so it sits on top */
+    const hover=svgEl("rect",{x:xS(l.min_us)-2,y:cy-12,width:xS(l.max_us)-xS(l.min_us)+4,height:24,fill:"transparent",style:"cursor:pointer"});
+    hover.addEventListener("mouseenter",e=>showTip(e,
+      `<b>${fmtLabel(t)}</b> (conc=${mid.concurrency}, page=${mid.page_size})<br>`+
+      `min: ${fmtUs(l.min_us)}<br>p50: ${fmtUs(l.p50_us)}<br>p95: ${fmtUs(l.p95_us)}<br>p99: ${fmtUs(l.p99_us)}<br>max: ${fmtUs(l.max_us)}`));
+    hover.addEventListener("mouseleave",hideTip);
+    svg.appendChild(hover);
+  });
+
+  /* Legend for box plot parts */
+  const card=el("div",{class:"card"});
+  card.appendChild(svg);
+  const lgDiv=el("div",{class:"legend",style:{marginTop:"6px",fontSize:"11px",color:"#999"}});
+  lgDiv.innerHTML="<span>Whiskers: min/max</span> &middot; <span>Box: p50&ndash;p95</span> &middot; <span>Dashed: p99</span> &middot; <span>Bold line: p50</span>";
+  card.appendChild(lgDiv);
+  wrap.appendChild(card);
+}
+
+/* ── Section: Raw Data Table ── */
+const TABLE_COLS=[
+  {key:"test",label:"Test",fmt:v=>v},
+  {key:"device_id",label:"GPU",fmt:v=>v},
+  {key:"page_size",label:"Page Size",fmt:v=>v},
+  {key:"blocks_per_batch",label:"Blocks/Batch",fmt:v=>v},
+  {key:"concurrency",label:"Conc",fmt:v=>v},
+  {key:"bounce_blocks",label:"Bounce",fmt:v=>v==null?"-":v},
+  {key:"bandwidth_gbs",label:"BW (GB/s)",fmt:v=>v?.toFixed(2)??""},
+  {key:"aggregate_bandwidth_gbs",label:"Agg BW",fmt:v=>v?.toFixed(2)??""},
+  {key:"_lat_mean",label:"Lat mean",fmt:v=>v?.toFixed(1)??""},
+  {key:"_lat_p50",label:"Lat p50",fmt:v=>v?.toFixed(1)??""},
+  {key:"_lat_p99",label:"Lat p99",fmt:v=>v?.toFixed(1)??""},
+  {key:"iterations",label:"Iters",fmt:v=>v},
+];
+function getVal(d,key){
+  if(key==="_lat_mean")return d.latency_us?.mean_us;
+  if(key==="_lat_p50")return d.latency_us?.p50_us;
+  if(key==="_lat_p99")return d.latency_us?.p99_us;
+  return d[key];
+}
+function renderTable(){ // Rebuild the raw-data table in #tableWrap from FILTERED, honouring the current sortCol/sortAsc state
+  const wrap=$("#tableWrap");wrap.innerHTML="";
+  if(!FILTERED.length)return; // no records: leave the wrapper empty
+
+  let rows=FILTERED.slice(); // sort a copy so FILTERED keeps its load order
+  if(sortCol!==null){
+    rows.sort((a,b)=>{
+      let va=getVal(a,sortCol),vb=getVal(b,sortCol);
+      if(va==null)va=sortAsc?Infinity:-Infinity; // missing values are mapped to the extreme that sorts last in the current direction
+      if(vb==null)vb=sortAsc?Infinity:-Infinity;
+      if(typeof va==="string")return sortAsc?va.localeCompare(vb):vb.localeCompare(va); // NOTE(review): assumes vb is also a string; a missing vb has been replaced by a number at this point
+      return sortAsc?(va-vb):(vb-va);
+    });
+  }
+
+  const table=el("table");
+  const thead=el("thead");
+  const tr=el("tr");
+  TABLE_COLS.forEach(c=>{
+    const th=el("th",{onclick:()=>{ // header click: toggle direction on the active column, otherwise sort ascending by the clicked one
+      if(sortCol===c.key)sortAsc=!sortAsc;else{sortCol=c.key;sortAsc=true;}
+      renderTable();
+    }});
+    th.innerHTML=c.label+(sortCol===c.key?` <span class="sort-arrow">${sortAsc?"\u25B2":"\u25BC"}</span>`:""); // labels are the static TABLE_COLS strings; the arrow marks the active sort column
+    tr.appendChild(th);
+  });
+  thead.appendChild(tr);table.appendChild(thead);
+
+  const tbody=el("tbody");
+  rows.forEach(d=>{
+    const row=el("tr");
+    TABLE_COLS.forEach(c=>{
+      const td=el("td");
+      const v=getVal(d,c.key);
+      td.textContent=c.fmt(v); // textContent, so record values are never interpreted as markup
+      if(c.key==="test")td.style.color=testColor(v); // tint the test name with its chart color
+      row.appendChild(td);
+    });
+    tbody.appendChild(row);
+  });
+  table.appendChild(tbody);
+  wrap.appendChild(table);
+}
+
+/* ── Section auto-hide ── */
+function hideEmptySections(){
+  const checks={
+    bounce:FILTERED.some(d=>d.bounce_blocks!=null),
+    g2g3:FILTERED.some(d=>["g2_to_g3","g3_to_g2"].includes(d.test)),
+    gds:FILTERED.some(d=>["g1_to_g3_gds","g3_to_g1_gds"].includes(d.test)),
+    bidir:FILTERED.some(d=>d.test.startsWith("bidir_")),
+  };
+  for(const[id,show]of Object.entries(checks)){
+    const sec=document.getElementById(id);
+    if(sec)sec.style.display=show?"":"none";
+  }
+}
+
+/* ── Nav IntersectionObserver ── */
+const navLinks=$$("nav a[data-sec]");
+const secEls=navLinks.map(a=>document.getElementById(a.dataset.sec)).filter(Boolean);
+const obsOpts={rootMargin:"-56px 0px -60% 0px",threshold:0};
+const observer=new IntersectionObserver(entries=>{
+  entries.forEach(en=>{
+    if(en.isIntersecting){
+      navLinks.forEach(a=>a.classList.toggle("active",a.dataset.sec===en.target.id));
+    }
+  });
+},obsOpts);
+secEls.forEach(s=>observer.observe(s));
+</script>
+</body>
+</html>
--- a/lib/kvbm-engine/scripts/test-s3.sh
+++ b/lib/kvbm-engine/scripts/test-s3.sh
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Runs S3 integration tests against a local MinIO instance.
+# Usage: bash lib/kvbm-engine/scripts/test-s3.sh
+#
+# Starts a throwaway MinIO container (host port MINIO_PORT, default 9876),
+# waits for its health endpoint, runs the `s3_integration` tests of kvbm-engine
+# with the `testing-s3` feature, and stops the container on exit. The script's
+# exit status is the test run's exit status.
+
+set -euo pipefail
+
+# Unique container name so concurrent runs do not collide.
+MINIO_CONTAINER="kvbm-minio-test-$$-$RANDOM"
+MINIO_PORT="${MINIO_PORT:-9876}"
+MINIO_ROOT_USER="minioadmin"
+MINIO_ROOT_PASSWORD="minioadmin"
+
+# Stop the container on any exit path; it was started with --rm, so stopping
+# also removes it. Failures are ignored (the container may never have started).
+cleanup() {
+    echo "Cleaning up MinIO container: $MINIO_CONTAINER"
+    docker stop "$MINIO_CONTAINER" 2>/dev/null || true
+}
+trap cleanup EXIT
+
+echo "Starting MinIO container: $MINIO_CONTAINER on port $MINIO_PORT"
+docker run --rm -d \
+    --name "$MINIO_CONTAINER" \
+    -p "${MINIO_PORT}:9000" \
+    -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \
+    -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \
+    minio/minio:latest server /data
+
+# Wait for MinIO to be ready (poll the liveness endpoint once per second)
+echo "Waiting for MinIO to be ready..."
+for i in $(seq 1 30); do
+    if curl -sf "http://localhost:${MINIO_PORT}/minio/health/live" >/dev/null 2>&1; then
+        echo "MinIO is ready."
+        break
+    fi
+    if [ "$i" -eq 30 ]; then
+        echo "ERROR: MinIO failed to start within 30 seconds" >&2
+        exit 1
+    fi
+    sleep 1
+done
+
+export S3_TEST_ENDPOINT="http://localhost:${MINIO_PORT}"
+export AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}"
+export AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}"
+export AWS_DEFAULT_REGION="us-east-1"
+
+echo "Running S3 integration tests..."
+# Under `set -e` a failing command aborts the script immediately, so a plain
+# `exit_code=$?` on the next line would never run on failure. Capturing the
+# status through `||` keeps the script alive so the result is always reported.
+exit_code=0
+timeout 120 cargo test -p kvbm-engine --features testing-s3 -- s3_integration || exit_code=$?
+
+echo "Tests finished with exit code: $exit_code"
+exit "$exit_code"
--- a/lib/kvbm-engine/src/collectives/bootstrap.rs
+++ b/lib/kvbm-engine/src/collectives/bootstrap.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! NCCL bootstrap utilities for creating communicators from scratch.
+//!
+//! This module provides helpers for initializing NCCL communicators in standalone
+//! Rust applications and tests, where no external launcher (like PyTorch) provides
+//! pre-initialized communicators.
+//!
+//! # Two Construction Paths
+//!
+//! NCCL communicators can be created via two paths:
+//!
+//! 1. **Bootstrap (this module)**: For tests and standalone Rust applications.
+//!    Rank 0 generates a unique ID, distributes it to other ranks, and all
+//!    ranks collectively call `ncclCommInitRank`.
+//!
+//! 2. **Borrowed handles**: For production use with PyTorch, vLLM, or TensorRT-LLM.
+//!    The external runtime creates the communicator, and Rust code borrows it
+//!    via FFI. See [`NcclCollectives::from_borrowed`].
+//!
+//! # Example: Multi-process Bootstrap
+//!
+//! ```rust,ignore
+//! use kvbm::v2::distributed::collectives::NcclBootstrap;
+//!
+//! // Rank 0 generates the unique ID exactly once and shares it; every other
+//! // rank rebuilds the same bootstrap from the bytes it receives.
+//! let bootstrap = if rank == 0 {
+//!     let bootstrap = NcclBootstrap::generate(world_size)?;
+//!     let bytes = bootstrap.serialize();
+//!     // Send `bytes` to other ranks via your IPC mechanism
+//!     bootstrap
+//! } else {
+//!     let bytes = receive_from_rank_0();
+//!     NcclBootstrap::deserialize(&bytes)?
+//! };
+//!
+//! // All ranks: Initialize communicator
+//! let comm = bootstrap.init_communicator(rank, stream)?;
+//! ```
+
+use std::ffi::c_char;
+use std::mem::MaybeUninit;
+
+/// Platform-neutral byte type for NCCL's `ncclUniqueId::internal` field.
+/// `c_char` is `i8` on x86_64 and `u8` on aarch64.
+///
+/// Going through this alias lets the byte conversions in this module
+/// (`as u8` when serializing, `as NcclByte` when deserializing) be written
+/// once, independent of the platform's `c_char` signedness.
+type NcclByte = c_char;
+
+use anyhow::{Context, Result};
+use cudarc::driver::sys::CUstream;
+use cudarc::nccl::sys::{
+    ncclComm_t, ncclCommInitRank, ncclGetUniqueId, ncclResult_t, ncclUniqueId,
+};
+
+/// Bootstrap for creating NCCL communicators from scratch.
+///
+/// Used by tests and standalone Rust applications where NCCL communicators
+/// need to be created without an external launcher.
+///
+/// # Workflow
+///
+/// 1. Rank 0 calls [`NcclBootstrap::generate`] to create the unique ID
+/// 2. Rank 0 serializes via [`NcclBootstrap::serialize`] and sends to other ranks
+/// 3. Other ranks deserialize via [`NcclBootstrap::deserialize`]
+/// 4. All ranks collectively call [`NcclBootstrap::init_communicator`]
+///
+/// # Thread Safety
+///
+/// The bootstrap object itself is not thread-safe, but multiple threads can
+/// each have their own bootstrap object with the same unique ID to initialize
+/// communicators on different devices.
+#[derive(Clone)]
+pub struct NcclBootstrap {
+    /// NCCL unique ID: produced by `ncclGetUniqueId` in
+    /// [`NcclBootstrap::generate`] or rebuilt from bytes in
+    /// [`NcclBootstrap::deserialize`], and passed to `ncclCommInitRank`.
+    nccl_id: ncclUniqueId,
+    /// Total number of ranks in the collective group.
+    world_size: usize,
+}
+
+impl NcclBootstrap {
+    /// Create the bootstrap on rank 0 by asking NCCL for a fresh unique ID.
+    ///
+    /// The resulting ID has to reach every other rank (see
+    /// [`NcclBootstrap::serialize`]) before any of them can initialize a
+    /// communicator.
+    ///
+    /// # Arguments
+    /// * `world_size` - Total number of ranks in the collective group; must
+    ///   lie in `1..=i32::MAX`
+    ///
+    /// # Returns
+    /// A bootstrap object that can be serialized and distributed to other ranks.
+    ///
+    /// # Errors
+    /// Returns an error if `world_size` is out of range or if NCCL fails to
+    /// generate a unique ID.
+    pub fn generate(world_size: usize) -> Result<Self> {
+        let valid_sizes = 1..=i32::MAX as usize;
+        anyhow::ensure!(
+            valid_sizes.contains(&world_size),
+            "world_size must be in 1..={}, got {}",
+            i32::MAX,
+            world_size
+        );
+
+        let mut id_slot = MaybeUninit::<ncclUniqueId>::uninit();
+
+        // SAFETY: `id_slot` is valid, writable storage for one `ncclUniqueId`,
+        // which ncclGetUniqueId fills in.
+        let status = unsafe { ncclGetUniqueId(id_slot.as_mut_ptr()) };
+        check_nccl_result(status).context("Failed to generate NCCL unique ID")?;
+
+        Ok(Self {
+            // SAFETY: the success status checked above means NCCL has
+            // initialized the struct.
+            nccl_id: unsafe { id_slot.assume_init() },
+            world_size,
+        })
+    }
+
+    /// Get the world size for this bootstrap.
+    ///
+    /// This is the value given to [`NcclBootstrap::generate`] or decoded by
+    /// [`NcclBootstrap::deserialize`].
+    pub fn world_size(&self) -> usize {
+        self.world_size
+    }
+
+    /// Encode this bootstrap as bytes so it can be sent to the other ranks.
+    ///
+    /// Wire layout:
+    /// - 8 bytes: world_size as little-endian u64
+    /// - 128 bytes: NCCL unique ID internal data
+    ///
+    /// # Returns
+    /// A byte vector that can be transmitted via any IPC mechanism and decoded
+    /// with [`NcclBootstrap::deserialize`].
+    pub fn serialize(&self) -> Vec<u8> {
+        let header = (self.world_size as u64).to_le_bytes();
+        // Reinterpret each NcclByte as u8 (a bit-preserving cast).
+        let id_bytes = self.nccl_id.internal.iter().map(|&raw| raw as u8);
+
+        let mut encoded = Vec::with_capacity(8 + 128);
+        encoded.extend_from_slice(&header);
+        encoded.extend(id_bytes);
+        encoded
+    }
+
+    /// Deserialize a bootstrap received from rank 0.
+    ///
+    /// Expects the layout produced by [`NcclBootstrap::serialize`]: an 8-byte
+    /// little-endian `u64` world size followed by the 128 bytes of the NCCL
+    /// unique ID.
+    ///
+    /// # Arguments
+    /// * `bytes` - Serialized bootstrap data from [`NcclBootstrap::serialize`]
+    ///
+    /// # Returns
+    /// A bootstrap object that can be used to initialize a communicator.
+    ///
+    /// # Errors
+    /// Returns an error if the byte array has incorrect length, or if the
+    /// encoded world size is outside `1..=i32::MAX` (the same range
+    /// [`NcclBootstrap::generate`] accepts).
+    pub fn deserialize(bytes: &[u8]) -> Result<Self> {
+        if bytes.len() != 8 + 128 {
+            anyhow::bail!(
+                "Invalid bootstrap data length: expected {}, got {}",
+                8 + 128,
+                bytes.len()
+            );
+        }
+
+        // The length check above guarantees an 8-byte header and a 128-byte ID.
+        let (header, id_bytes) = bytes.split_at(8);
+
+        let mut world_size_le = [0u8; 8];
+        world_size_le.copy_from_slice(header);
+        let world_size = u64::from_le_bytes(world_size_le);
+
+        // Reject corrupt or hostile input up front instead of handing out a
+        // bootstrap that can never initialize a communicator. Validating the
+        // u64 before narrowing also avoids silent truncation on 32-bit targets.
+        anyhow::ensure!(
+            (1..=i32::MAX as u64).contains(&world_size),
+            "world_size must be in 1..={}, got {}",
+            i32::MAX,
+            world_size
+        );
+        // Lossless: the value is bounded by i32::MAX.
+        let world_size = world_size as usize;
+
+        let mut nccl_id = ncclUniqueId {
+            internal: [0 as NcclByte; 128],
+        };
+        // Copy the ID bytes, reinterpreting each u8 as the platform's NcclByte.
+        for (dst, &src) in nccl_id.internal.iter_mut().zip(id_bytes) {
+            *dst = src as NcclByte;
+        }
+
+        Ok(Self {
+            nccl_id,
+            world_size,
+        })
+    }
+
+    /// Create this rank's NCCL communicator via `ncclCommInitRank`.
+    ///
+    /// This is a **collective operation** - all ranks must call this method
+    /// simultaneously with the same bootstrap data for initialization to
+    /// succeed. If some rank never calls it, the call does not return an
+    /// error; it hangs.
+    ///
+    /// # Arguments
+    /// * `rank` - The rank of this worker (0 to world_size-1)
+    /// * `_stream` - CUDA stream intended for NCCL operations; the current
+    ///   body does not use it
+    ///
+    /// # Returns
+    /// An NCCL communicator handle that can be used for collective operations.
+    ///
+    /// # Ownership
+    /// The returned communicator must be destroyed with `ncclCommDestroy` when
+    /// no longer needed. The caller is responsible for lifetime management.
+    ///
+    /// # Errors
+    /// Returns an error if:
+    /// - `rank` is >= `world_size`
+    /// - `world_size` exceeds `i32::MAX`
+    /// - NCCL initialization fails (e.g., network issues, GPU errors)
+    pub fn init_communicator(&self, rank: usize, _stream: CUstream) -> Result<ncclComm_t> {
+        anyhow::ensure!(
+            rank < self.world_size,
+            "Rank {} is invalid for world_size {}",
+            rank,
+            self.world_size
+        );
+        let nranks = i32::try_from(self.world_size)
+            .map_err(|_| anyhow::anyhow!("world_size {} exceeds i32::MAX", self.world_size))?;
+        // Lossless: rank < world_size <= i32::MAX was established above.
+        let this_rank = rank as i32;
+
+        let mut comm_slot = MaybeUninit::<ncclComm_t>::uninit();
+
+        // SAFETY: `comm_slot` is valid storage for the handle NCCL writes.
+        // ncclCommInitRank is a collective call; every rank must pass the same
+        // nccl_id for it to complete.
+        let status =
+            unsafe { ncclCommInitRank(comm_slot.as_mut_ptr(), nranks, self.nccl_id, this_rank) };
+        check_nccl_result(status).context("Failed to initialize NCCL communicator")?;
+
+        // SAFETY: the success status checked above means the handle was written.
+        let comm = unsafe { comm_slot.assume_init() };
+
+        tracing::debug!(
+            rank,
+            world_size = self.world_size,
+            "NCCL communicator initialized"
+        );
+
+        Ok(comm)
+    }
+}
+
+/// Translate an NCCL status code into an `anyhow::Result`.
+///
+/// `ncclSuccess` maps to `Ok(())`; any other status becomes an error whose
+/// message carries the status's `Debug` representation.
+pub(crate) fn check_nccl_result(result: ncclResult_t) -> Result<()> {
+    match result {
+        ncclResult_t::ncclSuccess => Ok(()),
+        failure => Err(anyhow::anyhow!("NCCL error: {:?}", failure)),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build a bootstrap around a synthetic unique ID.
+    ///
+    /// No NCCL function is called (ncclGetUniqueId needs a working NCCL), so
+    /// these tests exercise only the serialization logic. Byte `i` of the ID
+    /// is `2 * i + 1`: 128 distinct values, half of them above `0x7f`. A
+    /// uniform fill would hide reordering bugs and sign-conversion bugs on
+    /// platforms where `c_char` is signed.
+    fn dummy_bootstrap(world_size: usize) -> NcclBootstrap {
+        NcclBootstrap {
+            nccl_id: ncclUniqueId {
+                internal: std::array::from_fn(|i| (2 * i + 1) as u8 as NcclByte),
+            },
+            world_size,
+        }
+    }
+
+    #[test]
+    fn test_bootstrap_serialization_roundtrip() {
+        let world_size = 4;
+        let original = dummy_bootstrap(world_size);
+
+        let bytes = original.serialize();
+        assert_eq!(bytes.len(), 8 + 128);
+
+        // Header: world_size as a little-endian u64.
+        assert_eq!(&bytes[..8], &(world_size as u64).to_le_bytes()[..]);
+        // Payload: the ID bytes, in order and bit-for-bit.
+        for (i, &byte) in bytes[8..].iter().enumerate() {
+            assert_eq!(byte, (2 * i + 1) as u8, "ID byte {i} was altered");
+        }
+
+        let deserialized = NcclBootstrap::deserialize(&bytes).unwrap();
+        assert_eq!(deserialized.world_size, world_size);
+        assert_eq!(deserialized.nccl_id.internal, original.nccl_id.internal);
+    }
+
+    #[test]
+    fn test_deserialize_invalid_length() {
+        let bytes = vec![0u8; 10]; // Wrong length
+        let result = NcclBootstrap::deserialize(&bytes);
+        assert!(result.is_err());
+    }
+}
--- a/lib/kvbm-engine/src/collectives/mod.rs
+++ b/lib/kvbm-engine/src/collectives/mod.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Collective communication operations for distributed workers.
+//!
+//! This module provides infrastructure for collective operations needed by
+//! replicated data workers. It defines the [`CollectiveOps`] trait and provides
+//! multiple implementations:
+//!
+//! - [`StubCollectiveOps`]: No-op implementation for testing and single-worker scenarios
+//! - [`NcclCollectives`]: NCCL-based implementation for GPU collective operations (requires `nccl` feature)
+//!
+//! # Architecture
+//!
+//! In MLA (Multi-head Latent Attention) scenarios, KV blocks are replicated across
+//! all workers rather than sharded. This means only rank 0 needs G2/G3 storage -
+//! other ranks receive data via broadcast from rank 0 after it loads from G2/G3.
+//!
+//! ```text
+//! Rank 0:   G3 (disk) ←→ G2 (host) ←→ G1 (GPU) ───broadcast──→ Other ranks G1
+//! Rank 1-N: [no G2/G3]                G1 (GPU) ←──────────────────────┘
+//! ```
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use kvbm::v2::distributed::collectives::{CollectiveOps, StubCollectiveOps};
+//!
+//! let collective = StubCollectiveOps::new(events);
+//!
+//! // Broadcast G1 blocks from rank 0 to all ranks
+//! let notification = collective.broadcast(
+//!     LogicalLayoutHandle::G1,
+//!     LogicalLayoutHandle::G1,
+//!     &src_block_ids,
+//!     &dst_block_ids,
+//!     Some(0..32),
+//! )?;
+//! notification.await_completion()?;
+//! ```
+
+mod stub;
+
+#[cfg(feature = "nccl")]
+mod bootstrap;
+#[cfg(feature = "nccl")]
+mod nccl;
+
+pub use stub::StubCollectiveOps;
+
+#[cfg(feature = "nccl")]
+pub use bootstrap::NcclBootstrap;
+#[cfg(feature = "nccl")]
+pub use nccl::{CudaEventRegistrar, LayoutResolver, NcclCollectives};
+
+use std::ops::Range;
+
+use anyhow::Result;
+
+use crate::BlockId;
+use kvbm_common::LogicalLayoutHandle;
+use kvbm_physical::transfer::TransferCompleteNotification;
+
+/// Collective communication operations for distributed workers.
+///
+/// This trait defines the collective operations needed by replicated data workers
+/// to broadcast data across ranks. Implementations may use NCCL, NIXL, or other
+/// collective communication libraries.
+///
+/// # Thread Safety
+///
+/// Implementations must be `Send + Sync` to allow sharing across threads.
+/// NCCL operations are inherently thread-safe when used correctly (one stream
+/// per communicator per thread).
+pub trait CollectiveOps: Send + Sync {
+    /// Broadcast blocks from rank 0 to all other ranks.
+    ///
+    /// This operation transfers the specified blocks from the source layout on
+    /// rank 0 to the destination layout on all other ranks. Optionally, a layer
+    /// range can be specified to transfer only a subset of layers (for pipelined
+    /// loading).
+    ///
+    /// # Arguments
+    /// * `src` - The source logical layout (typically G1 on rank 0)
+    /// * `dst` - The destination logical layout (typically G1 on all ranks)
+    /// * `src_block_ids` - The block IDs to read from on the source
+    /// * `dst_block_ids` - The block IDs to write to on the destination
+    /// * `layer_range` - Optional range of layers to transfer. If None, all layers are transferred.
+    ///
+    /// NOTE(review): `src_block_ids` and `dst_block_ids` are presumably expected
+    /// to have the same length, so that every rank issues the same number of
+    /// transfers - the signature does not enforce this; confirm against callers.
+    ///
+    /// # Returns
+    /// A notification that completes when the broadcast is done on all ranks.
+    ///
+    /// # Errors
+    /// Returns an error if the implementation cannot set up or submit the
+    /// broadcast; the exact conditions are implementation-defined.
+    ///
+    /// # Synchronization
+    ///
+    /// This is a collective operation - all ranks must call this method with
+    /// the same arguments for the broadcast to complete correctly. The returned
+    /// notification signals local completion; global completion is guaranteed
+    /// by the collective semantics of the underlying implementation.
+    fn broadcast(
+        &self,
+        src: LogicalLayoutHandle,
+        dst: LogicalLayoutHandle,
+        src_block_ids: &[BlockId],
+        dst_block_ids: &[BlockId],
+        layer_range: Option<Range<usize>>,
+    ) -> Result<TransferCompleteNotification>;
+
+    /// Get the rank of this worker in the collective group.
+    ///
+    /// Rank 0 is the root (sender) of [`CollectiveOps::broadcast`].
+    fn rank(&self) -> usize;
+
+    /// Get the total number of workers in the collective group.
+    fn world_size(&self) -> usize;
+}
--- a/lib/kvbm-engine/src/collectives/nccl.rs
+++ b/lib/kvbm-engine/src/collectives/nccl.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! NCCL-based collective operations for GPU-to-GPU communication.
+//!
+//! This module provides [`NcclCollectives`], an implementation of [`CollectiveOps`]
+//! that uses NVIDIA NCCL for efficient GPU collective communication.
+//!
+//! # Construction Paths
+//!
+//! NCCL communicators can be obtained via two paths:
+//!
+//! ## Path A: Bootstrap (tests and standalone Rust apps)
+//!
+//! Use [`NcclCollectives::from_bootstrap`] when creating communicators from scratch:
+//!
+//! ```rust,ignore
+//! let bootstrap = NcclBootstrap::generate(world_size)?;
+//! // ... distribute bootstrap to other ranks ...
+//! let collectives = NcclCollectives::from_bootstrap(
+//!     &bootstrap,
+//!     rank,
+//!     cuda_context,
+//!     event_system,
+//!     event_registrar,
+//!     layout_resolver,
+//! )?;
+//! ```
+//!
+//! ## Path B: Borrowed handles (production with PyTorch/vLLM/TensorRT-LLM)
+//!
+//! Use [`NcclCollectives::from_borrowed`] when an external runtime provides the communicator:
+//!
+//! ```rust,ignore
+//! // In Python: comm_ptr = dist._get_default_group()._get_backend()._nccl_comm.as_int()
+//! let collectives = unsafe {
+//!     NcclCollectives::from_borrowed(
+//!         comm_ptr,
+//!         stream_ptr,
+//!         rank,
+//!         world_size,
+//!         cuda_context,
+//!         event_system,
+//!         event_registrar,
+//!         layout_resolver,
+//!     )
+//! };
+//! ```
+//!
+//! # Thread Safety
+//!
+//! NCCL operations are thread-safe when each thread uses its own stream. This
+//! implementation uses a dedicated NCCL stream per `NcclCollectives` instance.
+
+use std::ops::Range;
+use std::sync::Arc;
+
+use anyhow::{Context, Result};
+use cudarc::driver::sys::CUstream;
+use cudarc::driver::{CudaContext, CudaEvent, CudaStream};
+use cudarc::nccl::sys::{
+    ncclBcast, ncclComm_t, ncclCommDestroy, ncclDataType_t, ncclGroupEnd, ncclGroupStart,
+};
+use velo::EventManager;
+
+use crate::BlockId;
+use kvbm_common::LogicalLayoutHandle;
+use kvbm_physical::layout::PhysicalLayout;
+use kvbm_physical::transfer::TransferCompleteNotification;
+
+use super::CollectiveOps;
+use super::bootstrap::{NcclBootstrap, check_nccl_result};
+
+/// Trait for resolving logical layout handles to physical layouts.
+///
+/// This trait decouples [`NcclCollectives`] from [`PhysicalWorker`], allowing
+/// the collective operations to work with any layout resolution strategy.
+///
+/// [`NcclCollectives`] calls it from its `broadcast` implementation to find
+/// the physical layout whose memory regions are handed to NCCL.
+pub trait LayoutResolver: Send + Sync {
+    /// Resolve a logical layout handle to a physical layout.
+    ///
+    /// # Arguments
+    /// * `logical` - The logical layout handle (G1, G2, G3)
+    ///
+    /// # Returns
+    /// The physical layout for the given logical handle, or an error if not found.
+    fn resolve_layout(&self, logical: LogicalLayoutHandle) -> Result<PhysicalLayout>;
+}
+
+/// Trait for registering CUDA events for completion notification.
+///
+/// This trait abstracts the CUDA event registration mechanism, allowing
+/// [`NcclCollectives`] to integrate with different event polling systems.
+/// Implementations should use efficient background polling rather than
+/// spawning individual tasks per event.
+///
+/// The primary implementation wraps `TransferContext::register_cuda_event`,
+/// which uses a shared background task for polling multiple events.
+///
+/// [`NcclCollectives`] uses it only when it owns its NCCL stream: after
+/// queuing a broadcast it records an event on that stream and registers it here.
+pub trait CudaEventRegistrar: Send + Sync {
+    /// Register a CUDA event for completion notification.
+    ///
+    /// The returned notification will complete when the CUDA event has been
+    /// signaled (i.e., all operations recorded before the event have completed).
+    ///
+    /// # Arguments
+    /// * `event` - The CUDA event to monitor; it is taken by value, so
+    ///   ownership passes to the registrar
+    ///
+    /// # Returns
+    /// A notification that completes when the event is signaled.
+    fn register_cuda_event(&self, event: CudaEvent) -> TransferCompleteNotification;
+}
+
+/// Ownership mode for the NCCL communicator.
+///
+/// Consulted by the `Drop` impl of [`NcclCollectives`]: only an `Owned`
+/// communicator is passed to `ncclCommDestroy`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum CommOwnership {
+    /// We own the communicator and must destroy it on drop.
+    /// Set by [`NcclCollectives::from_bootstrap`].
+    Owned,
+    /// The communicator is borrowed from external code (PyTorch, etc.).
+    /// Set by [`NcclCollectives::from_borrowed`]; it is never destroyed here.
+    Borrowed,
+}
+
+/// CUDA stream used for NCCL calls, either owned by us or lent by the caller.
+enum NcclStream {
+    /// A `CudaStream` we hold ourselves, so we control its lifetime.
+    Owned(Arc<CudaStream>),
+    /// A raw stream pointer whose lifetime is controlled by the caller.
+    Borrowed(CUstream),
+}
+
+impl NcclStream {
+    /// Return the raw CUDA stream pointer to pass to NCCL calls.
+    fn raw(&self) -> CUstream {
+        match *self {
+            Self::Owned(ref owned) => owned.cu_stream(),
+            Self::Borrowed(raw) => raw,
+        }
+    }
+
+    /// Return the owned stream (needed to record events), or `None` when the
+    /// stream is borrowed.
+    fn as_owned(&self) -> Option<&Arc<CudaStream>> {
+        if let Self::Owned(owned) = self {
+            Some(owned)
+        } else {
+            None
+        }
+    }
+}
+
+/// NCCL-based collective operations for GPU-to-GPU communication.
+///
+/// This implementation uses NVIDIA NCCL for efficient broadcast operations
+/// across GPUs. It supports both owned communicators (created via bootstrap)
+/// and borrowed communicators (from PyTorch, vLLM, etc.).
+///
+/// # Performance
+///
+/// Broadcast operations use NCCL groups to batch multiple memory region
+/// transfers into a single collective operation, minimizing synchronization
+/// overhead.
+///
+/// # Completion notifications
+///
+/// With an owned stream, completion is tracked by a CUDA event recorded on
+/// that stream. With a borrowed stream, the notification completes
+/// immediately and the caller must synchronize with the stream itself.
+pub struct NcclCollectives {
+    /// NCCL communicator handle
+    comm: ncclComm_t,
+
+    /// Whether we own the communicator (and must destroy it on drop)
+    ownership: CommOwnership,
+
+    /// Rank of this worker in the collective group
+    rank: usize,
+
+    /// Total number of workers in the collective group
+    world_size: usize,
+
+    /// CUDA stream for NCCL operations (owned or borrowed)
+    nccl_stream: NcclStream,
+
+    /// CUDA context for stream/event management (only used for owned mode)
+    ///
+    /// The field is never read after construction, hence the `dead_code`
+    /// allowance. NOTE(review): presumably retained to keep the context alive
+    /// for the lifetime of the stream - confirm.
+    #[allow(dead_code)]
+    cuda_context: Arc<CudaContext>,
+
+    /// Event system for completion notifications (used for borrowed stream fallback)
+    event_system: EventManager,
+
+    /// CUDA event registrar for efficient completion notification
+    event_registrar: Arc<dyn CudaEventRegistrar>,
+
+    /// Layout resolver for mapping logical handles to physical layouts
+    layout_resolver: Arc<dyn LayoutResolver>,
+}
+
+impl NcclCollectives {
+    // =========================================================================
+    // Path A: Create from scratch (used by tests, standalone Rust apps)
+    // =========================================================================
+
+    /// Build an instance that creates and owns a fresh NCCL communicator.
+    ///
+    /// A new CUDA stream is created on `cuda_context` for NCCL work, then the
+    /// communicator is initialized from `bootstrap`. This is a **collective
+    /// operation** - all ranks must call simultaneously with the same
+    /// bootstrap data for initialization to succeed.
+    ///
+    /// # Arguments
+    /// * `bootstrap` - Bootstrap data containing the NCCL unique ID
+    /// * `rank` - The rank of this worker (0 to world_size-1)
+    /// * `cuda_context` - CUDA context for stream management
+    /// * `event_system` - Event system for fallback completion notifications
+    /// * `event_registrar` - Registrar for efficient CUDA event completion polling
+    /// * `layout_resolver` - Resolver for mapping logical handles to physical layouts
+    ///
+    /// # Returns
+    /// A new `NcclCollectives` instance that owns its communicator and
+    /// destroys it on drop.
+    ///
+    /// # Errors
+    /// Returns an error if the stream cannot be created or NCCL
+    /// initialization fails.
+    pub fn from_bootstrap(
+        bootstrap: &NcclBootstrap,
+        rank: usize,
+        cuda_context: Arc<CudaContext>,
+        event_system: EventManager,
+        event_registrar: Arc<dyn CudaEventRegistrar>,
+        layout_resolver: Arc<dyn LayoutResolver>,
+    ) -> Result<Self> {
+        // Dedicated stream for all NCCL work issued by this instance.
+        let owned_stream = cuda_context
+            .new_stream()
+            .context("Failed to create NCCL stream")?;
+        let raw_stream = owned_stream.cu_stream();
+
+        let comm = bootstrap
+            .init_communicator(rank, raw_stream)
+            .context("Failed to initialize NCCL communicator")?;
+        let world_size = bootstrap.world_size();
+
+        Ok(Self {
+            comm,
+            ownership: CommOwnership::Owned,
+            rank,
+            world_size,
+            nccl_stream: NcclStream::Owned(owned_stream),
+            cuda_context,
+            event_system,
+            event_registrar,
+            layout_resolver,
+        })
+    }
+
+    // =========================================================================
+    // Path B: Borrow existing communicator (production use with Python/C/C++)
+    // =========================================================================
+
+    /// Create from borrowed NCCL handles passed from external code.
+    ///
+    /// This is the primary production path when the NCCL communicator is
+    /// initialized by Python (torch.distributed), C++, or another runtime.
+    ///
+    /// No validation is performed: the two integer handles are cast to their
+    /// pointer types and stored as given, and `rank` / `world_size` are not
+    /// cross-checked. The communicator is marked as borrowed, so dropping the
+    /// returned value does not call `ncclCommDestroy`.
+    ///
+    /// Because the stream is borrowed, completion notifications returned by
+    /// `broadcast` complete immediately; the caller must synchronize with the
+    /// stream itself.
+    ///
+    /// # Arguments
+    /// * `comm_ptr` - Raw pointer to `ncclComm_t` handle (cast to usize)
+    /// * `stream_ptr` - Raw pointer to `cudaStream_t` handle (cast to usize)
+    /// * `rank` - The rank of this worker in the collective group
+    /// * `world_size` - Total number of workers in the collective group
+    /// * `cuda_context` - CUDA context for event management
+    /// * `event_system` - Event system for fallback completion notifications
+    /// * `event_registrar` - Registrar for efficient CUDA event completion polling
+    /// * `layout_resolver` - Resolver for mapping logical handles to physical layouts
+    ///
+    /// # Safety
+    /// - `comm_ptr` must be a valid `ncclComm_t` handle
+    /// - `stream_ptr` must be a valid `cudaStream_t` handle
+    /// - The caller must ensure the handles outlive this struct
+    /// - The communicator must not be destroyed while this struct exists
+    ///
+    /// # FFI Example (Python via PyO3)
+    ///
+    /// Abbreviated sketch: it shows only the handle, rank and world-size
+    /// arguments, not the remaining parameters of this function.
+    /// ```python
+    /// # In Python
+    /// comm = torch.distributed.distributed_c10d._get_default_group()._get_backend()._nccl_comm
+    /// stream = torch.cuda.current_stream()
+    ///
+    /// # Pass to Rust
+    /// collectives = NcclCollectives.from_borrowed(
+    ///     comm_ptr=comm.as_int(),
+    ///     stream_ptr=stream.cuda_stream,
+    ///     rank=rank,
+    ///     world_size=world_size,
+    /// )
+    /// ```
+    ///
+    /// # FFI Example (C/C++)
+    /// ```c
+    /// // In C/C++
+    /// ncclComm_t comm;
+    /// ncclCommInitRank(&comm, world_size, id, rank);
+    /// cudaStream_t stream;
+    /// cudaStreamCreate(&stream);
+    ///
+    /// // Pass to Rust via FFI
+    /// nccl_collectives_from_borrowed((uintptr_t)comm, (uintptr_t)stream, rank, world_size);
+    /// ```
+    #[allow(clippy::too_many_arguments)]
+    pub unsafe fn from_borrowed(
+        comm_ptr: usize,
+        stream_ptr: usize,
+        rank: usize,
+        world_size: usize,
+        cuda_context: Arc<CudaContext>,
+        event_system: EventManager,
+        event_registrar: Arc<dyn CudaEventRegistrar>,
+        layout_resolver: Arc<dyn LayoutResolver>,
+    ) -> Self {
+        Self {
+            // Reinterpret the integer as the raw communicator handle; its
+            // validity is the caller's obligation (see `# Safety`).
+            comm: comm_ptr as ncclComm_t,
+            // Borrowed: the Drop impl will not destroy the communicator.
+            ownership: CommOwnership::Borrowed,
+            rank,
+            world_size,
+            // Reinterpret the integer as the raw CUDA stream handle.
+            nccl_stream: NcclStream::Borrowed(stream_ptr as CUstream),
+            cuda_context,
+            event_system,
+            event_registrar,
+            layout_resolver,
+        }
+    }
+
+    /// Broadcast memory regions using NCCL grouped operations.
+    ///
+    /// All regions are queued between `ncclGroupStart` and `ncclGroupEnd` on
+    /// this instance's NCCL stream, as byte-level (`ncclChar`) in-place
+    /// broadcasts. An empty `regions` slice is a no-op.
+    ///
+    /// Once the group has been opened it is always closed again, even when
+    /// queuing a broadcast fails. Returning early would leave the group open
+    /// on this thread.
+    ///
+    /// # Arguments
+    /// * `regions` - Slice of (ptr, size) pairs for memory regions to broadcast
+    /// * `root` - Root rank for the broadcast
+    ///
+    /// # Errors
+    /// Returns an error if any NCCL operation fails. If both queuing and
+    /// closing the group fail, the queuing error is reported.
+    fn broadcast_regions(&self, regions: &[(usize, usize)], root: i32) -> Result<()> {
+        if regions.is_empty() {
+            return Ok(());
+        }
+
+        let stream = self.nccl_stream.raw();
+
+        // Start NCCL group - batches operations for efficiency
+        // SAFETY: ncclGroupStart takes no arguments; it is paired with the
+        // unconditional ncclGroupEnd below.
+        let result = unsafe { ncclGroupStart() };
+        check_nccl_result(result).context("ncclGroupStart failed")?;
+
+        // Queue all broadcasts within the group, stopping at the first failure.
+        let queued = regions.iter().try_for_each(|&(ptr, size)| {
+            // SAFETY: We're calling NCCL with valid pointers within a group operation.
+            // The stream cast is safe because both cudarc::driver::sys::CUstream and
+            // cudarc::nccl::sys::CUstream are the same underlying CUDA type (*mut CUstream_st).
+            let result = unsafe {
+                ncclBcast(
+                    ptr as *mut std::ffi::c_void,
+                    size,
+                    ncclDataType_t::ncclChar, // byte-level transfer
+                    root,
+                    self.comm,
+                    stream.cast(),
+                )
+            };
+            check_nccl_result(result).context("ncclBcast failed")
+        });
+
+        // End group - submits all queued ops to GPU. This runs even if queuing
+        // failed, so the group opened above never stays open.
+        // SAFETY: matches the successful ncclGroupStart above.
+        let result = unsafe { ncclGroupEnd() };
+        let ended = check_nccl_result(result).context("ncclGroupEnd failed");
+
+        // Prefer the queuing error (the root cause) over the group-end error.
+        queued.and(ended)
+    }
+
+    /// Collect memory regions for a set of blocks and layers.
+    ///
+    /// Regions are emitted in block-major order: for each block, for each
+    /// layer in the range, for each outer index in `0..outer_dim`.
+    ///
+    /// # Arguments
+    /// * `layout` - Physical layout to query
+    /// * `block_ids` - Block IDs to collect regions for
+    /// * `layer_range` - Range of layers to include (None = all layers)
+    ///
+    /// # Returns
+    /// Vector of (address, size) pairs for the requested regions. An empty or
+    /// reversed `layer_range` yields an empty vector.
+    ///
+    /// # Errors
+    /// Propagates any error from `layout.memory_region`.
+    fn collect_regions(
+        &self,
+        layout: &PhysicalLayout,
+        block_ids: &[BlockId],
+        layer_range: Option<Range<usize>>,
+    ) -> Result<Vec<(usize, usize)>> {
+        let num_layers = layout.layout().num_layers();
+        let outer_dim = layout.layout().outer_dim();
+
+        let layer_range = layer_range.unwrap_or(0..num_layers);
+
+        // `Range::len` is 0 for a reversed range (start > end). The former
+        // `end - start` underflowed there: a panic in debug builds, an absurd
+        // capacity request in release builds.
+        let layers_per_block = layer_range.len();
+
+        let mut regions = Vec::with_capacity(block_ids.len() * layers_per_block * outer_dim);
+
+        for &block_id in block_ids {
+            for layer_id in layer_range.clone() {
+                for outer_id in 0..outer_dim {
+                    let region = layout.memory_region(block_id, layer_id, outer_id)?;
+                    regions.push((region.addr, region.size));
+                }
+            }
+        }
+
+        Ok(regions)
+    }
+
+    /// Produce the notification that signals completion of the work queued on
+    /// the NCCL stream.
+    ///
+    /// Owned stream: records a CUDA event on it and hands the event to the
+    /// registrar. Borrowed stream: logs a warning and returns a notification
+    /// that is already complete; the caller must synchronize with the stream.
+    fn create_completion_notification(&self) -> Result<TransferCompleteNotification> {
+        match self.nccl_stream.as_owned() {
+            Some(owned_stream) => {
+                // The event marks the point after everything queued so far;
+                // the registrar polls it in the background.
+                let recorded = owned_stream
+                    .record_event(None)
+                    .context("Failed to record CUDA event")?;
+                Ok(self.event_registrar.register_cuda_event(recorded))
+            }
+            None => {
+                // No event is recorded on a stream we do not own, so the
+                // notification is triggered before it is handed out.
+                tracing::warn!(
+                    "Using borrowed stream - returning immediate completion. \
+                     Caller must ensure stream synchronization."
+                );
+
+                let event = self.event_system.new_event()?;
+                let event_handle = event.handle();
+                event.trigger()?;
+                let awaiter = self.event_system.awaiter(event_handle)?;
+                Ok(TransferCompleteNotification::from_awaiter(awaiter))
+            }
+        }
+    }
+}
+
+impl CollectiveOps for NcclCollectives {
+    /// Broadcast blocks from rank 0 to every other rank via NCCL.
+    ///
+    /// Rank 0 is always the root. It contributes the regions of
+    /// `src_block_ids` in the `src` layout; every other rank receives into the
+    /// regions of `dst_block_ids` in the `dst` layout.
+    ///
+    /// Only the layout this rank actually uses is resolved. A rank that lacks
+    /// the other layout (for example a non-root rank without the root's
+    /// source tier) therefore does not fail the collective.
+    ///
+    /// # Errors
+    /// Returns an error if the layout cannot be resolved, a memory region
+    /// cannot be looked up, an NCCL call fails, or the completion
+    /// notification cannot be created.
+    fn broadcast(
+        &self,
+        src: LogicalLayoutHandle,
+        dst: LogicalLayoutHandle,
+        src_block_ids: &[BlockId],
+        dst_block_ids: &[BlockId],
+        layer_range: Option<Range<usize>>,
+    ) -> Result<TransferCompleteNotification> {
+        // For broadcast, rank 0 uses src, other ranks use dst
+        let (handle, block_ids) = if self.rank == 0 {
+            (src, src_block_ids)
+        } else {
+            (dst, dst_block_ids)
+        };
+
+        // Resolve only the layout this rank reads from or writes to.
+        let layout = self.layout_resolver.resolve_layout(handle)?;
+
+        // Collect memory regions for the broadcast
+        let regions = self.collect_regions(&layout, block_ids, layer_range)?;
+
+        tracing::debug!(
+            rank = self.rank,
+            world_size = self.world_size,
+            num_regions = regions.len(),
+            total_bytes = regions.iter().map(|(_, size)| size).sum::<usize>(),
+            "Starting NCCL broadcast"
+        );
+
+        // Execute grouped broadcast (rank 0 is always root for broadcast)
+        self.broadcast_regions(&regions, 0)?;
+
+        // Create completion notification
+        self.create_completion_notification()
+    }
+
+    /// Rank of this worker in the collective group.
+    fn rank(&self) -> usize {
+        self.rank
+    }
+
+    /// Total number of workers in the collective group.
+    fn world_size(&self) -> usize {
+        self.world_size
+    }
+}
+
+impl Drop for NcclCollectives {
+    /// Destroy the communicator if this instance owns it; a borrowed
+    /// communicator is left untouched. A failed destroy is logged, not
+    /// propagated.
+    fn drop(&mut self) {
+        match self.ownership {
+            // Lifetime is managed by whoever lent us the handle.
+            CommOwnership::Borrowed => {}
+            CommOwnership::Owned => {
+                // SAFETY: We own this communicator and it's valid
+                let status = unsafe { ncclCommDestroy(self.comm) };
+                if let Err(e) = check_nccl_result(status) {
+                    tracing::warn!("Failed to destroy NCCL communicator: {:?}", e);
+                }
+            }
+        }
+    }
+}
+
+// SAFETY: NcclCollectives can be sent between threads.
+// The NCCL communicator itself is thread-safe when operations use
+// the same stream (which we guarantee by having a dedicated stream).
+//
+// These manual impls are needed because the struct stores raw pointers
+// (`ncclComm_t`, and `CUstream` in the borrowed case), which are not
+// automatically `Send`/`Sync`.
+unsafe impl Send for NcclCollectives {}
+
+// SAFETY: NcclCollectives can be shared between threads.
+// All mutable state is behind Arc or atomic operations, and NCCL
+// operations are thread-safe when using the same stream.
+//
+// NOTE(review): nothing in this type serializes concurrent `broadcast` calls
+// on the shared communicator, and in borrowed mode the stream is supplied by
+// the caller rather than dedicated. Confirm against the NCCL thread-safety
+// guidance that concurrent use from several threads is actually sound.
+unsafe impl Sync for NcclCollectives {}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use cudarc::driver::{CudaContext, CudaSlice, DevicePtr};
+    use cudarc::nccl::sys::{ncclCommDestroy, ncclCommInitAll};
+    use std::ffi::c_int;
+    use std::sync::{Arc, Barrier};
+    use std::thread;
+
+    /// Number of CUDA devices reported by `CudaContext::device_count`.
+    ///
+    /// A failed query is reported as `0`, which lets the tests below take
+    /// their "not enough GPUs" skip path instead of panicking.
+    fn cuda_device_count() -> usize {
+        CudaContext::device_count().map_or(0, |count| count as usize)
+    }
+
+    /// Create one NCCL communicator per device with a single `ncclCommInitAll` call.
+    ///
+    /// Devices `0..num_devices` are used, in order. The returned handles are the
+    /// raw communicator pointers cast to `usize` so they can be moved into
+    /// spawned threads.
+    ///
+    /// # Errors
+    /// Returns an error (with context `"ncclCommInitAll failed"`) when NCCL
+    /// reports a non-success status.
+    ///
+    /// # Safety
+    /// Calls into the NCCL C API. The returned handles are raw pointers in
+    /// disguise; the caller is responsible for destroying them.
+    unsafe fn init_all_comms(num_devices: usize) -> Result<Vec<usize>> {
+        let device_count = num_devices as c_int;
+        let device_ids: Vec<c_int> = (0..device_count).collect();
+        let mut raw_comms: Vec<ncclComm_t> = vec![std::ptr::null_mut(); num_devices];
+
+        // SAFETY: ncclCommInitAll is safe to call with valid pointers
+        let status = unsafe {
+            ncclCommInitAll(raw_comms.as_mut_ptr(), device_count, device_ids.as_ptr())
+        };
+        check_nccl_result(status).context("ncclCommInitAll failed")?;
+
+        // Raw pointers are not `Send`; hand them out as plain integers.
+        let mut handles = Vec::with_capacity(raw_comms.len());
+        for comm in raw_comms {
+            handles.push(comm as usize);
+        }
+        Ok(handles)
+    }
+
+    /// Destroy every communicator in `comms`, ignoring the status NCCL returns.
+    ///
+    /// # Safety
+    /// Each element is cast back to `ncclComm_t` and passed to
+    /// `ncclCommDestroy`, so every value must be a handle that is still live.
+    unsafe fn destroy_comms(comms: &[usize]) {
+        comms.iter().copied().for_each(|handle| {
+            // SAFETY: Converting back from usize and destroying
+            let _ = unsafe { ncclCommDestroy(handle as ncclComm_t) };
+        });
+    }
+
+    /// Returns the device address of `slice` as a plain `usize`.
+    ///
+    /// `device_ptr` hands back the pointer together with a guard value; only
+    /// the address is returned, and the guard is released when this helper exits.
+    fn get_device_ptr(slice: &CudaSlice<u8>, stream: &CudaStream) -> usize {
+        let ptr_and_guard = slice.device_ptr(stream);
+        ptr_and_guard.0 as usize
+    }
+
+    // NOTE: These NCCL tests require a full NCCL installation with all symbols.
+    // Some stripped NCCL builds (e.g., Lambda Labs' 2.26.2-0lambda1) are missing
+    // ncclAlltoAll, ncclGather, ncclScatter, etc. which cudarc requires.
+    // If tests fail with "undefined symbol: ncclAlltoAll", install official NVIDIA NCCL.
+
+    /// Broadcasts a 1 MB buffer from rank 0 to rank 1 with a raw `ncclBcast`
+    /// call per rank (one thread each) and checks that both device buffers end
+    /// up holding the test pattern.
+    ///
+    /// Returns early (without failing) when fewer than two CUDA devices are
+    /// reported.
+    #[test]
+    #[cfg(feature = "testing-nccl")]
+    fn test_nccl_broadcast_multi_gpu_raw() {
+        // Skip if < 2 GPUs available
+        let num_devices = cuda_device_count();
+        if num_devices < 2 {
+            println!(
+                "Skipping test: {} GPUs available, need at least 2",
+                num_devices
+            );
+            return;
+        }
+
+        // Use 2 GPUs for the test
+        let world_size = 2;
+        println!("Testing NCCL broadcast with {} GPUs", world_size);
+
+        // Initialize all communicators at once (single-process pattern)
+        let comms = unsafe { init_all_comms(world_size) }.expect("Failed to init NCCL comms");
+
+        // Create CUDA contexts and streams for each device
+        let contexts: Vec<Arc<CudaContext>> = (0..world_size)
+            .map(|i| CudaContext::new(i).expect("Failed to create CUDA context"))
+            .collect();
+
+        let streams: Vec<Arc<CudaStream>> = contexts
+            .iter()
+            .map(|ctx| ctx.new_stream().expect("Failed to create stream"))
+            .collect();
+
+        // Test data
+        let test_size = 1024 * 1024; // 1 MB
+        let test_pattern: u8 = 0xAB;
+
+        // Allocate one device buffer per stream, initialized from a zeroed host vector
+        let buffers: Vec<CudaSlice<u8>> = streams
+            .iter()
+            .map(|stream| {
+                // Allocate zeroed buffer
+                let zeros = vec![0u8; test_size];
+                stream
+                    .clone_htod(&zeros)
+                    .expect("Failed to allocate buffer")
+            })
+            .collect();
+
+        // Fill rank 0's buffer with test pattern: upload the pattern into a
+        // temporary device buffer, then copy device-to-device into buffers[0].
+        {
+            let host_data = vec![test_pattern; test_size];
+            let buffer = streams[0]
+                .clone_htod(&host_data)
+                .expect("Failed to copy to device 0");
+            // Copy to actual buffer location
+            let src_ptr = get_device_ptr(&buffer, &streams[0]);
+            let dst_ptr = get_device_ptr(&buffers[0], &streams[0]);
+            unsafe {
+                cudarc::driver::result::memcpy_dtod_async(
+                    dst_ptr as u64,
+                    src_ptr as u64,
+                    test_size,
+                    streams[0].cu_stream(),
+                )
+                .expect("dtod copy failed");
+            }
+            // Wait for the async copy before the temporary buffer goes out of scope
+            streams[0].synchronize().expect("sync failed");
+        }
+
+        // Get buffer pointers before spawning threads (to avoid lifetime issues)
+        let buffer_ptrs: Vec<usize> = buffers
+            .iter()
+            .zip(streams.iter())
+            .map(|(buf, stream)| get_device_ptr(buf, stream))
+            .collect();
+
+        // Synchronization barrier for threads
+        let barrier = Arc::new(Barrier::new(world_size));
+
+        // Spawn one thread per rank to perform the broadcast
+        let handles: Vec<_> = (0..world_size)
+            .map(|rank| {
+                let comm = comms[rank]; // Already usize, which is Send
+                let stream = streams[rank].clone();
+                let buffer_ptr = buffer_ptrs[rank];
+                let barrier = barrier.clone();
+
+                thread::spawn(move || {
+                    // Wait for all threads to be ready
+                    barrier.wait();
+
+                    // Perform broadcast (rank 0 is root); each rank passes its own buffer
+                    let result = unsafe {
+                        ncclBcast(
+                            buffer_ptr as *mut std::ffi::c_void,
+                            test_size,
+                            ncclDataType_t::ncclChar,
+                            0,                  // root rank
+                            comm as ncclComm_t, // Convert back to ncclComm_t
+                            stream.cu_stream().cast(),
+                        )
+                    };
+
+                    check_nccl_result(result).expect("ncclBcast failed");
+
+                    // Synchronize stream
+                    stream.synchronize().expect("Stream sync failed");
+
+                    println!("Rank {} completed broadcast", rank);
+                })
+            })
+            .collect();
+
+        // Wait for all threads to complete
+        for handle in handles {
+            handle.join().expect("Thread panicked");
+        }
+
+        // Verify all buffers have the test pattern
+        for (rank, (stream, buffer)) in streams.iter().zip(buffers.iter()).enumerate() {
+            let host_data = stream
+                .clone_dtoh(buffer)
+                .expect("Failed to copy from device");
+
+            // Spot-check the first, last, and middle bytes
+            assert_eq!(
+                host_data[0], test_pattern,
+                "Rank {} first byte mismatch",
+                rank
+            );
+            assert_eq!(
+                host_data[test_size - 1],
+                test_pattern,
+                "Rank {} last byte mismatch",
+                rank
+            );
+            assert_eq!(
+                host_data[test_size / 2],
+                test_pattern,
+                "Rank {} middle byte mismatch",
+                rank
+            );
+
+            // Verify all bytes
+            let mismatch_count = host_data.iter().filter(|&&b| b != test_pattern).count();
+            assert_eq!(
+                mismatch_count, 0,
+                "Rank {} has {} mismatched bytes",
+                rank, mismatch_count
+            );
+
+            println!("Rank {} verified: all {} bytes correct", rank, test_size);
+        }
+
+        // Clean up (only reached when every assertion above passed)
+        unsafe { destroy_comms(&comms) };
+        println!("Test passed!");
+    }
+
+    /// Broadcasts four 256 KB regions from rank 0 to rank 1 inside a single
+    /// `ncclGroupStart`/`ncclGroupEnd` pair per rank, then checks that every
+    /// region on every rank holds its region-specific pattern.
+    ///
+    /// Returns early (without failing) when fewer than two CUDA devices are
+    /// reported.
+    #[test]
+    #[cfg(feature = "testing-nccl")]
+    fn test_nccl_grouped_broadcast_multi_gpu() {
+        // Skip if < 2 GPUs available
+        let num_devices = cuda_device_count();
+        if num_devices < 2 {
+            println!(
+                "Skipping test: {} GPUs available, need at least 2",
+                num_devices
+            );
+            return;
+        }
+
+        // Use 2 GPUs for the test
+        let world_size = 2;
+        println!("Testing NCCL grouped broadcast with {} GPUs", world_size);
+
+        // Initialize all communicators at once
+        let comms = unsafe { init_all_comms(world_size) }.expect("Failed to init NCCL comms");
+
+        // Create CUDA contexts and streams
+        let contexts: Vec<Arc<CudaContext>> = (0..world_size)
+            .map(|i| CudaContext::new(i).expect("Failed to create CUDA context"))
+            .collect();
+
+        let streams: Vec<Arc<CudaStream>> = contexts
+            .iter()
+            .map(|ctx| ctx.new_stream().expect("Failed to create stream"))
+            .collect();
+
+        // Test multiple regions (simulating multiple blocks)
+        let num_regions = 4;
+        let region_size = 256 * 1024; // 256 KB per region
+
+        // Allocate multiple buffers per device, each initialized from zeroed host data
+        let buffers: Vec<Vec<CudaSlice<u8>>> = streams
+            .iter()
+            .map(|stream| {
+                (0..num_regions)
+                    .map(|_| {
+                        let zeros = vec![0u8; region_size];
+                        stream.clone_htod(&zeros).expect("Failed to allocate")
+                    })
+                    .collect()
+            })
+            .collect();
+
+        // Fill rank 0's buffers with different patterns (0x11, 0x22, 0x33, 0x44)
+        for (region_idx, buffer) in buffers[0].iter().enumerate() {
+            let pattern = (region_idx + 1) as u8 * 0x11; // Different pattern per region
+            let host_data = vec![pattern; region_size];
+            let src_buffer = streams[0]
+                .clone_htod(&host_data)
+                .expect("Failed to allocate src");
+
+            let src_ptr = get_device_ptr(&src_buffer, &streams[0]);
+            let dst_ptr = get_device_ptr(buffer, &streams[0]);
+            unsafe {
+                cudarc::driver::result::memcpy_dtod_async(
+                    dst_ptr as u64,
+                    src_ptr as u64,
+                    region_size,
+                    streams[0].cu_stream(),
+                )
+                .expect("dtod copy failed");
+            }
+            // NOTE(review): `src_buffer` is dropped at the end of this iteration
+            // while the copy above has only been enqueued; the stream is not
+            // synchronized until after the loop. Presumably the free is ordered
+            // after the copy on the same stream — confirm.
+        }
+        streams[0].synchronize().expect("sync failed");
+
+        // Synchronization barrier
+        let barrier = Arc::new(Barrier::new(world_size));
+
+        // Collect buffer pointers for each rank (as usize for Send)
+        let buffer_ptrs: Vec<Vec<usize>> = buffers
+            .iter()
+            .zip(streams.iter())
+            .map(|(rank_buffers, stream)| {
+                rank_buffers
+                    .iter()
+                    .map(|b| get_device_ptr(b, stream))
+                    .collect()
+            })
+            .collect();
+
+        // Spawn one thread per rank for the grouped broadcast
+        let handles: Vec<_> = (0..world_size)
+            .map(|rank| {
+                let comm = comms[rank]; // Already usize, which is Send
+                let stream = streams[rank].clone();
+                let ptrs = buffer_ptrs[rank].clone();
+                let barrier = barrier.clone();
+
+                thread::spawn(move || {
+                    // Wait until every rank's thread is ready
+                    barrier.wait();
+
+                    // Use NCCL group for multiple broadcasts: one ncclBcast per
+                    // region, all issued between ncclGroupStart and ncclGroupEnd
+                    unsafe {
+                        check_nccl_result(ncclGroupStart()).expect("ncclGroupStart failed");
+
+                        for ptr in &ptrs {
+                            let result = ncclBcast(
+                                *ptr as *mut std::ffi::c_void,
+                                region_size,
+                                ncclDataType_t::ncclChar,
+                                0,
+                                comm as ncclComm_t,
+                                stream.cu_stream().cast(),
+                            );
+                            check_nccl_result(result).expect("ncclBcast failed");
+                        }
+
+                        check_nccl_result(ncclGroupEnd()).expect("ncclGroupEnd failed");
+                    }
+
+                    stream.synchronize().expect("Stream sync failed");
+                    println!("Rank {} completed grouped broadcast", rank);
+                })
+            })
+            .collect();
+
+        for handle in handles {
+            handle.join().expect("Thread panicked");
+        }
+
+        // Verify all ranks have correct data (every byte of every region)
+        for (rank, (stream, rank_buffers)) in streams.iter().zip(buffers.iter()).enumerate() {
+            for (region_idx, buffer) in rank_buffers.iter().enumerate() {
+                // Same formula that was used to fill rank 0's regions
+                let expected_pattern = (region_idx + 1) as u8 * 0x11;
+                let host_data = stream
+                    .clone_dtoh(buffer)
+                    .expect("Failed to copy from device");
+
+                let mismatch_count = host_data.iter().filter(|&&b| b != expected_pattern).count();
+                assert_eq!(
+                    mismatch_count, 0,
+                    "Rank {} region {} has {} mismatched bytes (expected 0x{:02x})",
+                    rank, region_idx, mismatch_count, expected_pattern
+                );
+            }
+            println!(
+                "Rank {} verified: all {} regions correct",
+                rank, num_regions
+            );
+        }
+
+        // Clean up (only reached when every assertion above passed)
+        unsafe { destroy_comms(&comms) };
+        println!("Grouped broadcast test passed!");
+    }
+
+    /// Broadcasts a single 64 MB buffer from rank 0 to rank 1, prints the
+    /// elapsed time and a throughput figure, and verifies rank 1's copy
+    /// byte-for-byte.
+    ///
+    /// Returns early (without failing) when fewer than two CUDA devices are
+    /// reported.
+    #[test]
+    #[cfg(feature = "testing-nccl")]
+    fn test_nccl_broadcast_large_transfer() {
+        // Skip if < 2 GPUs available
+        let num_devices = cuda_device_count();
+        if num_devices < 2 {
+            println!(
+                "Skipping test: {} GPUs available, need at least 2",
+                num_devices
+            );
+            return;
+        }
+
+        let world_size = 2;
+        println!("Testing NCCL large broadcast with {} GPUs", world_size);
+
+        let comms = unsafe { init_all_comms(world_size) }.expect("Failed to init NCCL comms");
+
+        let contexts: Vec<Arc<CudaContext>> = (0..world_size)
+            .map(|i| CudaContext::new(i).expect("Failed to create CUDA context"))
+            .collect();
+
+        let streams: Vec<Arc<CudaStream>> = contexts
+            .iter()
+            .map(|ctx| ctx.new_stream().expect("Failed to create stream"))
+            .collect();
+
+        // Large transfer: 64 MB (typical KV cache block size)
+        let test_size = 64 * 1024 * 1024;
+        println!("Transfer size: {} MB", test_size / (1024 * 1024));
+
+        // Allocate buffers, each initialized from zeroed host data
+        let buffers: Vec<CudaSlice<u8>> = streams
+            .iter()
+            .map(|stream| {
+                let zeros = vec![0u8; test_size];
+                stream.clone_htod(&zeros).expect("Failed to allocate")
+            })
+            .collect();
+
+        // Fill rank 0 with a deterministic repeating ramp: byte i holds i % 256
+        {
+            let host_data: Vec<u8> = (0..test_size).map(|i| (i % 256) as u8).collect();
+            let src_buffer = streams[0]
+                .clone_htod(&host_data)
+                .expect("Failed to copy to device 0");
+
+            let src_ptr = get_device_ptr(&src_buffer, &streams[0]);
+            let dst_ptr = get_device_ptr(&buffers[0], &streams[0]);
+            unsafe {
+                cudarc::driver::result::memcpy_dtod_async(
+                    dst_ptr as u64,
+                    src_ptr as u64,
+                    test_size,
+                    streams[0].cu_stream(),
+                )
+                .expect("dtod copy failed");
+            }
+            // Wait for the async copy before the staging buffer goes out of scope
+            streams[0].synchronize().expect("sync failed");
+        }
+
+        // Get buffer pointers
+        let buffer_ptrs: Vec<usize> = buffers
+            .iter()
+            .zip(streams.iter())
+            .map(|(buf, stream)| get_device_ptr(buf, stream))
+            .collect();
+
+        let barrier = Arc::new(Barrier::new(world_size));
+
+        // The timed window starts here, so it covers thread spawn, the barrier
+        // wait, the broadcast, the stream sync, and the joins below.
+        let start = std::time::Instant::now();
+
+        // Spawn one thread per rank for the large transfer
+        let handles: Vec<_> = (0..world_size)
+            .map(|rank| {
+                let comm = comms[rank]; // Already usize, which is Send
+                let stream = streams[rank].clone();
+                let buffer_ptr = buffer_ptrs[rank];
+                let barrier = barrier.clone();
+
+                thread::spawn(move || {
+                    barrier.wait();
+
+                    // Rank 0 is the root; each rank passes its own buffer
+                    let result = unsafe {
+                        ncclBcast(
+                            buffer_ptr as *mut std::ffi::c_void,
+                            test_size,
+                            ncclDataType_t::ncclChar,
+                            0,
+                            comm as ncclComm_t,
+                            stream.cu_stream().cast(),
+                        )
+                    };
+                    check_nccl_result(result).expect("ncclBcast failed");
+                    stream.synchronize().expect("Stream sync failed");
+                })
+            })
+            .collect();
+
+        for handle in handles {
+            handle.join().expect("Thread panicked");
+        }
+
+        let elapsed = start.elapsed();
+        // Bytes are divided by 1024^3, so the printed figure is GiB per second
+        // even though the label below reads "GB/s".
+        let throughput_gbps =
+            (test_size as f64 / (1024.0 * 1024.0 * 1024.0)) / elapsed.as_secs_f64();
+        println!(
+            "Transfer completed in {:?} ({:.2} GB/s)",
+            elapsed, throughput_gbps
+        );
+
+        // Verify data on rank 1
+        {
+            let host_data = streams[1]
+                .clone_dtoh(&buffers[1])
+                .expect("Failed to copy from device 1");
+
+            // Spot-check five indices first (start, quarters, end)
+            let samples = [
+                0,
+                test_size / 4,
+                test_size / 2,
+                test_size * 3 / 4,
+                test_size - 1,
+            ];
+            for &idx in &samples {
+                let expected = (idx % 256) as u8;
+                assert_eq!(
+                    host_data[idx], expected,
+                    "Mismatch at index {}: expected {}, got {}",
+                    idx, expected, host_data[idx]
+                );
+            }
+
+            // Full verification: every byte is compared against the ramp
+            let mismatch_count = host_data
+                .iter()
+                .enumerate()
+                .filter(|(i, b)| **b != (*i % 256) as u8)
+                .count();
+            assert_eq!(
+                mismatch_count, 0,
+                "Found {} mismatched bytes",
+                mismatch_count
+            );
+        }
+
+        // Clean up (only reached when every assertion above passed)
+        unsafe { destroy_comms(&comms) };
+        println!("Large transfer test passed!");
+    }
+}
--- a/lib/kvbm-engine/src/collectives/stub.rs
+++ b/lib/kvbm-engine/src/collectives/stub.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Stub collective operations implementation.
+//!
+//! This module provides a no-op implementation of [`CollectiveOps`] for testing
+//! and single-worker scenarios where no actual collective communication is needed.
+
+use std::ops::Range;
+
+use anyhow::Result;
+use velo::EventManager;
+
+use crate::BlockId;
+use kvbm_common::LogicalLayoutHandle;
+use kvbm_physical::transfer::TransferCompleteNotification;
+
+use super::CollectiveOps;
+
+/// Stub collective operations implementation.
+///
+/// This implementation completes immediately without actually performing any
+/// collective communication. Use for testing or when collective operations
+/// are not yet implemented (e.g., before NCCL integration).
+///
+/// # Warning
+///
+/// This stub does NOT perform actual data transfer. Using it in production
+/// with `ReplicatedDataWorker` will result in incorrect behavior where
+/// non-rank-0 workers have uninitialized data.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// // NOTE(review): this import path does not match this file's location
+/// // (kvbm-engine/src/collectives/stub.rs) — confirm the current path.
+/// use kvbm::v2::distributed::collectives::StubCollectiveOps;
+///
+/// let collective = StubCollectiveOps::new(events, 0, 1);
+///
+/// // Operations complete immediately without data transfer
+/// let notification = collective.broadcast(
+///     LogicalLayoutHandle::G1,
+///     LogicalLayoutHandle::G1,
+///     &src_block_ids,
+///     &dst_block_ids,
+///     None,
+/// )?;
+/// ```
+pub struct StubCollectiveOps {
+    /// Event manager used to create the already-triggered completion event.
+    events: EventManager,
+    /// Rank reported by `rank()` and included in the warning log.
+    rank: usize,
+    /// World size reported by `world_size()` and included in the warning log.
+    world_size: usize,
+}
+
+impl StubCollectiveOps {
+    /// Build a stub that reports the given `rank` and `world_size`.
+    ///
+    /// # Arguments
+    /// * `events` - Event manager used to produce completion notifications
+    /// * `rank` - This worker's rank within the collective group
+    /// * `world_size` - Total number of workers in the collective group
+    pub fn new(events: EventManager, rank: usize, world_size: usize) -> Self {
+        Self {
+            events,
+            rank,
+            world_size,
+        }
+    }
+
+    /// Build a stub for the single-worker case: rank 0 in a world of size 1.
+    pub fn single_worker(events: EventManager) -> Self {
+        Self {
+            events,
+            rank: 0,
+            world_size: 1,
+        }
+    }
+}
+
+impl CollectiveOps for StubCollectiveOps {
+    /// No-op broadcast: logs a warning describing the request and returns a
+    /// notification backed by an event that has already been triggered.
+    ///
+    /// None of the arguments are used beyond being logged; no data is moved.
+    ///
+    /// # Errors
+    /// Propagates any error from creating the event, triggering it, or
+    /// obtaining its awaiter.
+    fn broadcast(
+        &self,
+        src: LogicalLayoutHandle,
+        dst: LogicalLayoutHandle,
+        src_block_ids: &[BlockId],
+        dst_block_ids: &[BlockId],
+        layer_range: Option<Range<usize>>,
+    ) -> Result<TransferCompleteNotification> {
+        // Logged at warn level on every call so accidental use is visible.
+        tracing::warn!(
+            rank = self.rank,
+            world_size = self.world_size,
+            ?src,
+            ?dst,
+            num_src_blocks = src_block_ids.len(),
+            num_dst_blocks = dst_block_ids.len(),
+            ?layer_range,
+            "StubCollectiveOps::broadcast called - completing immediately without actual transfer"
+        );
+
+        // Create an event that's already triggered (immediate completion).
+        // The handle is taken before `trigger()` is called on the event.
+        let event = self.events.new_event()?;
+        let handle = event.handle();
+        event.trigger()?;
+
+        // Wrap an awaiter for the triggered event in the notification type.
+        let awaiter = self.events.awaiter(handle)?;
+        Ok(TransferCompleteNotification::from_awaiter(awaiter))
+    }
+
+    /// Returns the rank supplied at construction.
+    fn rank(&self) -> usize {
+        self.rank
+    }
+
+    /// Returns the world size supplied at construction.
+    fn world_size(&self) -> usize {
+        self.world_size
+    }
+}
--- a/lib/kvbm-engine/src/leader/accessor.rs
+++ b/lib/kvbm-engine/src/leader/accessor.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Block accessor for policy-based scanning.
+//!
+//! Provides a stateless interface for acquiring blocks from G2/G3 tiers.
+//! Designed for use with custom scanning policies that control iteration
+//! and can yield results incrementally.
+
+use crate::{BlockId, G2, G3, SequenceHash};
+use kvbm_common::LogicalLayoutHandle;
+use kvbm_logical::blocks::ImmutableBlock;
+
+use super::InstanceLeader;
+
+/// A block from either G2 or G3 tier.
+///
+/// Each variant owns an `ImmutableBlock` for its tier, so dropping the
+/// `TieredBlock` drops the wrapped block.
+/// Provides RAII ownership - blocks are released when dropped.
+#[derive(Debug)]
+pub enum TieredBlock {
+    /// Block from G2 (host memory) tier.
+    G2(ImmutableBlock<G2>),
+    /// Block from G3 (disk) tier.
+    G3(ImmutableBlock<G3>),
+}
+
+impl TieredBlock {
+    /// Layout handle naming the tier this block came from.
+    pub fn tier(&self) -> LogicalLayoutHandle {
+        match self {
+            Self::G2(_) => LogicalLayoutHandle::G2,
+            Self::G3(_) => LogicalLayoutHandle::G3,
+        }
+    }
+
+    /// Sequence hash of the wrapped block.
+    pub fn sequence_hash(&self) -> SequenceHash {
+        match self {
+            Self::G2(block) => block.sequence_hash(),
+            Self::G3(block) => block.sequence_hash(),
+        }
+    }
+
+    /// Block ID of the wrapped block.
+    pub fn block_id(&self) -> BlockId {
+        match self {
+            Self::G2(block) => block.block_id(),
+            Self::G3(block) => block.block_id(),
+        }
+    }
+
+    /// Position within the sequence, taken from the sequence hash (for ordering).
+    pub fn position(&self) -> u64 {
+        let hash = self.sequence_hash();
+        hash.position()
+    }
+
+    /// `true` when this block belongs to the G2 tier.
+    pub fn is_g2(&self) -> bool {
+        match self {
+            Self::G2(_) => true,
+            Self::G3(_) => false,
+        }
+    }
+
+    /// `true` when this block belongs to the G3 tier.
+    pub fn is_g3(&self) -> bool {
+        match self {
+            Self::G3(_) => true,
+            Self::G2(_) => false,
+        }
+    }
+
+    /// Consume `self` and return the G2 block, or `None` for a G3 block.
+    pub fn into_g2(self) -> Option<ImmutableBlock<G2>> {
+        if let Self::G2(block) = self {
+            Some(block)
+        } else {
+            None
+        }
+    }
+
+    /// Consume `self` and return the G3 block, or `None` for a G2 block.
+    pub fn into_g3(self) -> Option<ImmutableBlock<G3>> {
+        if let Self::G3(block) = self {
+            Some(block)
+        } else {
+            None
+        }
+    }
+}
+
+/// Stateless accessor for block acquisition.
+///
+/// Each method call is independent - no locks are held between calls.
+/// This enables parallel policy execution (e.g., with rayon).
+///
+/// # Thread Safety
+///
+/// `BlockAccessor` is `Send + Sync` because:
+/// - It only holds a shared reference to `InstanceLeader`
+/// - `InstanceLeader` contains `Arc<BlockManager<T>>` which is `Send + Sync`
+/// - All operations use internal locking per call
+/// - No mutable state is held between method calls
+pub struct BlockAccessor<'a> {
+    /// Leader whose G2 manager (and G3 manager, when present) is queried.
+    instance: &'a InstanceLeader,
+    /// Touch setting handed to `new` and returned unchanged by `touch()`.
+    touch: bool,
+}
+
+impl<'a> BlockAccessor<'a> {
+    /// Build an accessor over `instance` with the given touch setting.
+    pub(crate) fn new(instance: &'a InstanceLeader, touch: bool) -> Self {
+        BlockAccessor { instance, touch }
+    }
+
+    /// Look up `hash` in G2, falling back to G3 when a G3 manager exists.
+    ///
+    /// The lookup goes through `match_blocks`, which acquires the block; the
+    /// caller owns the returned block until it is dropped.
+    ///
+    /// Returns `None` when neither tier yields a block for `hash`.
+    pub fn find(&self, hash: SequenceHash) -> Option<TieredBlock> {
+        let key = [hash];
+
+        // G2 takes priority over G3.
+        let from_g2 = self
+            .instance
+            .g2_manager
+            .match_blocks(&key)
+            .into_iter()
+            .next();
+        if let Some(block) = from_g2 {
+            return Some(TieredBlock::G2(block));
+        }
+
+        // No G3 manager configured means there is nowhere else to look.
+        let g3 = self.instance.g3_manager.as_ref()?;
+        g3.match_blocks(&key)
+            .into_iter()
+            .next()
+            .map(TieredBlock::G3)
+    }
+
+    /// The touch setting this accessor was created with.
+    ///
+    /// When `true`, frequency tracking is updated on block access
+    /// (affects MultiLRU eviction priority).
+    pub fn touch(&self) -> bool {
+        self.touch
+    }
+}
+
+// Safety: BlockAccessor is Send + Sync because:
+// - It only holds a shared reference to InstanceLeader
+// - InstanceLeader contains Arc<BlockManager<T>> which is Send + Sync
+// - All operations use internal locking per call (RwLock in InactivePool)
+// - No mutable state is held between method calls
+//
+// NOTE(review): the struct's only fields are `&InstanceLeader` and `bool`, so
+// the compiler already derives `Send`/`Sync` whenever `InstanceLeader: Sync`.
+// These manual impls therefore only take effect if `InstanceLeader` is NOT
+// `Sync`, in which case they bypass that check — confirm whether they can be
+// replaced by a compile-time assertion.
+unsafe impl Send for BlockAccessor<'_> {}
+unsafe impl Sync for BlockAccessor<'_> {}
+
+/// Context for policy execution with result collection.
+///
+/// Provides access to the `BlockAccessor` for block lookups and
+/// `yield_item` / `yield_items` methods that append results to an internal
+/// `Vec<T>`.
+pub struct PolicyContext<'a, T> {
+    /// Accessor handed out by `accessor()`.
+    pub(crate) accessor: BlockAccessor<'a>,
+    /// Items yielded so far, in the order they were yielded.
+    pub(crate) results: Vec<T>,
+}
+
+impl<'a, T> PolicyContext<'a, T> {
+    /// Borrow the block accessor held by this context.
+    pub fn accessor(&self) -> &BlockAccessor<'a> {
+        let Self { accessor, .. } = self;
+        accessor
+    }
+
+    /// Record a single result item.
+    ///
+    /// The item is appended to the context's result vector.
+    pub fn yield_item(&mut self, item: T) {
+        let Self { results, .. } = self;
+        results.push(item);
+    }
+
+    /// Record several result items at once, preserving their iteration order.
+    pub fn yield_items(&mut self, items: impl IntoIterator<Item = T>) {
+        let sink = &mut self.results;
+        sink.extend(items);
+    }
+}
+
+// =============================================================================
+// TODO: Parallel policy support via rayon::scope
+//
+// Requirements to enable:
+// 1. Add `rayon` to Cargo.toml dependencies
+// 2. Ensure BlockAccessor is truly Send+Sync (verify internal locking is correct)
+// 3. Add feature flag `parallel` to gate this code
+// 4. Test thread-safety of concurrent BlockManager::match_blocks calls
+// 5. Benchmark to ensure parallel overhead is worth it (likely only for large hash sets)
+//
+// The design uses rayon::scope instead of par_chunks because:
+// - par_chunks could split across logical boundaries (e.g., middle of a contiguous run)
+// - rayon::scope lets the policy control parallelism granularity
+// - Policy can identify natural split points (e.g., gaps in position sequence)
+//
+// use std::sync::Mutex;
+// use rayon;
+//
+// /// Context for parallel policy execution.
+// /// Provides thread-safe result collection via Mutex.
+// pub struct ParallelPolicyContext<'a, 's, T> {
+//     pub(crate) accessor: &'a BlockAccessor<'a>,
+//     pub(crate) scope: &'s rayon::Scope<'s>,
+//     pub(crate) results: &'a Mutex<Vec<T>>,
+// }
+//
+// impl<'a, 's, T: Send> ParallelPolicyContext<'a, 's, T> {
+//     /// Get access to the block accessor.
+//     pub fn accessor(&self) -> &BlockAccessor<'a> {
+//         self.accessor
+//     }
+//
+//     /// Yield a result item (thread-safe).
+//     pub fn yield_item(&self, item: T) {
+//         self.results.lock().unwrap().push(item);
+//     }
+//
+//     /// Yield multiple result items (thread-safe, single lock acquisition).
+//     pub fn yield_items(&self, items: impl IntoIterator<Item = T>) {
+//         self.results.lock().unwrap().extend(items);
+//     }
+//
+//     /// Spawn parallel work within the rayon scope.
+//     ///
+//     /// The closure receives the accessor and results mutex, allowing it to
+//     /// perform lookups and yield items from a separate thread.
+//     ///
+//     /// # Example
+//     /// ```ignore
+//     /// ctx.spawn(|accessor, results| {
+//     ///     for hash in my_segment {
+//     ///         if let Some(block) = accessor.find(hash) {
+//     ///             results.lock().unwrap().push(block);
+//     ///         }
+//     ///     }
+//     /// });
+//     /// ```
+//     pub fn spawn<F>(&self, f: F)
+//     where
+//         F: FnOnce(&BlockAccessor, &Mutex<Vec<T>>) + Send + 'a,
+//     {
+//         let accessor = self.accessor;
+//         let results = self.results;
+//         self.scope.spawn(move |_| {
+//             f(accessor, results);
+//         });
+//     }
+// }
+// =============================================================================
--- a/lib/kvbm-engine/src/leader/instance.rs
+++ b/lib/kvbm-engine/src/leader/instance.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::collections::HashMap;
+
+use ::velo::Messenger;
+use anyhow::Result;
+use dashmap::DashMap;
+use tokio::sync::{Mutex, mpsc, watch};
+use uuid::Uuid;
+
+use std::sync::Arc;
+
+use crate::{
+    BlockId, G2, G3, InstanceId, SequenceHash, object::ObjectBlockOps, worker::RemoteDescriptor,
+};
+use kvbm_common::LogicalLayoutHandle;
+use kvbm_logical::{
+    blocks::{BlockRegistry, ImmutableBlock},
+    manager::BlockManager,
+};
+use kvbm_physical::transfer::{TransferCompleteNotification, TransferOptions};
+
+use kvbm_physical::manager::{LayoutHandle, SerializedLayout};
+
+use super::{
+    super::worker::Worker,
+    super::worker::group::{ParallelWorkers, SpmdParallelWorkers},
+    AsyncSessionResult,
+    FindMatchesOptions,
+    FindMatchesResult,
+    Leader,
+    OnboardingStatus,
+    ReadyResult,
+    // Legacy SessionHandle for deferred operations
+    SessionHandle as LegacySessionHandle,
+    SessionId,
+    StagingMode,
+    accessor::{BlockAccessor, PolicyContext},
+    session::{
+        BlockHolder, ControlRole, ControllableSessionOptions, ControllableSessionResult,
+        InitiatorSession, MessageTransport, OnboardMessage, OnboardSessionTx, ResponderSession,
+        ServerSession, ServerSessionHandle, ServerSessionOptions, SessionHandle, SessionMessage,
+        SessionMessageTx, SessionPhase, create_server_session, session_handle_state_channel,
+        session_message_channel,
+    },
+    velo::{ExportMetadataCallback, VeloLeaderService},
+};
+
+/// Primary leader implementation for the distributed KVBM system.
+///
+/// `InstanceLeader` coordinates block onboarding across local and remote
+/// instances. It owns a G2 (host memory) `BlockManager` and an optional G3
+/// (disk) `BlockManager`, a set of workers for executing physical transfers,
+/// and a parallel worker abstraction for multi-rank RDMA operations.
+///
+/// Key responsibilities:
+/// - **Block matching**: finding which requested sequence hashes are already
+///   cached locally (via `BlockAccessor` policies).
+/// - **Session management**: creating, attaching, and driving onboard sessions
+///   between endpoint (source) and controller (destination) roles.
+/// - **Remote connectivity**: exchanging serialized layout metadata with peer
+///   instances so workers can perform RDMA transfers.
+/// - **Velo RPC**: registering handlers via `VeloLeaderService` so remote
+///   leaders can initiate sessions and exchange metadata.
+///
+/// The type is `Clone`; every piece of mutable state below sits behind an
+/// `Arc`, so clones observe the same session maps and remote-leader list.
+#[derive(Clone)]
+pub struct InstanceLeader {
+    /// Velo `Messenger` used for distributed communication.
+    messenger: Arc<Messenger>,
+
+    /// Block registry for deduplication.
+    #[allow(dead_code)]
+    pub(crate) registry: BlockRegistry,
+
+    /// G2 (host memory) block manager (wrapped in Arc since BlockManager doesn't implement Clone).
+    pub(crate) g2_manager: Arc<BlockManager<G2>>,
+
+    /// Optional G3 (disk) block manager
+    pub(crate) g3_manager: Option<Arc<BlockManager<G3>>>,
+
+    /// Workers for executing transfers.
+    /// May be empty: the builder does not enforce a minimum, and in that case
+    /// `parallel_worker` is `None`.
+    workers: Vec<Arc<dyn Worker>>,
+
+    /// Parallel worker abstraction wrapping the workers.
+    /// Used for RDMA transfers with proper handle mapping storage.
+    /// `None` when the leader was built without any workers.
+    parallel_worker: Option<Arc<dyn ParallelWorkers>>,
+
+    /// Map of active sessions (session_id -> sender half of the onboard message channel).
+    sessions: Arc<DashMap<SessionId, OnboardSessionTx>>,
+
+    /// Cached worker metadata (avoids querying workers repeatedly).
+    /// When present, `register_handlers` serves this instead of calling
+    /// `export_metadata` on each worker.
+    cached_worker_metadata: Option<Vec<SerializedLayout>>,
+
+    /// Map of session states for holding blocks alive (RAII).
+    session_states: Arc<DashMap<SessionId, SessionState>>,
+
+    /// List of remote leader instance IDs (mutable for post-construction configuration).
+    remote_leaders: Arc<std::sync::RwLock<Vec<InstanceId>>>,
+
+    /// Message transport for session communication.
+    transport: Arc<MessageTransport>,
+
+    // ========================================================================
+    // Unified Session Protocol
+    // ========================================================================
+    /// Map of session message senders (session_id -> `SessionMessageTx`).
+    /// Used by SessionHandle/SessionEndpoint/ControllableSession.
+    session_sessions: Arc<DashMap<SessionId, SessionMessageTx>>,
+
+    // ========================================================================
+    // G4/Object Storage
+    // ========================================================================
+    /// Object storage client for G4 search and load operations.
+    /// Leader calls has_blocks on S3 directly, coordinates workers for get_blocks.
+    object_client: Option<Arc<dyn ObjectBlockOps>>,
+}
+
+/// Builder for InstanceLeader.
+///
+/// `messenger`, `registry` and `g2_manager` are mandatory; `build` returns an
+/// error when any of them is missing. Everything else is optional.
+#[derive(Default)]
+pub struct InstanceLeaderBuilder {
+    /// Messenger for distributed communication (required).
+    messenger: Option<Arc<Messenger>>,
+    /// Block registry (required).
+    registry: Option<BlockRegistry>,
+    /// G2 (host memory) block manager (required).
+    g2_manager: Option<Arc<BlockManager<G2>>>,
+    /// G3 (disk) block manager (optional).
+    g3_manager: Option<Arc<BlockManager<G3>>>,
+    /// Workers used to build the parallel worker; may be left empty.
+    workers: Vec<Arc<dyn Worker>>,
+    /// Pre-existing session map to share with the leader. No setter in the
+    /// builder impl assigns it, so `build` normally creates a fresh map.
+    sessions: Option<Arc<DashMap<SessionId, OnboardSessionTx>>>,
+    /// Initial list of remote leaders; defaults to empty.
+    remote_leaders: Option<Vec<InstanceId>>,
+    /// Worker layout metadata collected ahead of time (optional).
+    cached_worker_metadata: Option<Vec<SerializedLayout>>,
+    /// Object storage client for G4 operations (optional).
+    object_client: Option<Arc<dyn ObjectBlockOps>>,
+}
+
+impl InstanceLeaderBuilder {
+    /// Initialize the builder with components from a `KvbmRuntime`.
+    ///
+    /// This takes the messenger from the runtime. Use it when the runtime has
+    /// already been constructed and the leader should share the same
+    /// messenger for distributed communication.
+    ///
+    /// # Example
+    /// ```ignore
+    /// let runtime = KvbmRuntime::from_env_leader().await?;
+    /// let leader = InstanceLeaderBuilder::default()
+    ///     .with_runtime(&runtime)
+    ///     .g2_manager(g2_manager)
+    ///     .build()?;
+    /// ```
+    pub fn with_runtime(self, runtime: &crate::KvbmRuntime) -> Self {
+        self.messenger(runtime.messenger().clone())
+    }
+
+    /// Set the messenger used for distributed communication (required).
+    pub fn messenger(mut self, messenger: Arc<Messenger>) -> Self {
+        self.messenger = Some(messenger);
+        self
+    }
+
+    /// Set the block registry (required).
+    pub fn registry(mut self, registry: BlockRegistry) -> Self {
+        self.registry = Some(registry);
+        self
+    }
+
+    /// Set or clear the G2 manager from an owned, optional value.
+    ///
+    /// Passing `None` overwrites any manager configured earlier.
+    pub fn with_g2_manager(mut self, manager: Option<BlockManager<G2>>) -> Self {
+        self.g2_manager = manager.map(Arc::new);
+        self
+    }
+
+    /// Set or clear the G3 manager from an owned, optional value.
+    ///
+    /// Passing `None` overwrites any manager configured earlier.
+    pub fn with_g3_manager(mut self, manager: Option<BlockManager<G3>>) -> Self {
+        self.g3_manager = manager.map(Arc::new);
+        self
+    }
+
+    /// Set the G2 (host memory) manager from an already shared handle (required).
+    pub fn g2_manager(mut self, manager: Arc<BlockManager<G2>>) -> Self {
+        self.g2_manager = Some(manager);
+        self
+    }
+
+    /// Set the G3 (disk) manager from an already shared handle.
+    pub fn g3_manager(mut self, manager: Arc<BlockManager<G3>>) -> Self {
+        self.g3_manager = Some(manager);
+        self
+    }
+
+    /// Add a single worker (convenience method).
+    pub fn worker(mut self, worker: Arc<dyn Worker>) -> Self {
+        self.workers.push(worker);
+        self
+    }
+
+    /// Set all workers at once, replacing any added previously.
+    pub fn workers(mut self, workers: Vec<Arc<dyn Worker>>) -> Self {
+        self.workers = workers;
+        self
+    }
+
+    /// Set the initial list of remote leader instance IDs.
+    pub fn remote_leaders(mut self, leaders: Vec<InstanceId>) -> Self {
+        self.remote_leaders = Some(leaders);
+        self
+    }
+
+    /// Cache worker metadata upfront to avoid querying workers later.
+    ///
+    /// This is useful when workers have already exported metadata during initialization
+    /// (e.g., in the connector pattern where workers return metadata in their init response).
+    pub fn with_cached_worker_metadata(mut self, metadata: Vec<SerializedLayout>) -> Self {
+        self.cached_worker_metadata = Some(metadata);
+        self
+    }
+
+    /// Set the object storage client for G4 search and load operations.
+    ///
+    /// The leader uses this client to:
+    /// - Query S3 for block presence via `has_blocks`
+    /// - Coordinate workers to load blocks from S3 via `get_blocks`
+    pub fn object_client(mut self, client: Arc<dyn ObjectBlockOps>) -> Self {
+        self.object_client = Some(client);
+        self
+    }
+
+    /// Consume the builder and construct the [`InstanceLeader`].
+    ///
+    /// # Errors
+    /// Returns an error when the messenger, block registry or G2 manager was
+    /// not provided, or when called outside of a tokio runtime (the current
+    /// runtime handle is handed to the parallel worker).
+    pub fn build(self) -> Result<InstanceLeader> {
+        // Validate every required component first so a misconfigured builder
+        // fails before any worker-related objects are constructed.
+        // NOTE: the message text keeps its historical "Nova" wording so that
+        // anything matching on it keeps working.
+        let messenger = self
+            .messenger
+            .ok_or_else(|| anyhow::anyhow!("Nova instance required"))?;
+        let registry = self
+            .registry
+            .ok_or_else(|| anyhow::anyhow!("block registry required"))?;
+        let g2_manager = self
+            .g2_manager
+            .ok_or_else(|| anyhow::anyhow!("g2_manager required"))?;
+
+        // `Handle::current()` panics outside a runtime; since this function
+        // already returns `Result`, surface that condition as an error instead.
+        let runtime = tokio::runtime::Handle::try_current().map_err(|e| {
+            anyhow::anyhow!(
+                "InstanceLeader must be built from within a tokio runtime: {}",
+                e
+            )
+        })?;
+
+        let transport = Arc::new(MessageTransport::velo(messenger.clone()));
+
+        // Create event system for notification aggregation
+        let events = Arc::new(messenger.event_manager());
+
+        // Workers are optional: without any, no parallel worker is created and
+        // the resulting leader reports `has_parallel_worker() == false`.
+        //
+        // todo: we will need a common builder pattern for creating "general" parallel workers
+        // - we could also use an enum and match as the number of types will be limited
+        let parallel_worker: Option<Arc<dyn ParallelWorkers>> = if self.workers.is_empty() {
+            None
+        } else {
+            Some(Arc::new(SpmdParallelWorkers::new(
+                self.workers.clone(),
+                events,
+                runtime,
+            )))
+        };
+
+        Ok(InstanceLeader {
+            messenger,
+            registry,
+            g2_manager,
+            g3_manager: self.g3_manager,
+            workers: self.workers,
+            parallel_worker,
+            cached_worker_metadata: self.cached_worker_metadata,
+            sessions: self.sessions.unwrap_or_else(|| Arc::new(DashMap::new())),
+            session_states: Arc::new(DashMap::new()),
+            remote_leaders: Arc::new(std::sync::RwLock::new(
+                self.remote_leaders.unwrap_or_default(),
+            )),
+            transport,
+            session_sessions: Arc::new(DashMap::new()),
+            object_client: self.object_client,
+        })
+    }
+}
+
+/// Internal session state for holding matched blocks.
+///
+/// Instances are stored in `InstanceLeader::session_states`, keyed by
+/// `session_id`; the held blocks are dropped when the entry is removed.
+#[allow(dead_code)] // Used for RAII block lifetime management
+struct SessionState {
+    /// Identifier of the session; also used as the map key when stored.
+    session_id: SessionId,
+    /// G2 blocks held for the duration of the session.
+    matched_g2_blocks: Vec<ImmutableBlock<G2>>,
+    /// G3 blocks held for the duration of the session.
+    matched_g3_blocks: Vec<ImmutableBlock<G3>>,
+    /// Sender half of the watch channel carrying `OnboardingStatus` values.
+    status_tx: watch::Sender<OnboardingStatus>,
+}
+
+/// Result of scanning for blocks across tiers.
+///
+/// Unlike `FindMatchesResult`, this scans all given hashes without stopping on first miss.
+/// Returns blocks found in each tier along with their sorted positions.
+///
+/// Produced by `InstanceLeader::scan_blocks`, which only looks in G3 for
+/// hashes that were not found in G2, so a hash appears in at most one map.
+pub struct ScanBlocksResult {
+    /// Blocks found in G2 (host memory).
+    pub g2_blocks: HashMap<SequenceHash, ImmutableBlock<G2>>,
+
+    /// Blocks found in G3 (disk).
+    pub g3_blocks: HashMap<SequenceHash, ImmutableBlock<G3>>,
+
+    /// All found blocks sorted by position (lowest to highest).
+    /// Each entry indicates which tier (G2/G3) the block was found in.
+    pub sorted_matches: Vec<(SequenceHash, LogicalLayoutHandle)>,
+}
+
+impl InstanceLeader {
+    /// Borrow the shared handle to the G2 (host memory) `BlockManager`.
+    pub fn g2_manager(&self) -> &Arc<BlockManager<G2>> {
+        let Self { g2_manager, .. } = self;
+        g2_manager
+    }
+
+    /// Borrow the G3 (disk) `BlockManager`, or `None` when no G3 tier is configured.
+    pub fn g3_manager(&self) -> Option<&Arc<BlockManager<G3>>> {
+        Option::as_ref(&self.g3_manager)
+    }
+
+    /// Borrow the block registry this leader was built with.
+    pub fn registry(&self) -> &BlockRegistry {
+        let Self { registry, .. } = self;
+        registry
+    }
+
+    /// Borrow the messenger used for distributed communication.
+    ///
+    /// The same messenger backs the session transport and the RPC handlers
+    /// registered by this leader.
+    pub fn messenger(&self) -> &Arc<Messenger> {
+        let Self { messenger, .. } = self;
+        messenger
+    }
+
+    /// Return a clone of the tokio runtime handle exposed by the messenger.
+    ///
+    /// Use this handle to spawn background tasks that should run on the KVBM
+    /// runtime's executor (e.g., offload engine pipelines).
+    pub fn runtime(&self) -> tokio::runtime::Handle {
+        let handle = self.messenger.runtime();
+        handle.clone()
+    }
+
+    /// Report whether this leader was built with a parallel worker.
+    ///
+    /// A parallel worker is needed for local transfer operations
+    /// (e.g., offloading blocks between tiers).
+    pub fn has_parallel_worker(&self) -> bool {
+        let configured = &self.parallel_worker;
+        configured.is_some()
+    }
+
+    /// Return a shared handle to the parallel worker, if one is configured.
+    ///
+    /// Only the `Arc` is cloned; the underlying worker group is shared.
+    pub fn parallel_worker(&self) -> Option<Arc<dyn ParallelWorkers>> {
+        self.parallel_worker.as_ref().map(Arc::clone)
+    }
+
+    /// Return a shared handle to the object storage (G4) client.
+    ///
+    /// Yields `Some` when object storage was configured on the builder and
+    /// `None` otherwise. Only the `Arc` is cloned.
+    pub fn object_client(&self) -> Option<Arc<dyn ObjectBlockOps>> {
+        self.object_client.as_ref().map(Arc::clone)
+    }
+
+    /// Register a remote leader, ignoring IDs that are already present.
+    ///
+    /// Remote leaders are queried during `find_matches_with_options` when
+    /// `search_remote == true`. This allows adding leaders after construction
+    /// (e.g., when instance IDs are only known after cluster setup).
+    ///
+    /// # Panics
+    /// Panics if the remote-leader lock is poisoned.
+    pub fn add_remote_leader(&self, instance_id: InstanceId) {
+        let mut known = self.remote_leaders.write().unwrap();
+        let already_registered = known.iter().any(|id| *id == instance_id);
+        if already_registered {
+            return;
+        }
+        known.push(instance_id);
+    }
+
+    /// Replace the whole remote-leader list with `instance_ids`.
+    ///
+    /// # Panics
+    /// Panics if the remote-leader lock is poisoned.
+    pub fn set_remote_leaders(&self, instance_ids: Vec<InstanceId>) {
+        *self.remote_leaders.write().unwrap() = instance_ids;
+    }
+
+    /// Return a snapshot copy of the remote leader instance IDs.
+    ///
+    /// # Panics
+    /// Panics if the remote-leader lock is poisoned.
+    pub fn remote_leaders(&self) -> Vec<InstanceId> {
+        let current = self.remote_leaders.read().unwrap();
+        current.to_vec()
+    }
+
+    /// Scan both tiers for every block matching any of the given hashes.
+    ///
+    /// Differences from `find_matches`:
+    /// - a miss does NOT end the scan
+    /// - G2 and G3 hits are returned in separate maps
+    /// - the returned blocks are owned by the caller until dropped
+    /// - `sorted_matches` is ordered by `SequenceHash::position()`
+    ///
+    /// # Arguments
+    /// * `sequence_hashes` - Hashes to scan for
+    /// * `touch` - Forwarded to each manager's `scan_matches` (frequency
+    ///   tracking for the eviction policy)
+    ///
+    /// # Algorithm
+    /// 1. Scan the G2 manager with all hashes
+    /// 2. Scan the G3 manager (when present) with the hashes G2 did not have
+    /// 3. Merge both key sets and sort them by position, lowest first
+    pub fn scan_blocks(&self, sequence_hashes: &[SequenceHash], touch: bool) -> ScanBlocksResult {
+        let g2_blocks = self.g2_manager.scan_matches(sequence_hashes, touch);
+
+        // G3 is only consulted for hashes that G2 could not serve, and only
+        // when there is at least one such hash.
+        let g3_blocks = match self.g3_manager.as_ref() {
+            Some(g3) => {
+                let misses: Vec<SequenceHash> = sequence_hashes
+                    .iter()
+                    .copied()
+                    .filter(|hash| !g2_blocks.contains_key(hash))
+                    .collect();
+                if misses.is_empty() {
+                    HashMap::new()
+                } else {
+                    g3.scan_matches(&misses, touch)
+                }
+            }
+            None => HashMap::new(),
+        };
+
+        // G2 entries first, then G3 entries; the stable sort below orders
+        // them by position.
+        let mut sorted_matches: Vec<(SequenceHash, LogicalLayoutHandle)> = g2_blocks
+            .keys()
+            .map(|hash| (*hash, LogicalLayoutHandle::G2))
+            .chain(
+                g3_blocks
+                    .keys()
+                    .map(|hash| (*hash, LogicalLayoutHandle::G3)),
+            )
+            .collect();
+        sorted_matches.sort_by_key(|(hash, _)| hash.position());
+
+        ScanBlocksResult {
+            g2_blocks,
+            g3_blocks,
+            sorted_matches,
+        }
+    }
+
+    /// Run a caller-supplied scanning policy and collect what it yields.
+    ///
+    /// The policy is handed the hash slice and a `PolicyContext`, through
+    /// which it can look up blocks (`ctx.accessor()`) and emit results
+    /// incrementally. The iteration strategy is entirely up to the policy.
+    ///
+    /// # Arguments
+    /// * `hashes` - Sequence hashes to scan
+    /// * `touch` - Whether to update frequency tracking on block access
+    /// * `policy` - Function that implements the scanning strategy
+    ///
+    /// # Returns
+    /// The items accumulated in the context once `policy` returns.
+    ///
+    /// # Design
+    ///
+    /// The accessor does NOT hold locks between calls. Each `.find()` call is
+    /// independent. This enables:
+    /// - Custom iteration patterns (sorted, BTree scan, binary search, etc.)
+    /// - Yielding results incrementally (e.g., contiguous subsequences)
+    /// - Future parallel execution (accessor is Send + Sync)
+    ///
+    /// # Example: Simple linear scan
+    /// ```ignore
+    /// let blocks = leader.scan_with_policy(&hashes, true, |hashes, ctx| {
+    ///     for hash in hashes {
+    ///         if let Some(block) = ctx.accessor().find(*hash) {
+    ///             ctx.yield_item(block);
+    ///         }
+    ///     }
+    /// });
+    /// ```
+    ///
+    /// # Example: Find contiguous subsequences
+    /// ```ignore
+    /// let runs: Vec<Vec<TieredBlock>> = leader.scan_with_policy(&hashes, true, |hashes, ctx| {
+    ///     let mut run = Vec::new();
+    ///     let mut last_pos: Option<u64> = None;
+    ///
+    ///     for hash in hashes.iter().sorted_by_key(|h| h.position()) {
+    ///         if let Some(block) = ctx.accessor().find(*hash) {
+    ///             let pos = block.position();
+    ///             if last_pos.map_or(true, |p| pos == p + 1) {
+    ///                 run.push(block);
+    ///             } else {
+    ///                 if !run.is_empty() { ctx.yield_item(std::mem::take(&mut run)); }
+    ///                 run.push(block);
+    ///             }
+    ///             last_pos = Some(pos);
+    ///         } else if !run.is_empty() {
+    ///             ctx.yield_item(std::mem::take(&mut run));
+    ///             last_pos = None;
+    ///         }
+    ///     }
+    ///     if !run.is_empty() { ctx.yield_item(run); }
+    /// });
+    /// ```
+    pub fn scan_with_policy<F, T>(&self, hashes: &[SequenceHash], touch: bool, policy: F) -> Vec<T>
+    where
+        F: FnOnce(&[SequenceHash], &mut PolicyContext<T>),
+    {
+        // The context starts with an empty result list; the policy fills it.
+        let mut context = PolicyContext {
+            accessor: BlockAccessor::new(self, touch),
+            results: Vec::new(),
+        };
+        policy(hashes, &mut context);
+        context.results
+    }
+
+    /// Start building an `InstanceLeader` from an empty builder.
+    pub fn builder() -> InstanceLeaderBuilder {
+        Default::default()
+    }
+
+    /// Register messenger RPC handlers for leader-to-leader communication.
+    ///
+    /// This must be called after construction to enable distributed onboarding.
+    ///
+    /// Two callbacks are handed to `VeloLeaderService`:
+    /// - a responder spawner, invoked with an `OnboardMessage::CreateSession`,
+    ///   which registers a channel in `sessions` and spawns a `ResponderSession`;
+    /// - an optional metadata exporter, present only when the leader has
+    ///   workers or cached worker metadata.
+    ///
+    /// # Errors
+    /// Propagates any error returned by `VeloLeaderService::register_handlers`.
+    pub fn register_handlers(&self) -> Result<()> {
+        // Clone everything the responder closure needs so it owns its captures.
+        let instance_id = self.messenger.instance_id();
+        let g2_manager = self.g2_manager.clone();
+        let g3_manager = self.g3_manager.clone();
+        let parallel_worker = self.parallel_worker.clone();
+        let transport = self.transport.clone();
+        let sessions = self.sessions.clone();
+
+        let spawn_responder = move |msg: OnboardMessage| -> Result<()> {
+            if let OnboardMessage::CreateSession {
+                requester,
+                session_id,
+                sequence_hashes,
+            } = msg
+            {
+                // Register the sender before spawning so the session can be
+                // looked up by id as soon as this closure returns.
+                let (tx, rx) = mpsc::channel(100);
+                sessions.insert(session_id, tx);
+
+                let session = ResponderSession::new(
+                    session_id,
+                    instance_id,
+                    requester,
+                    g2_manager.clone(),
+                    g3_manager.clone(),
+                    parallel_worker.clone(),
+                    transport.clone(),
+                );
+
+                // NOTE(review): unlike the server-session tasks in this impl,
+                // this task does not remove its `sessions` entry when `run`
+                // finishes. Presumably cleanup happens elsewhere (e.g. via
+                // `release_session`) - confirm entries cannot accumulate.
+                tokio::spawn(async move {
+                    if let Err(e) = session.run(rx, sequence_hashes).await {
+                        tracing::warn!(error = %e, "ResponderSession error");
+                    }
+                });
+
+                Ok(())
+            } else {
+                anyhow::bail!("spawn_responder called with non-CreateSession message")
+            }
+        };
+
+        // Create export_metadata callback if we have workers or cached metadata
+        let export_metadata_callback: Option<ExportMetadataCallback> =
+            if !self.workers.is_empty() || self.cached_worker_metadata.is_some() {
+                let workers = self.workers.clone();
+                let cached_metadata = self.cached_worker_metadata.clone();
+                Some(Arc::new(move || {
+                    // Clone again per invocation: the returned future must own
+                    // its data while the callback stays reusable.
+                    let workers = workers.clone();
+                    let cached_metadata = cached_metadata.clone();
+                    Box::pin(async move {
+                        // Return cached metadata if available
+                        if let Some(cached) = cached_metadata {
+                            return Ok(cached);
+                        }
+                        // Otherwise, query workers one at a time, in order;
+                        // the first failure aborts the export.
+                        let mut metadata = Vec::with_capacity(workers.len());
+                        for worker in &workers {
+                            let serialized = worker.export_metadata()?.await?;
+                            metadata.push(serialized);
+                        }
+                        Ok(metadata)
+                    })
+                }))
+            } else {
+                None
+            };
+
+        let mut service = VeloLeaderService::new(self.messenger.clone(), self.sessions.clone())
+            .with_spawn_responder(spawn_responder)
+            .with_session_sessions(self.session_sessions.clone());
+
+        if let Some(callback) = export_metadata_callback {
+            service = service.with_export_metadata(callback);
+        }
+
+        service.register_handlers()?;
+
+        Ok(())
+    }
+
+    /// Record a session's state (held blocks and status channel) under its id.
+    ///
+    /// The blocks stay alive until the entry is removed from the map again.
+    /// An existing entry for the same id is replaced.
+    fn store_session_state(&self, state: SessionState) {
+        let key = state.session_id;
+        self.session_states.insert(key, state);
+    }
+
+    /// Forget everything this leader tracks for `session_id`.
+    ///
+    /// Removes the stored session state (dropping any held blocks) and the
+    /// session's entries in both message-channel maps. Calling this is
+    /// optional; use it when blocks must be released before the leader's
+    /// shared state is dropped. Unknown ids are ignored.
+    pub fn release_session(&self, session_id: SessionId) {
+        let id = &session_id;
+        // State first, so held blocks are released before the channels go away.
+        self.session_states.remove(id);
+        self.sessions.remove(id);
+        self.session_sessions.remove(id);
+    }
+
+    // ========================================================================
+    // Inverted Control Pattern (Prefill-Decode) Methods
+    // ========================================================================
+
+    /// Create a controllable session for locally cached blocks using the
+    /// default `ControllableSessionOptions`.
+    ///
+    /// This is the "Decode side" of the inverted control pattern:
+    /// 1. Search local G2 and G3 for matches
+    /// 2. Create a session that holds the matched blocks
+    /// 3. Return the session id, to be sent to Prefill out-of-band
+    ///
+    /// See `create_controllable_session_with_options` to override the options
+    /// (e.g. auto-staging).
+    pub fn create_controllable_session(
+        &self,
+        sequence_hashes: &[SequenceHash],
+    ) -> Result<ControllableSessionResult> {
+        let default_options = ControllableSessionOptions::default();
+        self.create_controllable_session_with_options(sequence_hashes, default_options)
+    }
+
+    /// Create a controllable session with custom options.
+    ///
+    /// Use this when you need to control auto-staging behavior.
+    ///
+    /// Steps:
+    /// 1. Match `sequence_hashes` against the local G2 manager.
+    /// 2. Match the hashes G2 did not return against G3 (when configured).
+    /// 3. Register a session message channel under a fresh session id.
+    /// 4. Spawn a `ServerSession` task that holds the matched blocks and
+    ///    removes the channel registration when it finishes.
+    ///
+    /// # Returns
+    /// The new session id together with the number of G2 and G3 blocks matched.
+    pub fn create_controllable_session_with_options(
+        &self,
+        sequence_hashes: &[SequenceHash],
+        options: ControllableSessionOptions,
+    ) -> Result<ControllableSessionResult> {
+        let session_id = SessionId::from(Uuid::new_v4());
+
+        // Local search only
+        let matched_g2_blocks = self.g2_manager.match_blocks(sequence_hashes);
+
+        // Find remaining hashes not in G2. A set lookup keeps this linear in
+        // the number of hashes instead of rescanning the matched blocks for
+        // every requested hash.
+        let g2_hashes: std::collections::HashSet<SequenceHash> = matched_g2_blocks
+            .iter()
+            .map(|block| block.sequence_hash())
+            .collect();
+        let remaining_hashes: Vec<SequenceHash> = sequence_hashes
+            .iter()
+            .copied()
+            .filter(|hash| !g2_hashes.contains(hash))
+            .collect();
+
+        // Search G3 for remaining hashes
+        let matched_g3_blocks = if let Some(ref g3_manager) = self.g3_manager {
+            g3_manager.match_blocks(&remaining_hashes)
+        } else {
+            Vec::new()
+        };
+
+        let local_g2_count = matched_g2_blocks.len();
+        let local_g3_count = matched_g3_blocks.len();
+
+        // Create session channel using unified SessionMessage protocol
+        let (tx, rx) = session_message_channel(100);
+        self.session_sessions.insert(session_id, tx);
+
+        // Collect G2 layout handles from workers for round-robin block allocation
+        let worker_g2_handles: Vec<LayoutHandle> = self
+            .parallel_worker
+            .as_ref()
+            .map(|pw| pw.workers().iter().filter_map(|w| w.g2_handle()).collect())
+            .unwrap_or_default();
+
+        let endpoint = super::session::SessionEndpoint::new(
+            session_id,
+            self.messenger.instance_id(),
+            self.transport.clone(),
+            rx,
+        );
+
+        let (cmd_tx, cmd_rx) = mpsc::channel(16);
+
+        let session = ServerSession::new_with_staging(
+            endpoint,
+            BlockHolder::new(matched_g2_blocks),
+            BlockHolder::new(matched_g3_blocks),
+            worker_g2_handles,
+            self.g2_manager.clone(),
+            self.parallel_worker.clone(),
+            cmd_rx,
+            ServerSessionOptions {
+                auto_stage: options.auto_stage,
+            },
+        );
+
+        // The handle owns the command sender; it is moved into the task below
+        // so the command channel stays open for as long as the session runs.
+        let handle = ServerSessionHandle::new(session_id, self.messenger.instance_id(), cmd_tx);
+
+        // Spawn session task
+        let session_sessions = self.session_sessions.clone();
+        tokio::spawn(async move {
+            let _keep_cmd_channel_open = handle;
+            if let Err(e) = session.run().await {
+                tracing::warn!(error = %e, "ServerSession error");
+            }
+            // Clean up when session completes
+            session_sessions.remove(&session_id);
+        });
+
+        Ok(ControllableSessionResult {
+            session_id,
+            local_g2_count,
+            local_g3_count,
+        })
+    }
+
+    // ========================================================================
+    // Unified Session Protocol
+    // ========================================================================
+
+    /// Attach to a remote session.
+    /// Returns a `SessionHandle` that uses `SessionMessage` for communication.
+    ///
+    /// A local message channel is registered for `session_id` and a receiver
+    /// task is spawned before the `Attach` message is sent, so replies that
+    /// arrive immediately can already be routed.
+    ///
+    /// # Arguments
+    /// * `remote_instance` - The instance hosting the session
+    /// * `session_id` - The session to attach to
+    ///
+    /// # Errors
+    /// Returns the transport error when the `Attach` message cannot be sent.
+    /// In that case the local channel registration is removed again.
+    ///
+    /// # Example
+    /// ```ignore
+    /// let handle = leader.attach_session(remote_id, session_id).await?;
+    /// let state = handle.wait_for_ready().await?;
+    /// handle.trigger_staging().await?;
+    /// ```
+    pub async fn attach_session(
+        &self,
+        remote_instance: InstanceId,
+        session_id: SessionId,
+    ) -> Result<SessionHandle> {
+        // Create local channel for receiving state updates
+        let (state_tx, state_rx) = session_handle_state_channel();
+
+        // Register handler for this session's messages
+        let (msg_tx, msg_rx) = session_message_channel(100);
+        self.session_sessions.insert(session_id, msg_tx);
+
+        // Spawn receiver task to update state
+        tokio::spawn(Self::run_session_receiver(msg_rx, state_tx));
+
+        // Send attach message using new protocol
+        let msg = SessionMessage::Attach {
+            peer: self.messenger.instance_id(),
+            session_id,
+            as_role: ControlRole::Controller,
+        };
+        let sent = self.transport.send_session(remote_instance, msg).await;
+        if sent.is_err() {
+            // The attach never reached the peer, so nothing will ever use the
+            // registration made above; remove it instead of leaking the entry.
+            // Dropping the sender presumably also lets the receiver task exit.
+            self.session_sessions.remove(&session_id);
+        }
+        sent?;
+
+        let mut handle = SessionHandle::new(
+            session_id,
+            remote_instance,
+            self.messenger.instance_id(),
+            self.transport.clone(),
+            state_rx,
+        );
+
+        // Add RDMA support if parallel worker is configured
+        if let Some(parallel_worker) = &self.parallel_worker {
+            handle = handle.with_rdma_support(parallel_worker.clone());
+        }
+
+        Ok(handle)
+    }
+
+    // ========================================================================
+    // Endpoint Session Creation (Server-Side)
+    // ========================================================================
+
+    /// Create an endpoint session that a remote peer can attach to.
+    ///
+    /// Local G2 is searched for blocks matching `sequence_hashes`; the matched
+    /// blocks are held by a server session that exposes them for remote pull.
+    ///
+    /// Returns `(session_id, handle)` where:
+    /// - `session_id` - Send to remote peer for attachment
+    /// - `handle` - Use to control the session (send layer notifications, close)
+    ///
+    /// # Errors
+    /// Fails when no worker provides a G2 layout handle (including when the
+    /// leader has no parallel worker at all).
+    ///
+    /// # Example
+    /// ```ignore
+    /// // Create session for sequence hashes
+    /// let (session_id, handle) = leader.create_endpoint_session(&hashes)?;
+    ///
+    /// // Send session_id to remote peer out-of-band
+    /// // Remote attaches via: remote_leader.attach_session(local_id, session_id)
+    ///
+    /// // For layerwise transfer, notify when layers are ready
+    /// handle.notify_layers_ready(0..1).await?;
+    /// ```
+    pub fn create_endpoint_session(
+        &self,
+        sequence_hashes: &[SequenceHash],
+    ) -> Result<(SessionId, ServerSessionHandle)> {
+        let session_id = SessionId::from(uuid::Uuid::new_v4());
+
+        // Local search
+        let held_blocks = self.g2_manager.match_blocks(sequence_hashes);
+
+        // Every matched block is tagged with the first G2 layout handle any
+        // worker reports (single-worker assumption).
+        // TODO: For SPMD, map blocks to worker handles based on block assignment
+        let first_handle: LayoutHandle = self
+            .parallel_worker
+            .as_ref()
+            .and_then(|pw| pw.workers().iter().find_map(|w| w.g2_handle()))
+            .ok_or_else(|| anyhow::anyhow!("No G2 layout handle available from workers"))?;
+        let layout_handles: Vec<LayoutHandle> = vec![first_handle; held_blocks.len()];
+
+        // Hashes of the blocks that were actually found, in match order
+        let matched_hashes: Vec<SequenceHash> =
+            held_blocks.iter().map(|block| block.sequence_hash()).collect();
+
+        // Register the sender half so incoming messages for this session can be routed
+        let (msg_tx, msg_rx) = session_message_channel(100);
+        self.session_sessions.insert(session_id, msg_tx);
+
+        // Build the session (which takes ownership of the blocks) and its control handle
+        let (session, handle) = create_server_session(
+            session_id,
+            self.messenger.instance_id(),
+            BlockHolder::new(held_blocks),
+            layout_handles,
+            matched_hashes,
+            self.transport.clone(),
+            msg_rx,
+        );
+
+        // Run the session in the background and drop its registration once it ends
+        let registrations = self.session_sessions.clone();
+        tokio::spawn(async move {
+            if let Err(e) = session.run().await {
+                tracing::warn!(error = %e, "ServerSession error");
+            }
+            registrations.remove(&session_id);
+        });
+
+        Ok((session_id, handle))
+    }
+
+    /// Create an endpoint session for specific pre-allocated blocks.
+    ///
+    /// Unlike `create_endpoint_session`, this doesn't search - it uses the
+    /// provided blocks directly. Useful when the caller already has blocks
+    /// to expose (e.g., after prefill computation).
+    ///
+    /// # Arguments
+    /// * `blocks` - Blocks to expose for RDMA pull
+    /// * `sequence_hashes` - Sequence hashes for the blocks (must match block count)
+    /// * `layout_handles` - Layout handles for the blocks (must match block count)
+    ///
+    /// # Returns
+    /// The freshly generated session id together with the handle produced by
+    /// `create_server_session`.
+    ///
+    /// # Errors
+    /// Returns an error if `sequence_hashes` and `layout_handles` have different
+    /// lengths. Both are documented as one entry per block, so a mismatch means
+    /// the per-block metadata is inconsistent.
+    ///
+    /// # Panics
+    /// The session task is started with `tokio::spawn`, which panics when called
+    /// outside of a Tokio runtime context.
+    ///
+    /// # Example
+    /// ```ignore
+    /// // After prefill computation, expose blocks for Decode to pull
+    /// let (session_id, handle) = leader.create_endpoint_session_for_blocks(
+    ///     prefill_blocks,
+    ///     &hashes,
+    ///     &layout_handles,
+    /// )?;
+    /// ```
+    pub fn create_endpoint_session_for_blocks(
+        &self,
+        blocks: BlockHolder<G2>,
+        sequence_hashes: &[SequenceHash],
+        layout_handles: &[LayoutHandle],
+    ) -> Result<(SessionId, ServerSessionHandle)> {
+        // Validate before any state is registered so that a rejected call leaves
+        // nothing behind in the session map.
+        anyhow::ensure!(
+            sequence_hashes.len() == layout_handles.len(),
+            "sequence_hashes ({}) and layout_handles ({}) must have the same length",
+            sequence_hashes.len(),
+            layout_handles.len()
+        );
+
+        let session_id = SessionId::from(uuid::Uuid::new_v4());
+
+        // Create the session channel and register its sender under the new id
+        let (msg_tx, msg_rx) = session_message_channel(100);
+        self.session_sessions.insert(session_id, msg_tx);
+
+        // Create the session and handle
+        let (session, handle) = create_server_session(
+            session_id,
+            self.messenger.instance_id(),
+            blocks,
+            layout_handles.to_vec(),
+            sequence_hashes.to_vec(),
+            self.transport.clone(),
+            msg_rx,
+        );
+
+        // Spawn the session task; it owns a clone of the session map so it can
+        // deregister itself once it finishes.
+        let session_sessions = self.session_sessions.clone();
+        tokio::spawn(async move {
+            if let Err(e) = session.run().await {
+                tracing::warn!(error = %e, "ServerSession error");
+            }
+            // Clean up when session completes (successfully or not)
+            session_sessions.remove(&session_id);
+        });
+
+        Ok((session_id, handle))
+    }
+
+    /// Internal: drain a session's inbound channel, mirroring each message into `state_tx`.
+    ///
+    /// The loop ends when the channel yields `None`, or right after an `Error`
+    /// or `Close` message has been applied. All other message kinds are ignored.
+    async fn run_session_receiver(
+        mut rx: mpsc::Receiver<SessionMessage>,
+        state_tx: super::session::SessionHandleStateTx,
+    ) {
+        loop {
+            let Some(incoming) = rx.recv().await else {
+                // `recv` returned `None`: nothing further will arrive.
+                return;
+            };
+
+            match incoming {
+                SessionMessage::StateResponse { state, .. } => {
+                    state_tx.update(state);
+                }
+                SessionMessage::BlocksStaged {
+                    staged_blocks,
+                    remaining,
+                    layer_range,
+                    ..
+                } => {
+                    state_tx.add_staged_blocks(staged_blocks, remaining, layer_range);
+                }
+                SessionMessage::Error { message, .. } => {
+                    // Terminal: record the failure and stop listening.
+                    tracing::warn!(%message, "Session error");
+                    state_tx.set_failed();
+                    return;
+                }
+                SessionMessage::Close { .. } => {
+                    // Terminal: mark the session complete and stop listening.
+                    state_tx.set_phase(SessionPhase::Complete);
+                    return;
+                }
+                _ => {
+                    // Control commands are sent by the controller, not received here.
+                }
+            }
+        }
+    }
+
+    /// Hand out another shared handle to the session map (for Nova handler registration).
+    #[expect(dead_code)]
+    pub(crate) fn session_sessions(&self) -> Arc<DashMap<SessionId, SessionMessageTx>> {
+        Arc::clone(&self.session_sessions)
+    }
+
+    // ========================================================================
+    // RDMA Metadata Management
+    // These methods handle layout metadata export/import for remote RDMA transfers.
+    // ========================================================================
+
+    /// Report whether metadata for `instance` is already known.
+    ///
+    /// The question is forwarded to the parallel worker's own
+    /// `has_remote_metadata`; when no parallel worker is configured the answer
+    /// is always `false`.
+    pub fn has_remote_metadata(&self, instance: InstanceId) -> bool {
+        self.parallel_worker
+            .as_ref()
+            .is_some_and(|pw| pw.has_remote_metadata(instance))
+    }
+
+    /// Number of entries in this leader's `workers` collection.
+    pub fn worker_count(&self) -> usize {
+        let attached = &self.workers;
+        attached.len()
+    }
+
+    /// Collect the serialized layout metadata of every worker.
+    ///
+    /// If `cached_worker_metadata` is populated, a clone of it is returned and
+    /// no worker is contacted. Otherwise each worker is asked in turn (one
+    /// request awaited before the next is issued) and the results are returned
+    /// in iteration order of `workers`.
+    ///
+    /// # Returns
+    /// Vector of serialized layouts, one per worker
+    ///
+    /// # Errors
+    /// Propagates the first failure from issuing or awaiting a worker's
+    /// `export_metadata` request.
+    pub async fn export_worker_metadata(&self) -> Result<Vec<SerializedLayout>> {
+        match &self.cached_worker_metadata {
+            // Fast path: serve from the cache.
+            Some(cached) => Ok(cached.clone()),
+            // Slow path: query the workers one after another.
+            None => {
+                let mut layouts = Vec::with_capacity(self.workers.len());
+                for worker in &self.workers {
+                    let pending = worker.export_metadata()?;
+                    layouts.push(pending.await?);
+                }
+                Ok(layouts)
+            }
+        }
+    }
+
+    /// Register a remote instance's worker layouts with the local parallel worker.
+    ///
+    /// After the usual guards, the metadata is handed to the parallel worker's
+    /// `connect_remote`, and the operation it returns is awaited to completion.
+    /// According to the original documentation the import is rank-aligned
+    /// (local worker N imports remote worker N's metadata); that pairing is
+    /// performed inside `connect_remote`, not here.
+    ///
+    /// # Arguments
+    /// * `remote_instance` - The instance ID of the remote leader
+    /// * `metadata` - Vector of SerializedLayout from remote workers (one per worker)
+    ///
+    /// # Errors
+    /// Returns an error if:
+    /// - No parallel worker configured
+    /// - Metadata was already imported for this instance
+    /// - `connect_remote` fails to start or to complete (the original docs name
+    ///   worker-count mismatches and per-worker import failures as causes)
+    pub async fn import_remote_metadata(
+        &self,
+        remote_instance: InstanceId,
+        metadata: Vec<SerializedLayout>,
+    ) -> Result<()> {
+        let Some(pw) = self.parallel_worker.as_ref() else {
+            anyhow::bail!("No parallel worker configured");
+        };
+
+        // Refuse a second import for the same remote instance.
+        anyhow::ensure!(
+            !pw.has_remote_metadata(remote_instance),
+            "Metadata already imported for instance {}",
+            remote_instance
+        );
+
+        // Kick off the connection, then wait for it to finish.
+        let pending = pw.connect_remote(remote_instance, metadata)?;
+        pending.await?;
+
+        Ok(())
+    }
+
+    // ========================================================================
+    // Private Worker Mirror Methods
+    // These methods execute operations across all workers and aggregate results.
+    // ========================================================================
+
+    /// Forward a local block transfer to the configured parallel worker.
+    ///
+    /// The block-id vectors are converted with `Arc::from` and passed, together
+    /// with the layout handles and options, to the parallel worker's
+    /// `execute_local_transfer`. Fanning out across workers and combining their
+    /// notifications is left to that implementation.
+    ///
+    /// # Errors
+    /// Fails with "No parallel worker configured" when none is set, or with
+    /// whatever error the parallel worker returns.
+    #[allow(dead_code)]
+    pub(crate) fn execute_local_transfer(
+        &self,
+        src: LogicalLayoutHandle,
+        dst: LogicalLayoutHandle,
+        src_block_ids: Vec<BlockId>,
+        dst_block_ids: Vec<BlockId>,
+        options: TransferOptions,
+    ) -> Result<TransferCompleteNotification> {
+        let Some(fanout) = self.parallel_worker.as_ref() else {
+            anyhow::bail!("No parallel worker configured");
+        };
+
+        let src_ids = Arc::from(src_block_ids);
+        let dst_ids = Arc::from(dst_block_ids);
+        fanout.execute_local_transfer(src, dst, src_ids, dst_ids, options)
+    }
+
+    /// Forward a remote onboard (remote source into a local layout) to the parallel worker.
+    ///
+    /// The destination block ids are converted with `Arc::from` and passed to the
+    /// parallel worker's `execute_remote_onboard`. Fanning out across workers and
+    /// combining their notifications is left to that implementation.
+    ///
+    /// # Errors
+    /// Fails with "No parallel worker configured" when none is set, or with
+    /// whatever error the parallel worker returns.
+    #[allow(dead_code)]
+    pub(crate) fn execute_remote_onboard(
+        &self,
+        src: RemoteDescriptor,
+        dst: LogicalLayoutHandle,
+        dst_block_ids: Vec<BlockId>,
+        options: TransferOptions,
+    ) -> Result<TransferCompleteNotification> {
+        let Some(fanout) = self.parallel_worker.as_ref() else {
+            anyhow::bail!("No parallel worker configured");
+        };
+
+        let dst_ids = Arc::from(dst_block_ids);
+        fanout.execute_remote_onboard(src, dst, dst_ids, options)
+    }
+
+    /// Forward a remote offload (local layout to a remote destination) to the parallel worker.
+    ///
+    /// The source block ids are converted with `Arc::from`. Note the argument
+    /// order expected by the parallel worker: `(src, src_block_ids, dst, options)`.
+    /// Fanning out across workers and combining their notifications is left to
+    /// that implementation.
+    ///
+    /// # Errors
+    /// Fails with "No parallel worker configured" when none is set, or with
+    /// whatever error the parallel worker returns.
+    #[allow(dead_code)]
+    pub(crate) fn execute_remote_offload(
+        &self,
+        src: LogicalLayoutHandle,
+        dst: RemoteDescriptor,
+        src_block_ids: Vec<BlockId>,
+        options: TransferOptions,
+    ) -> Result<TransferCompleteNotification> {
+        let Some(fanout) = self.parallel_worker.as_ref() else {
+            anyhow::bail!("No parallel worker configured");
+        };
+
+        let src_ids = Arc::from(src_block_ids);
+        fanout.execute_remote_offload(src, src_ids, dst, options)
+    }
+}
+
+impl Leader for InstanceLeader {
+    /// Look up `sequence_hashes` locally and, if requested, start a remote search.
+    ///
+    /// Flow:
+    /// 1. Match against G2 via `g2_manager.match_blocks`.
+    /// 2. If a G3 manager exists, match the hashes that G2 did not return against it.
+    /// 3. If there are no G3 matches and no remote search is needed, return
+    ///    `FindMatchesResult::Ready` holding the G2 blocks.
+    /// 4. Otherwise store a `SessionState` (keeping the matched blocks alive) and
+    ///    return `FindMatchesResult::AsyncSession`:
+    ///    - with `search_remote == false`, the status is set to `Complete`
+    ///      immediately and no session handle is returned;
+    ///    - otherwise an `InitiatorSession` is spawned on the messenger's runtime.
+    ///
+    /// # Panics
+    /// Panics if the `remote_leaders` lock is poisoned (`read().unwrap()`).
+    fn find_matches_with_options(
+        &self,
+        sequence_hashes: &[SequenceHash],
+        options: FindMatchesOptions,
+    ) -> Result<FindMatchesResult> {
+        // Search G2 (host memory) for matches
+        // Uses match_blocks which stops at first miss (implements "first hole" policy).
+        // This ensures we only find contiguous blocks from the start of the sequence.
+        // For distributed search, remote instances use scan_matches for broad coverage,
+        // then first-hole filtering is applied in InitiatorSession after aggregation.
+
+        // TODO: add explicit timing tracing around the G2 search
+        let matched_g2_blocks = self.g2_manager.match_blocks(sequence_hashes);
+
+        // Search G3 (disk) for remaining hashes if G3 is available
+        let matched_g3_blocks = if let Some(ref g3_manager) = self.g3_manager {
+            // Index the G2 hits once so each requested hash is classified with a
+            // single set lookup rather than a scan over every matched block.
+            // The leftover list is only built here because nothing else uses it.
+            let g2_hits: std::collections::HashSet<SequenceHash> = matched_g2_blocks
+                .iter()
+                .map(|b| b.sequence_hash())
+                .collect();
+            let remaining_hashes: Vec<SequenceHash> = sequence_hashes
+                .iter()
+                .filter(|h| !g2_hits.contains(*h))
+                .copied()
+                .collect();
+
+            // Uses match_blocks on remaining hashes (those not found in G2).
+            // Since G2 already applied first-hole policy, G3 search continues from where G2 stopped.
+            g3_manager.match_blocks(&remaining_hashes)
+        } else {
+            Vec::new()
+        };
+
+        // Determine if we can return immediately (Ready) or need async session
+        // Ready if:
+        //   - g3 blocks is empty
+        //   - AND NOT (search_remote AND has_remote_leaders)
+        //   - AND NOT (search_remote AND has_object_client)
+        //
+        // AsyncSession (is_ready=false) if:
+        //   - g3 is not empty, or
+        //   - search_remote is true AND (has_remote_leaders OR has_object_client)
+        let has_remote_leaders = !self.remote_leaders.read().unwrap().is_empty();
+        let has_object_client = self.object_client.is_some();
+        let needs_remote_search =
+            options.search_remote && (has_remote_leaders || has_object_client);
+        let is_ready = matched_g3_blocks.is_empty() && !needs_remote_search;
+
+        if is_ready {
+            // No session needed - blocks owned directly by ReadyResult (RAII)
+            return Ok(FindMatchesResult::Ready(ReadyResult::new(
+                matched_g2_blocks,
+            )));
+        }
+
+        // AsyncSession path: G3 blocks found or remote search enabled
+        let session_id = SessionId::from(Uuid::new_v4());
+        // Capture the counts now; the block vectors are moved into SessionState below.
+        let local_g2_count = matched_g2_blocks.len();
+        let local_g3_count = matched_g3_blocks.len();
+
+        // AsyncSession: staging locally and/or remote searching
+        let (status_tx, status_rx) = watch::channel(OnboardingStatus::Searching);
+        let all_g2_blocks = Arc::new(Mutex::new(None));
+
+        // Store session state to keep blocks alive
+        let state = SessionState {
+            session_id,
+            matched_g2_blocks,
+            matched_g3_blocks,
+            status_tx: status_tx.clone(),
+        };
+        self.store_session_state(state);
+
+        // If no remote search, handle local-only staging
+        if !options.search_remote {
+            // Local-only staging (Prepare or Full mode)
+            // TODO: Implement local G3→G2 staging
+            let total_matched = local_g2_count + local_g3_count;
+            // A send failure is deliberately ignored (`.ok()`).
+            status_tx
+                .send(OnboardingStatus::Complete {
+                    matched_blocks: total_matched,
+                })
+                .ok();
+
+            return Ok(FindMatchesResult::AsyncSession(AsyncSessionResult::new(
+                session_id,
+                status_rx,
+                all_g2_blocks,
+                None, // No session handle for local-only staging (yet)
+            )));
+        }
+
+        // Remote search path: register the sender half under the session id
+        let (tx, rx) = mpsc::channel(100);
+        self.sessions.insert(session_id, tx);
+
+        // Create control channel for Hold/Prepare modes
+        let (session_handle, control_rx) = if matches!(
+            options.staging_mode,
+            StagingMode::Hold | StagingMode::Prepare
+        ) {
+            let (control_tx, control_rx) = mpsc::channel(10);
+            let handle = LegacySessionHandle::new(session_id, options.staging_mode, control_tx);
+            (Some(handle), Some(control_rx))
+        } else {
+            (None, None)
+        };
+
+        let session = InitiatorSession::new(
+            session_id,
+            self.messenger.instance_id(),
+            options.staging_mode,
+            self.g2_manager.clone(),
+            self.g3_manager.clone(),
+            self.parallel_worker.clone(),
+            self.transport.clone(),
+            status_tx.clone(),
+            all_g2_blocks.clone(),
+            // Without a control channel, supply a placeholder receiver. Its sender
+            // is never bound, so it is dropped immediately and can never deliver
+            // a command.
+            control_rx.unwrap_or_else(|| {
+                let (_, rx) = mpsc::channel(1);
+                rx
+            }),
+            self.object_client.clone(),
+        );
+
+        // Snapshot the inputs the spawned task needs to own
+        let remote_leaders = self.remote_leaders.read().unwrap().clone();
+        let sequence_hashes = sequence_hashes.to_vec();
+
+        let handle = self.messenger.runtime();
+
+        handle.spawn(async move {
+            if let Err(e) = session.run(rx, remote_leaders, sequence_hashes).await {
+                tracing::warn!(error = %e, "InitiatorSession error");
+                // Try to update status to indicate error
+                status_tx
+                    .send(OnboardingStatus::Complete { matched_blocks: 0 })
+                    .ok();
+            }
+        });
+
+        Ok(FindMatchesResult::AsyncSession(AsyncSessionResult::new(
+            session_id,
+            status_rx,
+            all_g2_blocks,
+            session_handle,
+        )))
+    }
+}