feat: adding kvbm-engine (#6773)

Signed-off-by: Ryan Olson <rolson@nvidia.com>

feat: adding kvbm-engine (#6773)
Signed-off-by: Ryan Olson <rolson@nvidia.com>
008683d6 · Ryan Olson · GitHub · cf79c4fc · 008683d6 · 008683d6
Unverified Commit 008683d6 authored Apr 08, 2026 by Ryan Olson Committed by GitHub Apr 08, 2026
20 changed files
--- a/lib/kvbm-physical/src/transfer/context.rs
+++ b/lib/kvbm-physical/src/transfer/context.rs
@@ -14,7 +14,7 @@ use uuid::Uuid;

 use dynamo_memory::CudaMemPool;
 use dynamo_memory::nixl::{NixlAgent, NixlBackendConfig, XferRequest};
-use velo_events::EventManager;
+use velo::EventManager;

 use crate::manager::TransferManager;


--- a/lib/kvbm-physical/src/transfer/notifications/mod.rs
+++ b/lib/kvbm-physical/src/transfer/notifications/mod.rs
@@ -15,7 +15,7 @@ use tokio::sync::mpsc;
 use tokio::time::interval;
 use tracing::{error, warn};
 use uuid::Uuid;
-use velo_events::{EventHandle, EventManager};
+use velo::{EventHandle, EventManager};

 pub mod cuda_event;
 pub mod nixl_events;

--- a/lib/kvbm-physical/src/transfer/notifications/nixl_events.rs
+++ b/lib/kvbm-physical/src/transfer/notifications/nixl_events.rs
@@ -12,7 +12,7 @@ use tokio::sync::mpsc;
 use tokio::time::interval;
 use tracing::{error, warn};
 use uuid::Uuid;
-use velo_events::{EventHandle, EventManager};
+use velo::{EventHandle, EventManager};

 /// Registration message for NIXL notification-based transfer completion.
 pub struct RegisterNixlNotification {
@@ -276,7 +276,7 @@ mod tests {
    use std::collections::VecDeque;
    use std::sync::Mutex;
    use tokio::task::yield_now;
-    use velo_events::EventStatus;
+    use velo::EventStatus;

    // ── Mock notification source ────────────────────────────────────


--- a/lib/kvbm-physical/src/transfer/notifications/notification.rs
+++ b/lib/kvbm-physical/src/transfer/notifications/notification.rs
@@ -10,7 +10,7 @@ use std::{
    sync::Arc,
    task::{Context, Poll},
 };
-use velo_events::{Event, EventAwaiter, EventManager};
+use velo::{Event, EventAwaiter, EventManager};

 pub enum TransferAwaiter {
    Local(EventAwaiter),

--- a/lib/kvbm-physical/src/transfer/tests/local_transfers.rs
+++ b/lib/kvbm-physical/src/transfer/tests/local_transfers.rs
@@ -9,7 +9,6 @@
 //! - Different transfer strategies (Memcpy, CUDA H2D/D2H)

 use super::*;
-use crate::transfer::TransferCapabilities;
 use crate::transfer::executor::TransferOptionsInternal;
 use crate::transfer::executor::execute_transfer;
 use crate::transfer::{can_use_whole_block_transfer, validate_layout_compatibility};

--- a/lib/kvbm-physical/src/transfer/tests/mod.rs
+++ b/lib/kvbm-physical/src/transfer/tests/mod.rs
@@ -68,13 +68,8 @@ use crate::{
    },
 };
 use anyhow::Result;
-use cudarc::driver::sys::CUdevice_attribute_enum;
-use cudarc::driver::{CudaContext, CudaStream, LaunchConfig, PushKernelArg};
-use cudarc::nvrtc::{CompileOptions, compile_ptx_with_opts};
 use std::collections::HashMap;
 use std::ops::Range;
-use std::sync::{Arc, OnceLock};
-use std::time::{Duration, Instant};

 /// Layout kind for parameterized testing.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -404,169 +399,3 @@ pub fn verify_guard_blocks_unchanged(

    Ok(())
 }
-
-/// CUDA sleep kernel source code.
-const SLEEP_KERNEL_SRC: &str = r#"
-extern "C" __global__ void sleep_kernel(unsigned long long min_cycles) {
-    const unsigned long long start = clock64();
-    while ((clock64() - start) < min_cycles) {
-        asm volatile("");
-    }
-}
-"#;
-
-/// A reusable CUDA sleep utility for tests.
-///
-/// This struct provides a simple interface to execute GPU sleep operations
-/// with calibrated timing. It compiles the sleep kernel once per CUDA context
-/// and caches the calibration for reuse.
-///
-/// The calibration is conservative (prefers longer sleep durations over shorter)
-/// to ensure minimum sleep times are met.
-pub struct CudaSleep {
-    function: cudarc::driver::CudaFunction,
-    cycles_per_ms: f64,
-}
-
-impl CudaSleep {
-    /// Get or create a CudaSleep instance for the given CUDA context.
-    ///
-    /// This function uses lazy initialization and caches instances per device ID.
-    /// The first call for each device will compile the kernel and run calibration.
-    ///
-    /// # Arguments
-    /// * `cuda_ctx` - The CUDA context to use
-    ///
-    /// # Returns
-    /// A shared reference to the CudaSleep instance for this context's device.
-    pub fn for_context(cuda_ctx: &Arc<CudaContext>) -> Result<Arc<Self>> {
-        static INSTANCES: OnceLock<parking_lot::Mutex<HashMap<usize, Arc<CudaSleep>>>> =
-            OnceLock::new();
-
-        let instances = INSTANCES.get_or_init(|| parking_lot::Mutex::new(HashMap::new()));
-        let device_ordinal = cuda_ctx.ordinal();
-
-        // Fast path: check if instance already exists
-        {
-            let instances_guard = instances.lock();
-            if let Some(instance) = instances_guard.get(&device_ordinal) {
-                return Ok(Arc::clone(instance));
-            }
-        }
-
-        // Slow path: create new instance with calibration
-        let instance = Arc::new(Self::new(cuda_ctx)?);
-
-        // Store in cache
-        let mut instances_guard = instances.lock();
-        instances_guard
-            .entry(device_ordinal)
-            .or_insert_with(|| Arc::clone(&instance));
-
-        Ok(instance)
-    }
-
-    /// Create a new CudaSleep instance with calibration.
-    ///
-    /// This compiles the sleep kernel and runs a calibration loop to determine
-    /// the relationship between clock cycles and wall-clock time.
-    fn new(cuda_ctx: &Arc<CudaContext>) -> Result<Self> {
-        // Get device compute capability
-        let major = cuda_ctx
-            .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
-        let minor = cuda_ctx
-            .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
-
-        // Compile PTX for this device
-        let mut compile_opts = CompileOptions {
-            name: Some("sleep_kernel.cu".into()),
-            ..Default::default()
-        };
-        compile_opts
-            .options
-            .push(format!("--gpu-architecture=compute_{}{}", major, minor));
-        let ptx = compile_ptx_with_opts(SLEEP_KERNEL_SRC, compile_opts)?;
-        let module = cuda_ctx.load_module(ptx)?;
-        let function = module.load_function("sleep_kernel")?;
-
-        // Get device clock rate
-        let clock_rate_khz =
-            cuda_ctx.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_CLOCK_RATE)? as u64;
-
-        // Create a temporary stream for calibration
-        let stream = cuda_ctx.new_stream()?;
-
-        // Warm up to absorb JIT overhead
-        let warm_cycles = clock_rate_khz.saturating_mul(10).max(1);
-        Self::launch_kernel(&function, &stream, warm_cycles)?;
-        stream.synchronize()?;
-
-        // Run calibration loop
-        let desired_delay = Duration::from_millis(600);
-        let mut target_cycles = clock_rate_khz.saturating_mul(50).max(1); // ~50ms starting point
-        let mut actual_duration = Duration::ZERO;
-
-        for _ in 0..8 {
-            let start = Instant::now();
-            Self::launch_kernel(&function, &stream, target_cycles)?;
-            stream.synchronize()?;
-            actual_duration = start.elapsed();
-
-            if actual_duration >= desired_delay {
-                break;
-            }
-
-            target_cycles = target_cycles.saturating_mul(2);
-        }
-
-        // Calculate cycles per millisecond with conservative 20% margin
-        // (prefer longer sleeps over shorter)
-        let cycles_per_ms = if actual_duration.as_millis() > 0 {
-            (target_cycles as f64 / actual_duration.as_millis() as f64) * 1.2
-        } else {
-            clock_rate_khz as f64 // Fallback to clock rate
-        };
-
-        Ok(Self {
-            function,
-            cycles_per_ms,
-        })
-    }
-
-    /// Launch the sleep kernel with the specified number of cycles.
-    fn launch_kernel(
-        function: &cudarc::driver::CudaFunction,
-        stream: &Arc<CudaStream>,
-        cycles: u64,
-    ) -> Result<()> {
-        let launch_cfg = LaunchConfig {
-            grid_dim: (1, 1, 1),
-            block_dim: (1, 1, 1),
-            shared_mem_bytes: 0,
-        };
-
-        let mut launch = stream.launch_builder(function);
-        unsafe {
-            launch.arg(&cycles);
-            launch.launch(launch_cfg)?;
-        }
-
-        Ok(())
-    }
-
-    /// Launch a sleep operation on the given stream.
-    ///
-    /// This queues a GPU kernel that will sleep for approximately the specified
-    /// duration. The sleep is conservative and may take longer than requested.
-    ///
-    /// # Arguments
-    /// * `duration` - The minimum duration to sleep
-    /// * `stream` - The CUDA stream to launch the kernel on
-    ///
-    /// # Returns
-    /// Ok(()) if the kernel was successfully queued
-    pub fn launch(&self, duration: Duration, stream: &Arc<CudaStream>) -> Result<()> {
-        let target_cycles = (duration.as_millis() as f64 * self.cycles_per_ms) as u64;
-        Self::launch_kernel(&self.function, stream, target_cycles)
-    }
-}
--- a/lib/llm/Cargo.toml
+++ b/lib/llm/Cargo.toml
@@ -48,6 +48,7 @@ required-features = ["kv-router-stress"]

 [dependencies]
 # repo
+dynamo-config = { workspace = true }
 dynamo-kv-router = { workspace = true, features = ["metrics", "runtime-protocols"] }
 dynamo-memory = { workspace = true }
 dynamo-mocker = { workspace = true }
@@ -64,6 +65,7 @@ async-trait = { workspace = true }
 async-nats = { workspace = true }
 bytes = { workspace = true }
 chrono = { workspace = true }
+dashmap = { workspace = true }
 derive_builder = { workspace = true }
 either = { workspace = true }
 futures = { workspace = true }
@@ -89,14 +91,14 @@ uuid = { workspace = true }
 xxhash-rust = { workspace = true }
 modelexpress-client = { workspace = true }
 modelexpress-common = { workspace = true }
+
+
 bitflags = { version = "2.4", features = ["serde"] }
 blake3 = { version = "1.8", features = ["mmap", "rayon"] }
 bytemuck = "1.22"
-# candle-core = { version = "0.9.1" }
 derive-getters = "0.5"
 offset-allocator = "0.2"
 rayon = "1"
-dashmap = { version = "5.5.3" }
 bincode = { version = "2.0.1", features = ["serde", "derive"] }

 # lora

--- a/lib/llm/src/block_manager/controller.rs
+++ b/lib/llm/src/block_manager/controller.rs
@@ -98,7 +98,8 @@ pub enum ResetResponse {
 mod tests {
    use crate::tokens::Tokens;

-    use super::super::tests::create_reference_block_manager_with_counts;
+    use super::super::ReferenceBlockManager;
+    use super::super::tests::create_reference_block_manager_config_with_counts;
    use super::*;

    #[tokio::test]
@@ -110,9 +111,11 @@ mod tests {
            .await
            .unwrap();

-        let worker_id = drt.connection_id();
+        let worker_id = drt.connection_id() as i64;

-        let block_manager = create_reference_block_manager_with_counts(8, 16, 0).await;
+        let config = create_reference_block_manager_config_with_counts(8, 16, 0);
+        let block_manager: ReferenceBlockManager =
+            ReferenceBlockManager::new(config).await.unwrap();

        let component = drt
            .namespace("test-kvbm")

--- a/lib/llm/src/block_manager/distributed.rs
+++ b/lib/llm/src/block_manager/distributed.rs
@@ -60,10 +60,7 @@ mod tests {
    use anyhow::Result;
    use rstest::*;

-    use std::sync::{
-        Arc,
-        atomic::{AtomicUsize, Ordering},
-    };
+    use std::sync::Arc;
    use tokio_util::sync::CancellationToken;

    use dynamo_runtime::logging::init as init_logging;

--- a/lib/llm/src/block_manager/numa_allocator.rs
+++ b/lib/llm/src/block_manager/numa_allocator.rs
@@ -21,13 +21,10 @@ pub use dynamo_memory::numa::*;
 )]
 pub fn is_numa_enabled() -> bool {
    // Global kill switch always wins
-    if is_numa_disabled() {
+    if dynamo_memory::numa::is_numa_disabled() {
        return false;
    }
-    matches!(
-        std::env::var("DYN_KVBM_ENABLE_NUMA").as_deref(),
-        Ok("1" | "true" | "yes")
-    )
+    dynamo_config::env_is_truthy("DYN_KVBM_ENABLE_NUMA")
 }

 #[cfg(test)]

--- a/lib/llm/src/cuda.rs
+++ b/lib/llm/src/cuda.rs
@@ -84,7 +84,7 @@ impl DynamoCudaContextGuard {
    /// The caller must ensure the context is valid.
    pub unsafe fn new(context: CUcontext) -> Pin<Box<Self>> {
        // Push the context onto the CUDA context stack
-        let result = cuCtxPushCurrent_v2(context);
+        let result = unsafe { cuCtxPushCurrent_v2(context) };
        if result != cudaError_enum::CUDA_SUCCESS {
            panic!("Failed to push CUDA context: {:?}", result);
        }
@@ -192,7 +192,7 @@ impl DynamoCudaContextProvider for CudaContext {

 impl DynamoCudaContextProvider for CudaStream {
    unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
-        self.context().cu_context()
+        unsafe { self.context().cu_context() }
    }
 }


--- a/lib/memory/.claude/settings.local.json
+++ b/lib/memory/.claude/settings.local.json
+{
+  "permissions": {
+    "allow": [
+      "Bash(ls:*)",
+      "Bash(rustc --print sysroot:*)",
+      "Bash(cargo tree:*)",
+      "Bash(cargo metadata:*)",
+      "Bash(env:*)",
+      "Bash(ldd:*)",
+      "Bash(cargo test:*)"
+    ]
+  }
+}
--- a/lib/memory/Cargo.toml
+++ b/lib/memory/Cargo.toml
@@ -31,6 +31,7 @@ thiserror = { workspace = true }
 tracing = { workspace = true }

 libc = { version = "0.2" }
+libloading = "0.8"
 nix = { version = "0.30", features = ["fs"] }
 offset-allocator = "0.2"


--- a/lib/memory/bin/validate_numa_placement.rs
+++ b/lib/memory/bin/validate_numa_placement.rs
@@ -142,7 +142,7 @@ fn main() {
    println!("=======================");
    println!("GPUs:            {gpus:?}");
    println!("Alloc size:      {size_mib} MiB ({alloc_size} bytes)");
-    println!("NUMA disabled:   {}", dynamo_memory::is_numa_disabled());
+    println!("NUMA enabled:    {}", dynamo_memory::is_numa_enabled());
    println!();

    // Phase 1: Show GPU-to-NUMA mapping

--- a/lib/memory/src/lib.rs
+++ b/lib/memory/src/lib.rs
@@ -43,7 +43,7 @@ pub use device::DeviceStorage;
 pub use disk::DiskStorage;
 pub use external::ExternalDeviceMemory;
 #[cfg(target_os = "linux")]
-pub use numa::{NumaNode, is_numa_disabled};
+pub use numa::{NumaNode, is_numa_disabled, is_numa_enabled};
 pub use offset::OffsetBuffer;
 pub use pinned::PinnedStorage;
 pub use pool::{CudaMemPool, CudaMemPoolBuilder};

--- a/lib/memory/src/numa/mod.rs
+++ b/lib/memory/src/numa/mod.rs
@@ -25,10 +25,11 @@
 //! places pages on the correct node. If the GPU's NUMA node cannot be
 //! determined, allocation falls back to the non-NUMA path transparently.

+pub(crate) mod nvml;
 pub mod topology;
 pub mod worker_pool;

-use cudarc::driver::sys::CUdevice_attribute_enum;
+use cudarc::driver::{result::device as cuda_device, sys as cuda_sys};
 use nix::libc;
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
@@ -44,8 +45,13 @@ static NUMA_NODE_CACHE: OnceLock<Mutex<HashMap<String, Option<NumaNode>>>> = Onc
 ///
 /// NUMA-aware allocation is enabled by default. Set `DYN_MEMORY_DISABLE_NUMA=1`
 /// (or any truthy value) to disable it.
+pub fn is_numa_enabled() -> bool {
+    !crate::env_is_truthy("DYN_MEMORY_DISABLE_NUMA")
+}
+
+/// Convenience inverse of [`is_numa_enabled`].
 pub fn is_numa_disabled() -> bool {
-    crate::env_is_truthy("DYN_MEMORY_DISABLE_NUMA")
+    !is_numa_enabled()
 }

 /// Represents a NUMA node identifier.
@@ -99,32 +105,6 @@ pub fn get_current_cpu_numa_node() -> NumaNode {
    }
 }

-/// Format a PCI bus address from domain, bus, and device IDs.
-///
-/// Returns a string in the format `"DDDD:BB:DD.0"` suitable for sysfs lookups.
-fn format_pci_bus_address(domain: i32, bus: i32, device: i32) -> String {
-    format!("{:04x}:{:02x}:{:02x}.0", domain, bus, device)
-}
-
-/// Query the PCI bus address for a CUDA device from the CUDA driver API.
-///
-/// Uses `CudaContext::attribute()` to read PCI domain, bus, and device IDs.
-/// This transparently handles `CUDA_VISIBLE_DEVICES` remapping since
-/// `CudaContext::new(ordinal)` operates on the process-local device index.
-fn get_pci_bus_address_from_cuda(device_id: u32) -> Option<String> {
-    let ctx = crate::device::cuda_context(device_id).ok()?;
-    let domain = ctx
-        .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID)
-        .ok()?;
-    let bus = ctx
-        .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID)
-        .ok()?;
-    let device = ctx
-        .attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID)
-        .ok()?;
-    Some(format_pci_bus_address(domain, bus, device))
-}
-
 /// Read the NUMA node for a PCI device from sysfs.
 ///
 /// Reads `/sys/bus/pci/devices/<pci_address>/numa_node`. Returns `None` if the
@@ -274,6 +254,210 @@ pub fn pin_thread_to_numa_node(node: NumaNode) -> Result<(), String> {
    Ok(())
 }

+/// Get PCI bus address for a CUDA device via the CUDA driver API.
+///
+/// Returns a normalized PCI address string like "0000:3b:00.0".
+/// The device_id here is a CUDA ordinal (affected by CUDA_VISIBLE_DEVICES).
+fn get_pci_bus_address_from_cuda(device_id: u32) -> Option<String> {
+    // SAFETY: We're calling CUDA driver API functions with valid device ordinals.
+    // cuDeviceGet and get_attribute are safe as long as CUDA is initialized
+    // (which CudaContext::new handles).
+    unsafe {
+        let mut dev = std::mem::MaybeUninit::uninit();
+        if cuda_sys::cuDeviceGet(dev.as_mut_ptr(), device_id as i32)
+            .result()
+            .is_err()
+        {
+            return None;
+        }
+        let dev = dev.assume_init();
+
+        let domain = cuda_device::get_attribute(
+            dev,
+            cuda_sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
+        )
+        .ok()?;
+        let bus = cuda_device::get_attribute(
+            dev,
+            cuda_sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
+        )
+        .ok()?;
+        let device = cuda_device::get_attribute(
+            dev,
+            cuda_sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
+        )
+        .ok()?;
+
+        Some(format!("{:04x}:{:02x}:{:02x}.0", domain, bus, device))
+    }
+}
+
+/// GPU info with PCI address and NUMA node, used for CPU set subdivision.
+#[derive(Debug, Clone)]
+struct GpuTopoInfo {
+    pci_address: String,
+    numa_node: Option<u32>,
+}
+
+/// Enumerate all GPUs visible to CUDA with their PCI addresses and NUMA nodes.
+fn enumerate_cuda_gpus() -> Vec<GpuTopoInfo> {
+    let count = match cuda_device::get_count() {
+        Ok(c) => c,
+        Err(_) => return Vec::new(),
+    };
+
+    (0..count as u32)
+        .filter_map(|i| {
+            let pci = get_pci_bus_address_from_cuda(i)?;
+            let numa = read_numa_node_from_sysfs(&pci).map(|n| n.0);
+            Some(GpuTopoInfo {
+                pci_address: pci,
+                numa_node: numa,
+            })
+        })
+        .collect()
+}
+
+/// Enumerate all GPUs on the system, preferring NVML (sees all GPUs)
+/// over CUDA driver (only sees CUDA_VISIBLE_DEVICES).
+fn enumerate_all_gpus() -> Vec<GpuTopoInfo> {
+    // Try NVML first — it sees all GPUs regardless of CUDA_VISIBLE_DEVICES
+    if let Some(nvml) = nvml::try_nvml() {
+        let nvml_gpus = nvml.enumerate_gpus();
+        if !nvml_gpus.is_empty() {
+            tracing::debug!(
+                "NVML enumerated {} GPUs (ignoring CUDA_VISIBLE_DEVICES)",
+                nvml_gpus.len()
+            );
+            return nvml_gpus
+                .into_iter()
+                .map(|g| {
+                    let numa = read_numa_node_from_sysfs(&g.pci_address).map(|n| n.0);
+                    GpuTopoInfo {
+                        pci_address: g.pci_address,
+                        numa_node: numa,
+                    }
+                })
+                .collect();
+        }
+    }
+
+    // Fallback: enumerate via CUDA driver (may miss hidden devices)
+    tracing::debug!("Falling back to CUDA driver GPU enumeration");
+    enumerate_cuda_gpus()
+}
+
+/// Cached CPU set results per CUDA device ordinal.
+static DEVICE_CPU_SETS: OnceLock<HashMap<u32, Option<Vec<usize>>>> = OnceLock::new();
+
+/// Get a deterministic CPU subset for a CUDA device, subdivided among ALL GPUs
+/// sharing the same NUMA node (including those hidden by CUDA_VISIBLE_DEVICES).
+///
+/// # Algorithm
+/// 1. Get PCI address + NUMA node for target device (CUDA driver API)
+/// 2. Enumerate ALL GPUs on the system:
+///    - Try NVML first (sees all GPUs, ignores CUDA_VISIBLE_DEVICES)
+///    - Fall back to CUDA driver API (only sees visible devices)
+/// 3. For each GPU, get its NUMA node via sysfs (PCI address → /sys/.../numa_node)
+/// 4. Group GPUs by NUMA node
+/// 5. Sort by PCI address within each group (deterministic)
+/// 6. Get full CPU set for the node via topology
+/// 7. Divide into N equal slices (N = GPUs on same node)
+/// 8. Return the slice for the target device's position
+///
+/// # Example
+/// System: 8 GPUs, 2 NUMA nodes, 4 GPUs per node.
+/// CUDA_VISIBLE_DEVICES=0,1 (only 2 visible).
+/// NVML sees all 8 → correctly subdivides into 4 slices per node.
+///
+/// Returns None if NUMA node can't be determined.
+pub fn get_device_cpu_set(device_id: u32) -> Option<Vec<usize>> {
+    DEVICE_CPU_SETS
+        .get_or_init(compute_all_device_cpu_sets)
+        .get(&device_id)
+        .cloned()
+        .flatten()
+}
+
+fn compute_all_device_cpu_sets() -> HashMap<u32, Option<Vec<usize>>> {
+    let topology = match topology::get_numa_topology() {
+        Ok(t) => t,
+        Err(e) => {
+            tracing::warn!("Cannot subdivide CPU sets: {e}");
+            return HashMap::new();
+        }
+    };
+
+    // Get the target device's PCI address and NUMA node
+    let cuda_count = cuda_device::get_count().unwrap_or(0);
+    if cuda_count == 0 {
+        return HashMap::new();
+    }
+
+    // Build info for each visible CUDA device
+    let mut cuda_devices: Vec<(u32, String, Option<u32>)> = Vec::new();
+    for i in 0..cuda_count as u32 {
+        if let Some(pci) = get_pci_bus_address_from_cuda(i) {
+            let numa = read_numa_node_from_sysfs(&pci).map(|n| n.0);
+            cuda_devices.push((i, pci, numa));
+        }
+    }
+
+    // Enumerate ALL GPUs on the system (NVML preferred)
+    let all_gpus = enumerate_all_gpus();
+
+    // Group all GPUs by NUMA node
+    let mut node_groups: HashMap<u32, Vec<String>> = HashMap::new();
+    for gpu in &all_gpus {
+        if let Some(node) = gpu.numa_node {
+            node_groups
+                .entry(node)
+                .or_default()
+                .push(gpu.pci_address.clone());
+        }
+    }
+
+    // Sort each group by PCI address for deterministic ordering
+    for group in node_groups.values_mut() {
+        group.sort();
+    }
+
+    // For each CUDA device, find its position in its NUMA group and subdivide
+    let mut results = HashMap::new();
+    for (device_id, pci_addr, numa_node) in &cuda_devices {
+        let cpu_set = numa_node.and_then(|node| {
+            let group = node_groups.get(&node)?;
+            let position = group.iter().position(|addr| addr == pci_addr)?;
+            let all_cpus = topology.cpus_for_node(node)?;
+
+            if all_cpus.is_empty() || group.is_empty() {
+                return None;
+            }
+
+            // Divide CPUs into N equal slices
+            let n = group.len();
+            let chunk_size = all_cpus.len() / n;
+            if chunk_size == 0 {
+                // More GPUs than CPUs on this node — give all CPUs to everyone
+                return Some(all_cpus.to_vec());
+            }
+
+            let start = position * chunk_size;
+            let end = if position == n - 1 {
+                all_cpus.len() // last slice gets remainder
+            } else {
+                start + chunk_size
+            };
+
+            Some(all_cpus[start..end].to_vec())
+        });
+
+        results.insert(*device_id, cpu_set);
+    }
+
+    results
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -345,14 +529,6 @@ mod tests {
        assert_eq!(node2, node3);
    }

-    #[test]
-    fn test_format_pci_bus_address() {
-        assert_eq!(format_pci_bus_address(0, 0, 0), "0000:00:00.0");
-        assert_eq!(format_pci_bus_address(0, 0x3b, 0), "0000:3b:00.0");
-        assert_eq!(format_pci_bus_address(0, 0xaf, 0), "0000:af:00.0");
-        assert_eq!(format_pci_bus_address(0x10, 0x1a, 0x03), "0010:1a:03.0");
-    }
-
    #[test]
    fn test_read_numa_node_from_sysfs_nonexistent() {
        assert!(read_numa_node_from_sysfs("ffff:ff:ff.0").is_none());

--- a/lib/memory/src/numa/nvml.rs
+++ b/lib/memory/src/numa/nvml.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Minimal NVML FFI via dlopen.
+//!
+//! Dynamically loads `libnvidia-ml.so.1` to enumerate ALL GPUs on the system,
+//! regardless of `CUDA_VISIBLE_DEVICES`. This is critical for CPU set subdivision:
+//! when multiple GPUs share a NUMA node, we need to know about ALL siblings to
+//! divide CPU cores fairly.
+//!
+//! If NVML is unavailable (e.g., in containers without the management library),
+//! callers fall back to CUDA driver enumeration (which only sees visible devices).
+
+use libloading::{Library, Symbol};
+use std::sync::OnceLock;
+
+/// PCI info struct matching NVML's `nvmlPciInfo_t`.
+#[repr(C)]
+struct NvmlPciInfo {
+    bus_id_legacy: [u8; 16], // "DDDD:BB:DD.F\0" (legacy, 16 chars)
+    domain: u32,
+    bus: u32,
+    device: u32,
+    pci_device_id: u32,
+    pci_subsystem_id: u32,
+    bus_id: [u8; 32], // "DDDD:BB:DD.F\0" (full, 32 chars)
+}
+
+/// GPU info from NVML enumeration.
+#[derive(Debug, Clone)]
+pub(crate) struct NvmlGpuInfo {
+    /// PCI bus address, e.g. "0000:3b:00.0"
+    pub pci_address: String,
+}
+
+// NVML return codes
+const NVML_SUCCESS: u32 = 0;
+
+/// Minimal NVML handle. Sees ALL GPUs regardless of CUDA_VISIBLE_DEVICES.
+pub(crate) struct NvmlHandle {
+    _lib: Library,
+    device_get_count: unsafe extern "C" fn(*mut u32) -> u32,
+    device_get_handle_by_index: unsafe extern "C" fn(u32, *mut u64) -> u32,
+    device_get_pci_info: unsafe extern "C" fn(u64, *mut NvmlPciInfo) -> u32,
+    shutdown: unsafe extern "C" fn() -> u32,
+}
+
+// SAFETY: NVML functions are thread-safe per NVML documentation
+unsafe impl Send for NvmlHandle {}
+unsafe impl Sync for NvmlHandle {}
+
+impl NvmlHandle {
+    /// Try to load NVML. Returns None if libnvidia-ml.so.1 is not available.
+    pub fn try_load() -> Option<Self> {
+        // SAFETY: We are loading a well-known system library and resolving documented
+        // NVML API symbols. The library is kept alive for the lifetime of NvmlHandle.
+        unsafe {
+            let lib = Library::new("libnvidia-ml.so.1").ok()?;
+
+            // Initialize NVML
+            let init: Symbol<unsafe extern "C" fn() -> u32> = lib.get(b"nvmlInit_v2\0").ok()?;
+            if init() != NVML_SUCCESS {
+                tracing::warn!("nvmlInit_v2 failed");
+                return None;
+            }
+
+            let device_get_count: Symbol<unsafe extern "C" fn(*mut u32) -> u32> =
+                lib.get(b"nvmlDeviceGetCount_v2\0").ok()?;
+            let device_get_handle_by_index: Symbol<unsafe extern "C" fn(u32, *mut u64) -> u32> =
+                lib.get(b"nvmlDeviceGetHandleByIndex_v2\0").ok()?;
+            let device_get_pci_info: Symbol<unsafe extern "C" fn(u64, *mut NvmlPciInfo) -> u32> =
+                lib.get(b"nvmlDeviceGetPciInfo_v3\0").ok()?;
+            let shutdown: Symbol<unsafe extern "C" fn() -> u32> =
+                lib.get(b"nvmlShutdown\0").ok()?;
+
+            Some(Self {
+                device_get_count: *device_get_count,
+                device_get_handle_by_index: *device_get_handle_by_index,
+                device_get_pci_info: *device_get_pci_info,
+                shutdown: *shutdown,
+                _lib: lib,
+            })
+        }
+    }
+
+    /// Enumerate ALL GPUs on the system with PCI addresses.
+    pub fn enumerate_gpus(&self) -> Vec<NvmlGpuInfo> {
+        let mut count: u32 = 0;
+        // SAFETY: NVML API call with valid pointer
+        unsafe {
+            if (self.device_get_count)(&mut count) != NVML_SUCCESS {
+                tracing::warn!("nvmlDeviceGetCount_v2 failed");
+                return Vec::new();
+            }
+        }
+
+        let mut gpus = Vec::with_capacity(count as usize);
+        for i in 0..count {
+            let mut handle: u64 = 0;
+            // SAFETY: NVML API call with valid index and pointer
+            unsafe {
+                if (self.device_get_handle_by_index)(i, &mut handle) != NVML_SUCCESS {
+                    tracing::warn!("nvmlDeviceGetHandleByIndex_v2 failed for index {i}");
+                    continue;
+                }
+            }
+
+            let mut pci_info = std::mem::MaybeUninit::<NvmlPciInfo>::zeroed();
+            // SAFETY: NVML API call with valid handle and pointer to zeroed struct
+            unsafe {
+                if (self.device_get_pci_info)(handle, pci_info.as_mut_ptr()) != NVML_SUCCESS {
+                    tracing::warn!("nvmlDeviceGetPciInfo_v3 failed for index {i}");
+                    continue;
+                }
+                let pci_info = pci_info.assume_init();
+
+                // Parse bus_id field: "DDDD:BB:DD.F\0" padded with zeros
+                let bus_id = &pci_info.bus_id;
+                let len = bus_id.iter().position(|&b| b == 0).unwrap_or(bus_id.len());
+                let pci_address = std::str::from_utf8(&bus_id[..len])
+                    .unwrap_or("")
+                    .to_lowercase();
+
+                if !pci_address.is_empty() {
+                    gpus.push(NvmlGpuInfo { pci_address });
+                }
+            }
+        }
+
+        gpus
+    }
+}
+
+impl Drop for NvmlHandle {
+    fn drop(&mut self) {
+        // SAFETY: Matching nvmlInit_v2 call from try_load
+        unsafe {
+            (self.shutdown)();
+        }
+    }
+}
+
+/// Cached NVML load attempt (None = tried and failed).
+static NVML: OnceLock<Option<NvmlHandle>> = OnceLock::new();
+
+/// Get a reference to the global NVML handle, if available.
+pub(crate) fn try_nvml() -> Option<&'static NvmlHandle> {
+    NVML.get_or_init(|| {
+        let handle = NvmlHandle::try_load();
+        if handle.is_some() {
+            tracing::debug!("NVML loaded successfully");
+        } else {
+            tracing::debug!("NVML not available, will fall back to CUDA driver enumeration");
+        }
+        handle
+    })
+    .as_ref()
+}
--- a/lib/memory/src/pinned.rs
+++ b/lib/memory/src/pinned.rs
@@ -127,7 +127,7 @@ impl PinnedStorage {
        // Try NUMA-aware allocation unless explicitly disabled
        #[cfg(target_os = "linux")]
        let numa_ptr = if let Some(gpu_id) = device_id {
-            if !super::numa::is_numa_disabled() {
+            if super::numa::is_numa_enabled() {
                match super::numa::worker_pool::NumaWorkerPool::global()
                    .allocate_pinned_for_gpu(len, gpu_id)
                {

--- a/lib/runtime/examples/Cargo.lock
+++ b/lib/runtime/examples/Cargo.lock
@@ -47,9 +47,9 @@ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"

 [[package]]
 name = "arc-swap"
-version = "1.9.0"
+version = "1.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6"
+checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
 dependencies = [
 "rustversion",
 ]
@@ -979,9 +979,9 @@ dependencies = [

 [[package]]
 name = "fastrand"
-version = "2.3.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"

 [[package]]
 name = "fiat-crypto"

--- a/lib/runtime/src/logging.rs
+++ b/lib/runtime/src/logging.rs
@@ -955,7 +955,7 @@ pub fn init() {
 }

 #[cfg(feature = "tokio-console")]
-fn setup_logging() {
+fn setup_logging() -> Result<(), Box<dyn std::error::Error>> {
    let tokio_console_layer = console_subscriber::ConsoleLayer::builder()
        .with_default_env()
        .server_addr(([0, 0, 0, 0], console_subscriber::Server::DEFAULT_PORT))
@@ -973,6 +973,7 @@ fn setup_logging() {
        .with(l)
        .with(tokio_console_layer.with_filter(tokio_console_target))
        .init();
+    Ok(())
 }

 #[cfg(not(feature = "tokio-console"))]