fix: fixing the NUMA sensitivity problem in KVBM for TP=1 (#3700)

Signed-off-by: Olga Andreeva <oandreeva@nvidia.com>

fix: fixing the NUMA sensitivity problem in KVBM for TP=1 (#3700)
Signed-off-by: Olga Andreeva <oandreeva@nvidia.com>
67d27bcc · Olga Andreeva · GitHub · f6ed01b1 · 67d27bcc · 67d27bcc
Unverified Commit 67d27bcc authored Oct 20, 2025 by Olga Andreeva Committed by GitHub Oct 20, 2025
5 changed files
--- a/lib/llm/src/block_manager.rs
+++ b/lib/llm/src/block_manager.rs
@@ -16,6 +16,7 @@ pub mod distributed;
 pub mod events;
 pub mod layout;
 pub mod metrics_kvbm;
+pub mod numa_allocator;
 pub mod offload;
 pub mod pool;
 pub mod storage;

--- a/lib/llm/src/block_manager/numa_allocator.rs
+++ b/lib/llm/src/block_manager/numa_allocator.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+pub mod topology;
+pub mod worker_pool;
+use nix::libc;
+use serde::{Deserialize, Serialize};
+use std::{mem, process::Command};
+/// Report whether NUMA-aware allocation has been switched on.
+///
+/// The feature is opt-in and controlled by the `DYN_KVBM_ENABLE_NUMA` environment
+/// variable: the exact value `1`, or `true` in any letter case, enables it. An unset
+/// variable, a value that cannot be read as Unicode, or any other value leaves it
+/// disabled.
+pub fn is_numa_enabled() -> bool {
+    match std::env::var("DYN_KVBM_ENABLE_NUMA") {
+        Ok(value) => value == "1" || value.to_lowercase() == "true",
+        Err(_) => false,
+    }
+}
+/// Identifier of a NUMA node, wrapping the raw node index.
+///
+/// The value `u32::MAX` is reserved as a sentinel meaning "node could not be
+/// determined"; see [`NumaNode::UNKNOWN`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct NumaNode(pub u32);
+impl NumaNode {
+    /// Sentinel used when the NUMA node is not known.
+    pub const UNKNOWN: Self = Self(u32::MAX);
+    /// Returns `true` when this value is the [`NumaNode::UNKNOWN`] sentinel.
+    pub fn is_unknown(&self) -> bool {
+        *self == Self::UNKNOWN
+    }
+}
+/// Human-readable rendering: `UNKNOWN` for the sentinel value, `NumaNode(<index>)`
+/// for every other node.
+impl std::fmt::Display for NumaNode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self.0 {
+            u32::MAX => f.write_str("UNKNOWN"),
+            index => write!(f, "NumaNode({})", index),
+        }
+    }
+}
+/// Look up the NUMA node of the CPU the calling thread is currently running on.
+///
+/// Issues the raw Linux `getcpu` syscall
+/// (`int getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)`) and
+/// returns the node it reports. Returns `NumaNode::UNKNOWN` when the syscall does
+/// not return 0.
+pub fn get_current_cpu_numa_node() -> NumaNode {
+    let mut cpu_index: libc::c_uint = 0;
+    let mut node_index: libc::c_uint = 0;
+    // SAFETY: both out-pointers refer to live, writable locals that outlive the call,
+    // and the third (`tcache`) argument is passed as NULL.
+    let status = unsafe {
+        libc::syscall(
+            libc::SYS_getcpu,
+            &mut cpu_index,
+            &mut node_index,
+            std::ptr::null_mut::<libc::c_void>(),
+        )
+    };
+    if status != 0 {
+        return NumaNode::UNKNOWN;
+    }
+    NumaNode(node_index)
+}
+/// Determine the NUMA node closest to a GPU.
+///
+/// Runs `nvidia-smi topo --get-numa-id-of-nearby-cpu -i <device_id>` and parses the
+/// number that follows the first `:` on the first line of its output.
+///
+/// Fallbacks:
+/// - if `nvidia-smi` cannot be started or exits unsuccessfully, the heuristic
+///   `device_id % 2` is returned;
+/// - if the command succeeds but its output cannot be parsed,
+///   `NumaNode::UNKNOWN` is returned.
+pub fn get_device_numa_node(device_id: u32) -> NumaNode {
+    let gpu_index = device_id.to_string();
+    let query = Command::new("nvidia-smi")
+        .args([
+            "topo",
+            "--get-numa-id-of-nearby-cpu",
+            "-i",
+            gpu_index.as_str(),
+        ])
+        .output();
+    let output = match query {
+        Ok(out) if out.status.success() => out,
+        _ => {
+            tracing::warn!("nvidia-smi failed for GPU {}, using heuristic", device_id);
+            return NumaNode(device_id % 2);
+        }
+    };
+    // Take the first output line, keep the field after the first ':' and read it as
+    // an unsigned integer; any step that fails yields `None`.
+    let parsed = std::str::from_utf8(&output.stdout)
+        .ok()
+        .and_then(|text| text.lines().next())
+        .and_then(|first_line| first_line.split(':').nth(1))
+        .and_then(|field| field.trim().parse::<u32>().ok());
+    match parsed {
+        Some(node) => {
+            tracing::trace!("GPU {} on NUMA node {}", device_id, node);
+            NumaNode(node)
+        }
+        None => {
+            tracing::warn!("Failed to get NUMA node for GPU {}", device_id);
+            NumaNode::UNKNOWN
+        }
+    }
+}
+/// Pin the current thread to a specific NUMA node's CPUs
+///
+/// Sets the CPU affinity of the calling thread so that it only runs on CPUs that
+/// the cached NUMA topology lists for `node`. This matters for callers that rely on
+/// the first-touch policy to place memory on that node.
+///
+/// # Errors
+///
+/// Returns a descriptive `String` when:
+/// - `node` is `NumaNode::UNKNOWN`;
+/// - the NUMA topology cannot be obtained;
+/// - the topology lists no CPUs for `node`;
+/// - one of the node's CPU ids does not fit into a `cpu_set_t`;
+/// - `sched_setaffinity` fails (the OS error is included in the message).
+pub fn pin_thread_to_numa_node(node: NumaNode) -> Result<(), String> {
+    if node.is_unknown() {
+        return Err("Cannot pin thread: NUMA node is unknown".to_string());
+    }
+    let topology =
+        topology::get_numa_topology().map_err(|e| format!("Can not get NUMA topology: {}", e))?;
+    let cpus = topology
+        .cpus_for_node(node.0)
+        .filter(|cpus| !cpus.is_empty())
+        .ok_or_else(|| format!("No CPUs found for NUMA node {}", node.0))?;
+    // `cpu_set_t` is a fixed-size bitmask. Reject ids beyond its capacity up front so
+    // that `CPU_SET` is never asked to index outside of it.
+    let capacity = libc::CPU_SETSIZE as usize;
+    if let Some(&cpu) = cpus.iter().find(|&&cpu| cpu >= capacity) {
+        return Err(format!(
+            "CPU {} of NUMA node {} does not fit into cpu_set_t (capacity {})",
+            cpu, node.0, capacity
+        ));
+    }
+    // SAFETY: `cpu_set_t` is a plain bitmask for which all-zero bytes are a valid
+    // (empty) value, every id passed to `CPU_SET` was checked against `CPU_SETSIZE`
+    // above, and `sched_setaffinity` only reads the set for the given size.
+    let result = unsafe {
+        let mut cpu_set: libc::cpu_set_t = mem::zeroed();
+        for &cpu in cpus {
+            libc::CPU_SET(cpu, &mut cpu_set);
+        }
+        libc::sched_setaffinity(
+            0, // current thread
+            mem::size_of::<libc::cpu_set_t>(),
+            &cpu_set,
+        )
+    };
+    if result != 0 {
+        let err = std::io::Error::last_os_error();
+        return Err(format!("Failed to set CPU affinity: {}", err));
+    }
+    Ok(())
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_numa_node_equality() {
+        let node0a = NumaNode(0);
+        let node0b = NumaNode(0);
+        let node1 = NumaNode(1);
+        assert_eq!(node0a, node0b);
+        assert_ne!(node0a, node1);
+    }
+    #[test]
+    fn test_numa_node_unknown() {
+        let unknown = NumaNode::UNKNOWN;
+        assert!(unknown.is_unknown());
+        assert_eq!(unknown.0, u32::MAX);
+        let valid = NumaNode(0);
+        assert!(!valid.is_unknown());
+    }
+    #[test]
+    fn test_numa_node_display() {
+        assert_eq!(format!("{}", NumaNode(0)), "NumaNode(0)");
+        assert_eq!(format!("{}", NumaNode(7)), "NumaNode(7)");
+        assert_eq!(format!("{}", NumaNode::UNKNOWN), "UNKNOWN");
+    }
+    #[test]
+    fn test_numa_node_serialization() {
+        // Verify NumaNode survives a JSON round trip (important for benchmarking)
+        let node = NumaNode(1);
+        let json = serde_json::to_string(&node).unwrap();
+        let deserialized: NumaNode = serde_json::from_str(&json).unwrap();
+        assert_eq!(node, deserialized);
+    }
+    #[test]
+    fn test_get_current_cpu_numa_node() {
+        // Should either return a valid node or UNKNOWN
+        let node = get_current_cpu_numa_node();
+        if node.is_unknown() {
+            return;
+        }
+        // The number of NUMA nodes differs widely between machines, so validate the
+        // result against the topology read from sysfs rather than a fixed upper
+        // bound. When sysfs is unavailable there is nothing to compare against.
+        if let Ok(topo) = topology::get_numa_topology() {
+            assert!(
+                topo.cpus_for_node(node.0).is_some(),
+                "getcpu reported NUMA node {} which is absent from the sysfs topology",
+                node.0
+            );
+        }
+    }
+    #[test]
+    fn test_get_device_numa_node_valid_gpu() {
+        // Smoke test only: the outcome depends on the host (nvidia-smi result,
+        // the `gpu_id % 2` heuristic, or UNKNOWN), so the call just has to return.
+        let node = get_device_numa_node(0);
+        println!("GPU 0 detected on NUMA node: {}", node.0);
+    }
+    #[test]
+    fn test_numa_node_hash() {
+        // Verify NumaNode can be used as a HashMap key
+        use std::collections::HashMap;
+        let mut map = HashMap::new();
+        map.insert(NumaNode(0), "node0");
+        map.insert(NumaNode(1), "node1");
+        assert_eq!(map.get(&NumaNode(0)), Some(&"node0"));
+        assert_eq!(map.get(&NumaNode(1)), Some(&"node1"));
+        assert_eq!(map.get(&NumaNode(2)), None);
+    }
+    #[test]
+    fn test_numa_node_copy_clone() {
+        // Verify NumaNode is Copy and Clone
+        let node1 = NumaNode(5);
+        let node2 = node1; // Copy
+        // The explicit call is the point of this test: it exercises the Clone impl.
+        #[allow(clippy::clone_on_copy)]
+        let node3 = node1.clone(); // Clone
+        assert_eq!(node1, node2);
+        assert_eq!(node1, node3);
+        assert_eq!(node2, node3);
+    }
+}
--- a/lib/llm/src/block_manager/numa_allocator/topology.rs
+++ b/lib/llm/src/block_manager/numa_allocator/topology.rs
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//! NUMA topology detection
+//!
+//! This module provides utilities to read the actual CPU-to-NUMA mapping from the system,
+//! replacing heuristic assumptions with real topology data.
+use std::collections::HashMap;
+use std::fs;
+/// Global cached topology
+///
+/// Filled at most once, by `get_numa_topology`, with the outcome of
+/// `NumaTopology::from_sysfs`. The outcome is stored as a `Result`, so a failed
+/// detection is cached as well and sysfs is not read again on later calls.
+static TOPOLOGY: std::sync::OnceLock<Result<NumaTopology, String>> = std::sync::OnceLock::new();
+/// Represents the CPU topology for NUMA nodes
+///
+/// The same information is held in both directions so that either lookup is a
+/// single map access. `Debug` is derived so that the type can be logged and so that
+/// `Result` helpers which require a debuggable `Ok` value (such as `unwrap_err`)
+/// work on results carrying a topology.
+#[derive(Debug)]
+pub struct NumaTopology {
+    /// Maps NUMA node ID -> list of CPU IDs
+    node_to_cpus: HashMap<u32, Vec<usize>>,
+    /// Maps CPU ID -> NUMA node ID
+    cpu_to_node: HashMap<usize, u32>,
+}
+impl NumaTopology {
+    /// Read NUMA topology from sysfs
+    ///
+    /// Scans `/sys/devices/system/node` for `node<N>` entries and reads each entry's
+    /// `cpulist` file. Entries whose name is not `node` followed by a number, and
+    /// node entries without a `cpulist` file, are skipped. A node whose `cpulist` is
+    /// empty (a node without CPUs) is recorded with an empty CPU list instead of
+    /// failing the whole detection.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the node directory is missing or unreadable, when a
+    /// `cpulist` file cannot be read or parsed, or when no node was found at all.
+    pub fn from_sysfs() -> Result<Self, String> {
+        let node_dir = std::path::Path::new("/sys/devices/system/node");
+        if !node_dir.exists() {
+            return Err("Node directory not found".to_string());
+        }
+        let entries =
+            fs::read_dir(node_dir).map_err(|e| format!("Failed to read node directory: {}", e))?;
+        let mut node_to_cpus: HashMap<u32, Vec<usize>> = HashMap::new();
+        let mut cpu_to_node: HashMap<usize, u32> = HashMap::new();
+        for entry in entries.flatten() {
+            let path = entry.path();
+            // Only process "nodeN" directories; anything else in this directory is
+            // not a NUMA node and is ignored.
+            let Some(node_id) = path
+                .file_name()
+                .and_then(|n| n.to_str())
+                .and_then(|n| n.strip_prefix("node"))
+                .and_then(|n| n.parse::<u32>().ok())
+            else {
+                continue;
+            };
+            // Read cpulist file
+            let cpulist_path = path.join("cpulist");
+            if !cpulist_path.exists() {
+                continue;
+            }
+            let cpulist = fs::read_to_string(&cpulist_path)
+                .map_err(|e| format!("Failed to read {}: {}", cpulist_path.display(), e))?;
+            let cpulist = cpulist.trim();
+            // A node without CPUs has an empty cpulist; keep the node, with no CPUs.
+            let cpus = if cpulist.is_empty() {
+                Vec::new()
+            } else {
+                parse_cpulist(cpulist)?
+            };
+            // Populate both maps
+            cpu_to_node.extend(cpus.iter().map(|&cpu| (cpu, node_id)));
+            node_to_cpus.insert(node_id, cpus);
+        }
+        if node_to_cpus.is_empty() {
+            return Err("No NUMA nodes found".to_string());
+        }
+        Ok(Self {
+            node_to_cpus,
+            cpu_to_node,
+        })
+    }
+    /// Get all CPUs for a NUMA node
+    ///
+    /// Returns `None` when the node is not part of the topology. The returned slice
+    /// may be empty for a node that has no CPUs.
+    pub fn cpus_for_node(&self, node_id: u32) -> Option<&[usize]> {
+        self.node_to_cpus.get(&node_id).map(Vec::as_slice)
+    }
+    /// Get NUMA node for a CPU
+    ///
+    /// Returns `None` when the CPU is not listed under any node.
+    pub fn node_for_cpu(&self, cpu_id: usize) -> Option<u32> {
+        self.cpu_to_node.get(&cpu_id).copied()
+    }
+    /// Get number of NUMA nodes
+    pub fn num_nodes(&self) -> usize {
+        self.node_to_cpus.len()
+    }
+    /// Check if single-node system
+    pub fn is_single_node(&self) -> bool {
+        self.num_nodes() == 1
+    }
+}
+/// Parse Linux cpulist format
+///
+/// Examples:
+///   "0-15"        -> [0,1,2,...,15]
+///   "0,4,8"       -> [0,4,8]
+///   "0-3,8-11"    -> [0,1,2,3,8,9,10,11]
+///   ""            -> []
+///
+/// Whitespace around the list and around individual ids is ignored. The result is
+/// sorted in ascending order and contains no duplicates.
+///
+/// # Errors
+///
+/// Returns an error for an id that is not an unsigned integer, for a range with
+/// more than one `-`, and for a range whose start is greater than its end.
+fn parse_cpulist(cpulist: &str) -> Result<Vec<usize>, String> {
+    let cpulist = cpulist.trim();
+    // An empty list describes a node without CPUs; that is not an error.
+    if cpulist.is_empty() {
+        return Ok(Vec::new());
+    }
+    let parse_id = |text: &str| -> Result<usize, String> {
+        text.trim()
+            .parse::<usize>()
+            .map_err(|_| format!("Invalid CPU ID: {}", text))
+    };
+    let mut cpus = Vec::new();
+    for part in cpulist.split(',') {
+        match part.split_once('-') {
+            // More than one '-' in a single element, e.g. "0-1-2"
+            Some((_, last)) if last.contains('-') => {
+                return Err(format!("Invalid range: {}", part));
+            }
+            // Range: "0-15"
+            Some((first, last)) => {
+                let start = parse_id(first)?;
+                let end = parse_id(last)?;
+                // A reversed range would otherwise silently contribute no CPUs.
+                if start > end {
+                    return Err(format!("Invalid range: {}", part));
+                }
+                cpus.extend(start..=end);
+            }
+            // Single CPU
+            None => cpus.push(parse_id(part)?),
+        }
+    }
+    cpus.sort_unstable();
+    cpus.dedup();
+    Ok(cpus)
+}
+/// Get the global NUMA topology (cached after first call)
+///
+/// The first call reads the topology from sysfs; its outcome, success or failure,
+/// is cached and returned by every later call. A failure is logged once, at the
+/// time it is detected, instead of on every call.
+///
+/// Returns an error if NUMA topology cannot be read from sysfs. This indicates either:
+/// - System doesn't support NUMA
+/// - `/sys` is not mounted (e.g., restricted container)
+/// - Kernel NUMA support is disabled
+///
+/// Callers should handle errors gracefully by disabling NUMA optimizations.
+pub fn get_numa_topology() -> Result<&'static NumaTopology, &'static str> {
+    TOPOLOGY
+        .get_or_init(|| {
+            let detected = NumaTopology::from_sysfs();
+            if let Err(reason) = &detected {
+                tracing::warn!("NUMA topology unavailable: {}", reason);
+            }
+            detected
+        })
+        .as_ref()
+        .map_err(|_| "NUMA topology unavailable")
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Parse a cpulist that the test expects to be valid.
+    fn parsed(list: &str) -> Vec<usize> {
+        parse_cpulist(list).unwrap()
+    }
+    /// Length plus the elements at positions 0, 15, 16 and 31 of a 32-CPU list.
+    fn probe(cpus: &[usize]) -> (usize, usize, usize, usize, usize) {
+        assert_eq!(cpus.len(), 32);
+        (cpus.len(), cpus[0], cpus[15], cpus[16], cpus[31])
+    }
+    #[test]
+    fn test_parse_cpulist_range() {
+        assert_eq!(parsed("0-3"), vec![0, 1, 2, 3]);
+    }
+    #[test]
+    fn test_parse_cpulist_list() {
+        assert_eq!(parsed("0,4,8"), vec![0, 4, 8]);
+    }
+    #[test]
+    fn test_parse_cpulist_mixed() {
+        assert_eq!(parsed("0-2,8,16-17"), vec![0, 1, 2, 8, 16, 17]);
+    }
+    #[test]
+    fn test_parse_cpulist_ht() {
+        // Hyperthreading layout: physical cores 0-15, HT siblings 32-47
+        let cpus = parsed("0-15,32-47");
+        assert_eq!(probe(&cpus), (32, 0, 15, 32, 47));
+    }
+    #[test]
+    fn test_parse_cpulist_real_numa_system() {
+        // Pattern taken from a dual-socket system with hyperthreading
+        // Node 0: CPUs 0-15, 128-143
+        let node0 = parsed("0-15,128-143");
+        assert_eq!(probe(&node0), (32, 0, 15, 128, 143));
+        // Node 1: CPUs 16-31, 144-159
+        let node1 = parsed("16-31,144-159");
+        assert_eq!(probe(&node1), (32, 16, 31, 144, 159));
+    }
+    #[test]
+    fn test_parse_cpulist_out_of_order() {
+        // Out-of-order input must come back sorted
+        assert_eq!(parsed("4,2,0,1,3"), vec![0, 1, 2, 3, 4]);
+    }
+    #[test]
+    fn test_parse_cpulist_duplicates() {
+        // Overlapping ranges must be de-duplicated
+        assert_eq!(parsed("0-2,1-3"), vec![0, 1, 2, 3]);
+    }
+    #[test]
+    fn test_parse_cpulist_empty() {
+        // Edge case: an empty cpulist is either rejected or yields no CPUs
+        match parse_cpulist("") {
+            Err(_) => {}
+            Ok(cpus) => assert!(cpus.is_empty()),
+        }
+    }
+    #[test]
+    fn test_parse_cpulist_single_cpu() {
+        // Single CPU node (uncommon but valid)
+        assert_eq!(parsed("5"), vec![5]);
+    }
+    #[test]
+    fn test_topology_bidirectional_lookup() {
+        // Build both maps from one description so they are consistent by construction
+        let layout: [(u32, Vec<usize>); 2] = [(0, vec![0, 1, 2, 3]), (1, vec![4, 5, 6, 7])];
+        let mut node_to_cpus = std::collections::HashMap::new();
+        let mut cpu_to_node = std::collections::HashMap::new();
+        for (node, cpus) in layout {
+            for &cpu in &cpus {
+                cpu_to_node.insert(cpu, node);
+            }
+            node_to_cpus.insert(node, cpus);
+        }
+        let topology = NumaTopology {
+            node_to_cpus,
+            cpu_to_node,
+        };
+        // Forward lookup (node -> cpus)
+        assert_eq!(topology.cpus_for_node(0), Some(&[0, 1, 2, 3][..]));
+        assert_eq!(topology.cpus_for_node(1), Some(&[4, 5, 6, 7][..]));
+        // Reverse lookup (cpu -> node)
+        for (cpu, node) in [(0, 0), (3, 0), (4, 1), (7, 1)] {
+            assert_eq!(topology.node_for_cpu(cpu), Some(node));
+        }
+        // Unknown CPU
+        assert_eq!(topology.node_for_cpu(999), None);
+    }
+}
--- a/lib/llm/src/block_manager/numa_allocator/worker_pool.rs
+++ b/lib/llm/src/block_manager/numa_allocator/worker_pool.rs
--- a/lib/llm/src/block_manager/storage/cuda.rs
+++ b/lib/llm/src/block_manager/storage/cuda.rs
@@ -76,6 +76,8 @@ use std::{
 use cudarc::driver::{CudaContext, sys};
+use crate::block_manager::numa_allocator;
 /// Trait for [Storage] types that can be accessed by CUDA
 pub trait CudaAccessible: Storage {}
@@ -176,10 +178,27 @@ impl PinnedStorage {
        unsafe {
            ctx.bind_to_thread().map_err(StorageError::Cuda)?;
-            let ptr = cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED)
+            // Try NUMA-aware allocation if enabled, otherwise use direct allocation
-                .map_err(StorageError::Cuda)?;
+            let ptr = if numa_allocator::is_numa_enabled() {
+                let device_id = ctx.cu_device() as u32;
+                match numa_allocator::worker_pool::NumaWorkerPool::global()
+                    .allocate_pinned_for_gpu(size, device_id)
+                {
+                    Ok(ptr) => ptr,
+                    Err(e) => {
+                        tracing::warn!("NUMA allocation failed: {}, using direct allocation", e);
+                        cudarc::driver::result::malloc_host(
+                            size,
+                            sys::CU_MEMHOSTALLOC_WRITECOMBINED,
+                        )
+                        .map_err(StorageError::Cuda)? as *mut u8
+                    }
+                }
+            } else {
+                cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED)
+                    .map_err(StorageError::Cuda)? as *mut u8
+            };
-            let ptr = ptr as *mut u8;
            assert!(!ptr.is_null(), "Failed to allocate pinned memory");
            assert!(ptr.is_aligned(), "Pinned memory is not aligned");
            assert!(size < isize::MAX as usize);