Unverified commit 67d27bcc, authored by Olga Andreeva and committed by GitHub
Browse files

fix: fixing the NUMA sensitivity problem in KVBM for TP=1 (#3700)


Signed-off-by: Olga Andreeva <oandreeva@nvidia.com>
parent f6ed01b1
...@@ -16,6 +16,7 @@ pub mod distributed; ...@@ -16,6 +16,7 @@ pub mod distributed;
pub mod events; pub mod events;
pub mod layout; pub mod layout;
pub mod metrics_kvbm; pub mod metrics_kvbm;
pub mod numa_allocator;
pub mod offload; pub mod offload;
pub mod pool; pub mod pool;
pub mod storage; pub mod storage;
......
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
pub mod topology;
pub mod worker_pool;
use nix::libc;
use serde::{Deserialize, Serialize};
use std::{mem, process::Command};
/// Report whether NUMA-aware allocation has been switched on.
///
/// The feature is opt-in and driven by the `DYN_KVBM_ENABLE_NUMA` environment
/// variable: the exact value `1`, or `true` in any letter case, enables it.
/// Every other value — and a variable that cannot be read at all — leaves it
/// disabled.
pub fn is_numa_enabled() -> bool {
    match std::env::var("DYN_KVBM_ENABLE_NUMA") {
        Ok(value) => value == "1" || value.to_lowercase() == "true",
        Err(_) => false,
    }
}
/// Identifier of a NUMA node, wrapping the raw numeric node id.
///
/// The value `u32::MAX` is reserved as the [`NumaNode::UNKNOWN`] sentinel.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct NumaNode(pub u32);

impl NumaNode {
    /// Sentinel used when the node could not be determined.
    pub const UNKNOWN: Self = Self(u32::MAX);

    /// Returns `true` when this value is the [`NumaNode::UNKNOWN`] sentinel.
    pub fn is_unknown(&self) -> bool {
        *self == Self::UNKNOWN
    }
}

impl std::fmt::Display for NumaNode {
    /// Writes `UNKNOWN` for the sentinel and `NumaNode(<id>)` for any other id.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.is_unknown() {
            return f.write_str("UNKNOWN");
        }
        write!(f, "NumaNode({})", self.0)
    }
}
/// Look up the NUMA node of the CPU the calling thread is running on.
///
/// Issues the raw Linux `getcpu` syscall and wraps the node id it reports.
/// Yields [`NumaNode::UNKNOWN`] when the syscall does not return `0`.
pub fn get_current_cpu_numa_node() -> NumaNode {
    let mut cpu_id: libc::c_uint = 0;
    let mut node_id: libc::c_uint = 0;

    // Kernel signature:
    //   int getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache);
    //
    // SAFETY: both out-pointers refer to live, writable locals of the matching
    // C type for the whole call, and a null pointer is passed for `tcache`.
    let status = unsafe {
        libc::syscall(
            libc::SYS_getcpu,
            &mut cpu_id,
            &mut node_id,
            std::ptr::null_mut::<libc::c_void>(),
        )
    };

    match status {
        0 => NumaNode(node_id),
        _ => NumaNode::UNKNOWN,
    }
}
/// Determine the NUMA node closest to a GPU.
///
/// Runs `nvidia-smi topo --get-numa-id-of-nearby-cpu -i <device_id>` and reads
/// the node id from the first line of its output: the text after the first
/// `:` is trimmed and parsed as a `u32`.
///
/// Outcomes:
/// - the command cannot be run or exits unsuccessfully: a warning is logged
///   and the guess `device_id % 2` is returned;
/// - the command succeeds but its output cannot be parsed: a warning is logged
///   and [`NumaNode::UNKNOWN`] is returned;
/// - otherwise the parsed node id is returned.
//
// NOTE(review): the two failure paths disagree (guess vs. UNKNOWN), and the
// `% 2` guess can name a node that does not exist on a single-node host —
// confirm callers tolerate that.
pub fn get_device_numa_node(device_id: u32) -> NumaNode {
    let gpu_index = device_id.to_string();
    let invocation = Command::new("nvidia-smi")
        .args([
            "topo",
            "--get-numa-id-of-nearby-cpu",
            "-i",
            gpu_index.as_str(),
        ])
        .output();

    let output = match invocation {
        Ok(out) if out.status.success() => out,
        _ => {
            tracing::warn!("nvidia-smi failed for GPU {}, using heuristic", device_id);
            return NumaNode(device_id % 2);
        }
    };

    // First stdout line -> field after the first ':' -> trimmed -> u32.
    let parsed = std::str::from_utf8(&output.stdout)
        .ok()
        .and_then(|text| text.lines().next())
        .and_then(|first_line| first_line.split(':').nth(1))
        .and_then(|field| field.trim().parse::<u32>().ok());

    match parsed {
        Some(node) => {
            tracing::trace!("GPU {} on NUMA node {}", device_id, node);
            NumaNode(node)
        }
        None => {
            tracing::warn!("Failed to get NUMA node for GPU {}", device_id);
            NumaNode::UNKNOWN
        }
    }
}
/// Pin the current thread to a specific NUMA node's CPUs
///
/// This sets the CPU affinity for the calling thread to only run on CPUs
/// belonging to the specified NUMA node. This is critical for ensuring
/// that memory allocations follow the first-touch policy on the correct node.
pub fn pin_thread_to_numa_node(node: NumaNode) -> Result<(), String> {
let topology =
topology::get_numa_topology().map_err(|e| format!("Can not get NUMA topology: {}", e))?;
let cpus = topology
.cpus_for_node(node.0)
.ok_or_else(|| format!("No CPUs found for NUMA node {}", node.0))?;
if cpus.is_empty() {
return Err(format!("No CPUs found for NUMA node {}", node.0));
}
unsafe {
let mut cpu_set: libc::cpu_set_t = mem::zeroed();
for cpu in cpus {
libc::CPU_SET(*cpu, &mut cpu_set);
}
let result = libc::sched_setaffinity(
0, // current thread
mem::size_of::<libc::cpu_set_t>(),
&cpu_set,
);
if result != 0 {
let err = std::io::Error::last_os_error();
return Err(format!("Failed to set CPU affinity: {}", err));
}
}
Ok(())
}
/// Unit tests for [`NumaNode`] and the node-detection helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// Upper bound used for sanity checks on detected node ids.
    ///
    /// Deliberately generous: a small fixed bound (such as 8) fails on hosts
    /// that expose many NUMA nodes. 1024 matches the largest node count
    /// mainstream kernel configurations allow (`CONFIG_NODES_SHIFT` up to 10).
    const MAX_PLAUSIBLE_NODES: u32 = 1024;

    #[test]
    fn test_numa_node_equality() {
        let node0a = NumaNode(0);
        let node0b = NumaNode(0);
        let node1 = NumaNode(1);
        assert_eq!(node0a, node0b);
        assert_ne!(node0a, node1);
    }

    #[test]
    fn test_numa_node_unknown() {
        let unknown = NumaNode::UNKNOWN;
        assert!(unknown.is_unknown());
        assert_eq!(unknown.0, u32::MAX);

        let valid = NumaNode(0);
        assert!(!valid.is_unknown());
    }

    #[test]
    fn test_numa_node_display() {
        assert_eq!(format!("{}", NumaNode(0)), "NumaNode(0)");
        assert_eq!(format!("{}", NumaNode(7)), "NumaNode(7)");
        assert_eq!(format!("{}", NumaNode::UNKNOWN), "UNKNOWN");
    }

    #[test]
    fn test_numa_node_serialization() {
        // Verify NumaNode round-trips through JSON (important for benchmarking)
        let node = NumaNode(1);
        let json = serde_json::to_string(&node).unwrap();
        let deserialized: NumaNode = serde_json::from_str(&json).unwrap();
        assert_eq!(node, deserialized);
    }

    #[test]
    fn test_get_current_cpu_numa_node() {
        // Should either return a valid node or UNKNOWN
        let node = get_current_cpu_numa_node();
        if !node.is_unknown() {
            assert!(
                node.0 < MAX_PLAUSIBLE_NODES,
                "NUMA node {} seems unreasonably high",
                node.0
            );
        }
    }

    #[test]
    fn test_get_device_numa_node_valid_gpu() {
        // Test GPU 0 detection. Depending on the host this yields the node
        // reported by nvidia-smi, the `gpu_id % 2` heuristic, or UNKNOWN.
        let node = get_device_numa_node(0);
        println!("GPU 0 detected on NUMA node: {}", node.0);
        assert!(
            node.is_unknown() || node.0 < MAX_PLAUSIBLE_NODES,
            "NUMA node {} seems unreasonably high",
            node.0
        );
    }

    #[test]
    fn test_numa_node_hash() {
        // Verify NumaNode can be used as a HashMap key
        use std::collections::HashMap;

        let mut map = HashMap::new();
        map.insert(NumaNode(0), "node0");
        map.insert(NumaNode(1), "node1");

        assert_eq!(map.get(&NumaNode(0)), Some(&"node0"));
        assert_eq!(map.get(&NumaNode(1)), Some(&"node1"));
        assert_eq!(map.get(&NumaNode(2)), None);
    }

    #[test]
    fn test_numa_node_copy_clone() {
        // Verify NumaNode is Copy and Clone
        let node1 = NumaNode(5);
        let node2 = node1; // Copy
        #[allow(clippy::clone_on_copy)] // calling the Clone impl is the point here
        let node3 = node1.clone(); // Clone

        assert_eq!(node1, node2);
        assert_eq!(node1, node3);
        assert_eq!(node2, node3);
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! NUMA topology detection
//!
//! This module provides utilities to read the actual CPU-to-NUMA mapping from the system,
//! replacing heuristic assumptions with real topology data.
use std::collections::HashMap;
use std::fs;
/// Global cached topology
///
/// Stores the outcome of topology detection for the lifetime of the process:
/// either the detected `NumaTopology` or the error message describing why
/// detection failed. A failure is cached just like a success.
static TOPOLOGY: std::sync::OnceLock<Result<NumaTopology, String>> = std::sync::OnceLock::new();
/// CPU-to-NUMA-node mapping of the machine.
///
/// Holds the mapping in both directions so lookups by node and by CPU are
/// each a single hash-map access.
pub struct NumaTopology {
    /// Maps NUMA node ID -> list of CPU IDs
    node_to_cpus: HashMap<u32, Vec<usize>>,
    /// Maps CPU ID -> NUMA node ID
    cpu_to_node: HashMap<usize, u32>,
}

impl NumaTopology {
    /// Read the NUMA topology from `/sys/devices/system/node`.
    ///
    /// Every `nodeN` entry that has a `cpulist` file contributes one node.
    /// Entries that are not of the form `node<number>`, entries without a
    /// `cpulist` file, and directory entries that cannot be read are skipped.
    ///
    /// # Errors
    ///
    /// Returns a message when the sysfs node directory is missing or
    /// unreadable, when a `cpulist` file cannot be read or parsed, or when no
    /// node was found at all.
    pub fn from_sysfs() -> Result<Self, String> {
        let node_dir = std::path::Path::new("/sys/devices/system/node");
        if !node_dir.exists() {
            return Err("Node directory not found".to_string());
        }
        let entries =
            fs::read_dir(node_dir).map_err(|e| format!("Failed to read node directory: {}", e))?;

        let mut node_to_cpus: HashMap<u32, Vec<usize>> = HashMap::new();
        let mut cpu_to_node: HashMap<usize, u32> = HashMap::new();

        for entry in entries.flatten() {
            let path = entry.path();

            // Only "node<number>" entries describe a NUMA node; anything else
            // in this directory is skipped so that one unrelated entry cannot
            // abort detection for the whole machine.
            let Some(node_id) = path
                .file_name()
                .and_then(|name| name.to_str())
                .and_then(|name| name.strip_prefix("node"))
                .and_then(|suffix| suffix.parse::<u32>().ok())
            else {
                continue;
            };

            let cpulist_path = path.join("cpulist");
            if !cpulist_path.exists() {
                continue;
            }
            let cpulist = fs::read_to_string(&cpulist_path)
                .map_err(|e| format!("Failed to read {}: {}", cpulist_path.display(), e))?;
            let cpus = parse_cpulist(cpulist.trim())?;

            // Populate both directions of the mapping.
            for cpu in &cpus {
                cpu_to_node.insert(*cpu, node_id);
            }
            node_to_cpus.insert(node_id, cpus);
        }

        if node_to_cpus.is_empty() {
            return Err("No NUMA nodes found".to_string());
        }

        Ok(Self {
            node_to_cpus,
            cpu_to_node,
        })
    }

    /// Get all CPUs for a NUMA node, or `None` if the node is not known.
    pub fn cpus_for_node(&self, node_id: u32) -> Option<&[usize]> {
        self.node_to_cpus.get(&node_id).map(|v| v.as_slice())
    }

    /// Get the NUMA node a CPU belongs to, or `None` if the CPU is not known.
    pub fn node_for_cpu(&self, cpu_id: usize) -> Option<u32> {
        self.cpu_to_node.get(&cpu_id).copied()
    }

    /// Get the number of NUMA nodes recorded in this topology.
    pub fn num_nodes(&self) -> usize {
        self.node_to_cpus.len()
    }

    /// Check whether exactly one NUMA node was recorded.
    pub fn is_single_node(&self) -> bool {
        self.num_nodes() == 1
    }
}
/// Parse the Linux cpulist format into a sorted, de-duplicated list of CPU ids.
///
/// Segments are separated by `,`; each segment is either a single id or an
/// inclusive `start-end` range. Whitespace around the whole input and around
/// each segment is ignored.
///
/// Examples:
///   "0-15"     -> [0,1,2,...,15]
///   "0,4,8"    -> [0,4,8]
///   "0-3,8-11" -> [0,1,2,3,8,9,10,11]
///   ""         -> []   (a node without CPUs, e.g. a memory-only node)
///
/// # Errors
///
/// Returns a message naming the offending text when a segment is empty, is
/// not a number, has more than one `-`, or is a range whose start exceeds its
/// end.
fn parse_cpulist(cpulist: &str) -> Result<Vec<usize>, String> {
    let cpulist = cpulist.trim();
    // An empty list is valid: it describes a node that has no CPUs. Treating
    // it as an error would make the whole topology scan fail on such hosts.
    if cpulist.is_empty() {
        return Ok(Vec::new());
    }

    let parse_id = |text: &str| -> Result<usize, String> {
        text.parse::<usize>()
            .map_err(|_| format!("Invalid CPU ID: {}", text))
    };

    let mut cpus = Vec::new();
    for part in cpulist.split(',').map(str::trim) {
        match part.split_once('-') {
            // Range: "0-15"
            Some((first, last)) => {
                if last.contains('-') {
                    return Err(format!("Invalid range: {}", part));
                }
                let start = parse_id(first)?;
                let end = parse_id(last)?;
                // A descending range would silently contribute nothing.
                if start > end {
                    return Err(format!("Invalid range: {}", part));
                }
                cpus.extend(start..=end);
            }
            // Single CPU
            None => cpus.push(parse_id(part)?),
        }
    }

    cpus.sort_unstable();
    cpus.dedup();
    Ok(cpus)
}
/// Get the global NUMA topology (detected on first call, cached afterwards).
///
/// The first call reads the topology from sysfs; every later call returns the
/// cached outcome, whether it was a success or a failure. A failure is logged
/// once, at detection time, so repeated calls do not flood the log with the
/// same warning.
///
/// # Errors
///
/// Returns `"NUMA topology unavailable"` if the topology could not be read
/// from sysfs. This indicates either:
/// - System doesn't support NUMA
/// - `/sys` is not mounted (e.g., restricted container)
/// - Kernel NUMA support is disabled
///
/// Callers should handle errors gracefully by disabling NUMA optimizations.
pub fn get_numa_topology() -> Result<&'static NumaTopology, &'static str> {
    TOPOLOGY
        .get_or_init(|| {
            let detected = NumaTopology::from_sysfs();
            if let Err(reason) = &detected {
                tracing::warn!("NUMA topology unavailable: {}", reason);
            }
            detected
        })
        .as_ref()
        .map_err(|_| "NUMA topology unavailable")
}
/// Unit tests for cpulist parsing and the topology lookup tables.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_cpulist_range() {
        assert_eq!(parse_cpulist("0-3").unwrap(), [0, 1, 2, 3]);
    }

    #[test]
    fn test_parse_cpulist_list() {
        assert_eq!(parse_cpulist("0,4,8").unwrap(), [0, 4, 8]);
    }

    #[test]
    fn test_parse_cpulist_mixed() {
        assert_eq!(parse_cpulist("0-2,8,16-17").unwrap(), [0, 1, 2, 8, 16, 17]);
    }

    #[test]
    fn test_parse_cpulist_ht() {
        // Hyperthreading layout: physical cores 0-15, HT siblings 32-47.
        let parsed = parse_cpulist("0-15,32-47").unwrap();
        assert_eq!(parsed.len(), 32);
        for (index, expected) in [(0, 0), (15, 15), (16, 32), (31, 47)] {
            assert_eq!(parsed[index], expected);
        }
    }

    #[test]
    fn test_parse_cpulist_real_numa_system() {
        // Pattern observed on a dual-socket system with hyperthreading.
        // Node 0: CPUs 0-15, 128-143
        let node0 = parse_cpulist("0-15,128-143").unwrap();
        assert_eq!(node0.len(), 32);
        for (index, expected) in [(0, 0), (15, 15), (16, 128), (31, 143)] {
            assert_eq!(node0[index], expected);
        }

        // Node 1: CPUs 16-31, 144-159
        let node1 = parse_cpulist("16-31,144-159").unwrap();
        assert_eq!(node1.len(), 32);
        for (index, expected) in [(0, 16), (15, 31), (16, 144), (31, 159)] {
            assert_eq!(node1[index], expected);
        }
    }

    #[test]
    fn test_parse_cpulist_out_of_order() {
        // Out-of-order input must come back sorted.
        assert_eq!(parse_cpulist("4,2,0,1,3").unwrap(), [0, 1, 2, 3, 4]);
    }

    #[test]
    fn test_parse_cpulist_duplicates() {
        // Overlapping ranges must not produce duplicate ids.
        assert_eq!(parse_cpulist("0-2,1-3").unwrap(), [0, 1, 2, 3]);
    }

    #[test]
    fn test_parse_cpulist_empty() {
        // Edge case: an empty cpulist is either rejected or yields no CPUs.
        match parse_cpulist("") {
            Ok(parsed) => assert!(parsed.is_empty()),
            Err(_) => {}
        }
    }

    #[test]
    fn test_parse_cpulist_single_cpu() {
        // A node with a single CPU is uncommon but valid.
        assert_eq!(parse_cpulist("5").unwrap(), [5]);
    }

    #[test]
    fn test_topology_bidirectional_lookup() {
        // Build the node->cpu table, then derive the cpu->node table from it
        // so the two directions are consistent by construction.
        let node_to_cpus: std::collections::HashMap<u32, Vec<usize>> =
            std::collections::HashMap::from([(0, vec![0, 1, 2, 3]), (1, vec![4, 5, 6, 7])]);
        let cpu_to_node: std::collections::HashMap<usize, u32> = node_to_cpus
            .iter()
            .flat_map(|(node, cpus)| cpus.iter().map(move |cpu| (*cpu, *node)))
            .collect();

        let topology = NumaTopology {
            node_to_cpus,
            cpu_to_node,
        };

        // Forward lookup (node -> cpus)
        assert_eq!(topology.cpus_for_node(0), Some(&[0, 1, 2, 3][..]));
        assert_eq!(topology.cpus_for_node(1), Some(&[4, 5, 6, 7][..]));

        // Reverse lookup (cpu -> node)
        for (cpu, node) in [(0, 0), (3, 0), (4, 1), (7, 1)] {
            assert_eq!(topology.node_for_cpu(cpu), Some(node));
        }

        // Unknown CPU
        assert_eq!(topology.node_for_cpu(999), None);
    }
}
This diff is collapsed.
...@@ -76,6 +76,8 @@ use std::{ ...@@ -76,6 +76,8 @@ use std::{
use cudarc::driver::{CudaContext, sys}; use cudarc::driver::{CudaContext, sys};
use crate::block_manager::numa_allocator;
/// Trait for [Storage] types that can be accessed by CUDA /// Trait for [Storage] types that can be accessed by CUDA
pub trait CudaAccessible: Storage {} pub trait CudaAccessible: Storage {}
...@@ -176,10 +178,27 @@ impl PinnedStorage { ...@@ -176,10 +178,27 @@ impl PinnedStorage {
unsafe { unsafe {
ctx.bind_to_thread().map_err(StorageError::Cuda)?; ctx.bind_to_thread().map_err(StorageError::Cuda)?;
let ptr = cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED) // Try NUMA-aware allocation if enabled, otherwise use direct allocation
.map_err(StorageError::Cuda)?; let ptr = if numa_allocator::is_numa_enabled() {
let device_id = ctx.cu_device() as u32;
match numa_allocator::worker_pool::NumaWorkerPool::global()
.allocate_pinned_for_gpu(size, device_id)
{
Ok(ptr) => ptr,
Err(e) => {
tracing::warn!("NUMA allocation failed: {}, using direct allocation", e);
cudarc::driver::result::malloc_host(
size,
sys::CU_MEMHOSTALLOC_WRITECOMBINED,
)
.map_err(StorageError::Cuda)? as *mut u8
}
}
} else {
cudarc::driver::result::malloc_host(size, sys::CU_MEMHOSTALLOC_WRITECOMBINED)
.map_err(StorageError::Cuda)? as *mut u8
};
let ptr = ptr as *mut u8;
assert!(!ptr.is_null(), "Failed to allocate pinned memory"); assert!(!ptr.is_null(), "Failed to allocate pinned memory");
assert!(ptr.is_aligned(), "Pinned memory is not aligned"); assert!(ptr.is_aligned(), "Pinned memory is not aligned");
assert!(size < isize::MAX as usize); assert!(size < isize::MAX as usize);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment