Unverified Commit 008683d6 authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: adding kvbm-engine (#6773)


Signed-off-by: default avatarRyan Olson <rolson@nvidia.com>
parent cf79c4fc
......@@ -14,7 +14,7 @@ use uuid::Uuid;
use dynamo_memory::CudaMemPool;
use dynamo_memory::nixl::{NixlAgent, NixlBackendConfig, XferRequest};
use velo_events::EventManager;
use velo::EventManager;
use crate::manager::TransferManager;
......
......@@ -15,7 +15,7 @@ use tokio::sync::mpsc;
use tokio::time::interval;
use tracing::{error, warn};
use uuid::Uuid;
use velo_events::{EventHandle, EventManager};
use velo::{EventHandle, EventManager};
pub mod cuda_event;
pub mod nixl_events;
......
......@@ -12,7 +12,7 @@ use tokio::sync::mpsc;
use tokio::time::interval;
use tracing::{error, warn};
use uuid::Uuid;
use velo_events::{EventHandle, EventManager};
use velo::{EventHandle, EventManager};
/// Registration message for NIXL notification-based transfer completion.
pub struct RegisterNixlNotification {
......@@ -276,7 +276,7 @@ mod tests {
use std::collections::VecDeque;
use std::sync::Mutex;
use tokio::task::yield_now;
use velo_events::EventStatus;
use velo::EventStatus;
// ── Mock notification source ────────────────────────────────────
......
......@@ -10,7 +10,7 @@ use std::{
sync::Arc,
task::{Context, Poll},
};
use velo_events::{Event, EventAwaiter, EventManager};
use velo::{Event, EventAwaiter, EventManager};
pub enum TransferAwaiter {
Local(EventAwaiter),
......
......@@ -9,7 +9,6 @@
//! - Different transfer strategies (Memcpy, CUDA H2D/D2H)
use super::*;
use crate::transfer::TransferCapabilities;
use crate::transfer::executor::TransferOptionsInternal;
use crate::transfer::executor::execute_transfer;
use crate::transfer::{can_use_whole_block_transfer, validate_layout_compatibility};
......
......@@ -68,13 +68,8 @@ use crate::{
},
};
use anyhow::Result;
use cudarc::driver::sys::CUdevice_attribute_enum;
use cudarc::driver::{CudaContext, CudaStream, LaunchConfig, PushKernelArg};
use cudarc::nvrtc::{CompileOptions, compile_ptx_with_opts};
use std::collections::HashMap;
use std::ops::Range;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant};
/// Layout kind for parameterized testing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
......@@ -404,169 +399,3 @@ pub fn verify_guard_blocks_unchanged(
Ok(())
}
/// CUDA sleep kernel source code.
const SLEEP_KERNEL_SRC: &str = r#"
extern "C" __global__ void sleep_kernel(unsigned long long min_cycles) {
const unsigned long long start = clock64();
while ((clock64() - start) < min_cycles) {
asm volatile("");
}
}
"#;
/// A reusable CUDA sleep utility for tests.
///
/// This struct provides a simple interface to execute GPU sleep operations
/// with calibrated timing. It compiles the sleep kernel once per CUDA context
/// and caches the calibration for reuse.
///
/// The calibration is conservative (prefers longer sleep durations over shorter)
/// to ensure minimum sleep times are met.
pub struct CudaSleep {
function: cudarc::driver::CudaFunction,
cycles_per_ms: f64,
}
impl CudaSleep {
/// Get or create a CudaSleep instance for the given CUDA context.
///
/// This function uses lazy initialization and caches instances per device ID.
/// The first call for each device will compile the kernel and run calibration.
///
/// # Arguments
/// * `cuda_ctx` - The CUDA context to use
///
/// # Returns
/// A shared reference to the CudaSleep instance for this context's device.
pub fn for_context(cuda_ctx: &Arc<CudaContext>) -> Result<Arc<Self>> {
static INSTANCES: OnceLock<parking_lot::Mutex<HashMap<usize, Arc<CudaSleep>>>> =
OnceLock::new();
let instances = INSTANCES.get_or_init(|| parking_lot::Mutex::new(HashMap::new()));
let device_ordinal = cuda_ctx.ordinal();
// Fast path: check if instance already exists
{
let instances_guard = instances.lock();
if let Some(instance) = instances_guard.get(&device_ordinal) {
return Ok(Arc::clone(instance));
}
}
// Slow path: create new instance with calibration
let instance = Arc::new(Self::new(cuda_ctx)?);
// Store in cache
let mut instances_guard = instances.lock();
instances_guard
.entry(device_ordinal)
.or_insert_with(|| Arc::clone(&instance));
Ok(instance)
}
/// Create a new CudaSleep instance with calibration.
///
/// This compiles the sleep kernel and runs a calibration loop to determine
/// the relationship between clock cycles and wall-clock time.
fn new(cuda_ctx: &Arc<CudaContext>) -> Result<Self> {
// Get device compute capability
let major = cuda_ctx
.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
let minor = cuda_ctx
.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
// Compile PTX for this device
let mut compile_opts = CompileOptions {
name: Some("sleep_kernel.cu".into()),
..Default::default()
};
compile_opts
.options
.push(format!("--gpu-architecture=compute_{}{}", major, minor));
let ptx = compile_ptx_with_opts(SLEEP_KERNEL_SRC, compile_opts)?;
let module = cuda_ctx.load_module(ptx)?;
let function = module.load_function("sleep_kernel")?;
// Get device clock rate
let clock_rate_khz =
cuda_ctx.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_CLOCK_RATE)? as u64;
// Create a temporary stream for calibration
let stream = cuda_ctx.new_stream()?;
// Warm up to absorb JIT overhead
let warm_cycles = clock_rate_khz.saturating_mul(10).max(1);
Self::launch_kernel(&function, &stream, warm_cycles)?;
stream.synchronize()?;
// Run calibration loop
let desired_delay = Duration::from_millis(600);
let mut target_cycles = clock_rate_khz.saturating_mul(50).max(1); // ~50ms starting point
let mut actual_duration = Duration::ZERO;
for _ in 0..8 {
let start = Instant::now();
Self::launch_kernel(&function, &stream, target_cycles)?;
stream.synchronize()?;
actual_duration = start.elapsed();
if actual_duration >= desired_delay {
break;
}
target_cycles = target_cycles.saturating_mul(2);
}
// Calculate cycles per millisecond with conservative 20% margin
// (prefer longer sleeps over shorter)
let cycles_per_ms = if actual_duration.as_millis() > 0 {
(target_cycles as f64 / actual_duration.as_millis() as f64) * 1.2
} else {
clock_rate_khz as f64 // Fallback to clock rate
};
Ok(Self {
function,
cycles_per_ms,
})
}
/// Launch the sleep kernel with the specified number of cycles.
fn launch_kernel(
function: &cudarc::driver::CudaFunction,
stream: &Arc<CudaStream>,
cycles: u64,
) -> Result<()> {
let launch_cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
let mut launch = stream.launch_builder(function);
unsafe {
launch.arg(&cycles);
launch.launch(launch_cfg)?;
}
Ok(())
}
/// Launch a sleep operation on the given stream.
///
/// This queues a GPU kernel that will sleep for approximately the specified
/// duration. The sleep is conservative and may take longer than requested.
///
/// # Arguments
/// * `duration` - The minimum duration to sleep
/// * `stream` - The CUDA stream to launch the kernel on
///
/// # Returns
/// Ok(()) if the kernel was successfully queued
pub fn launch(&self, duration: Duration, stream: &Arc<CudaStream>) -> Result<()> {
let target_cycles = (duration.as_millis() as f64 * self.cycles_per_ms) as u64;
Self::launch_kernel(&self.function, stream, target_cycles)
}
}
......@@ -48,6 +48,7 @@ required-features = ["kv-router-stress"]
[dependencies]
# repo
dynamo-config = { workspace = true }
dynamo-kv-router = { workspace = true, features = ["metrics", "runtime-protocols"] }
dynamo-memory = { workspace = true }
dynamo-mocker = { workspace = true }
......@@ -64,6 +65,7 @@ async-trait = { workspace = true }
async-nats = { workspace = true }
bytes = { workspace = true }
chrono = { workspace = true }
dashmap = { workspace = true }
derive_builder = { workspace = true }
either = { workspace = true }
futures = { workspace = true }
......@@ -89,14 +91,14 @@ uuid = { workspace = true }
xxhash-rust = { workspace = true }
modelexpress-client = { workspace = true }
modelexpress-common = { workspace = true }
bitflags = { version = "2.4", features = ["serde"] }
blake3 = { version = "1.8", features = ["mmap", "rayon"] }
bytemuck = "1.22"
# candle-core = { version = "0.9.1" }
derive-getters = "0.5"
offset-allocator = "0.2"
rayon = "1"
dashmap = { version = "5.5.3" }
bincode = { version = "2.0.1", features = ["serde", "derive"] }
# lora
......
......@@ -98,7 +98,8 @@ pub enum ResetResponse {
mod tests {
use crate::tokens::Tokens;
use super::super::tests::create_reference_block_manager_with_counts;
use super::super::ReferenceBlockManager;
use super::super::tests::create_reference_block_manager_config_with_counts;
use super::*;
#[tokio::test]
......@@ -110,9 +111,11 @@ mod tests {
.await
.unwrap();
let worker_id = drt.connection_id();
let worker_id = drt.connection_id() as i64;
let block_manager = create_reference_block_manager_with_counts(8, 16, 0).await;
let config = create_reference_block_manager_config_with_counts(8, 16, 0);
let block_manager: ReferenceBlockManager =
ReferenceBlockManager::new(config).await.unwrap();
let component = drt
.namespace("test-kvbm")
......
......@@ -60,10 +60,7 @@ mod tests {
use anyhow::Result;
use rstest::*;
use std::sync::{
Arc,
atomic::{AtomicUsize, Ordering},
};
use std::sync::Arc;
use tokio_util::sync::CancellationToken;
use dynamo_runtime::logging::init as init_logging;
......
......@@ -21,13 +21,10 @@ pub use dynamo_memory::numa::*;
)]
pub fn is_numa_enabled() -> bool {
// Global kill switch always wins
if is_numa_disabled() {
if dynamo_memory::numa::is_numa_disabled() {
return false;
}
matches!(
std::env::var("DYN_KVBM_ENABLE_NUMA").as_deref(),
Ok("1" | "true" | "yes")
)
dynamo_config::env_is_truthy("DYN_KVBM_ENABLE_NUMA")
}
#[cfg(test)]
......
......@@ -84,7 +84,7 @@ impl DynamoCudaContextGuard {
/// The caller must ensure the context is valid.
pub unsafe fn new(context: CUcontext) -> Pin<Box<Self>> {
// Push the context onto the CUDA context stack
let result = cuCtxPushCurrent_v2(context);
let result = unsafe { cuCtxPushCurrent_v2(context) };
if result != cudaError_enum::CUDA_SUCCESS {
panic!("Failed to push CUDA context: {:?}", result);
}
......@@ -192,7 +192,7 @@ impl DynamoCudaContextProvider for CudaContext {
impl DynamoCudaContextProvider for CudaStream {
unsafe fn cu_context(&self) -> cudarc::driver::sys::CUcontext {
self.context().cu_context()
unsafe { self.context().cu_context() }
}
}
......
{
"permissions": {
"allow": [
"Bash(ls:*)",
"Bash(rustc --print sysroot:*)",
"Bash(cargo tree:*)",
"Bash(cargo metadata:*)",
"Bash(env:*)",
"Bash(ldd:*)",
"Bash(cargo test:*)"
]
}
}
......@@ -31,6 +31,7 @@ thiserror = { workspace = true }
tracing = { workspace = true }
libc = { version = "0.2" }
libloading = "0.8"
nix = { version = "0.30", features = ["fs"] }
offset-allocator = "0.2"
......
......@@ -142,7 +142,7 @@ fn main() {
println!("=======================");
println!("GPUs: {gpus:?}");
println!("Alloc size: {size_mib} MiB ({alloc_size} bytes)");
println!("NUMA disabled: {}", dynamo_memory::is_numa_disabled());
println!("NUMA enabled: {}", dynamo_memory::is_numa_enabled());
println!();
// Phase 1: Show GPU-to-NUMA mapping
......
......@@ -43,7 +43,7 @@ pub use device::DeviceStorage;
pub use disk::DiskStorage;
pub use external::ExternalDeviceMemory;
#[cfg(target_os = "linux")]
pub use numa::{NumaNode, is_numa_disabled};
pub use numa::{NumaNode, is_numa_disabled, is_numa_enabled};
pub use offset::OffsetBuffer;
pub use pinned::PinnedStorage;
pub use pool::{CudaMemPool, CudaMemPoolBuilder};
......
......@@ -25,10 +25,11 @@
//! places pages on the correct node. If the GPU's NUMA node cannot be
//! determined, allocation falls back to the non-NUMA path transparently.
pub(crate) mod nvml;
pub mod topology;
pub mod worker_pool;
use cudarc::driver::sys::CUdevice_attribute_enum;
use cudarc::driver::{result::device as cuda_device, sys as cuda_sys};
use nix::libc;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
......@@ -44,8 +45,13 @@ static NUMA_NODE_CACHE: OnceLock<Mutex<HashMap<String, Option<NumaNode>>>> = Onc
///
/// NUMA-aware allocation is enabled by default. Set `DYN_MEMORY_DISABLE_NUMA=1`
/// (or any truthy value) to disable it.
pub fn is_numa_enabled() -> bool {
!crate::env_is_truthy("DYN_MEMORY_DISABLE_NUMA")
}
/// Convenience inverse of [`is_numa_enabled`].
pub fn is_numa_disabled() -> bool {
crate::env_is_truthy("DYN_MEMORY_DISABLE_NUMA")
!is_numa_enabled()
}
/// Represents a NUMA node identifier.
......@@ -99,32 +105,6 @@ pub fn get_current_cpu_numa_node() -> NumaNode {
}
}
/// Format a PCI bus address from domain, bus, and device IDs.
///
/// Returns a string in the format `"DDDD:BB:DD.0"` suitable for sysfs lookups.
fn format_pci_bus_address(domain: i32, bus: i32, device: i32) -> String {
format!("{:04x}:{:02x}:{:02x}.0", domain, bus, device)
}
/// Query the PCI bus address for a CUDA device from the CUDA driver API.
///
/// Uses `CudaContext::attribute()` to read PCI domain, bus, and device IDs.
/// This transparently handles `CUDA_VISIBLE_DEVICES` remapping since
/// `CudaContext::new(ordinal)` operates on the process-local device index.
fn get_pci_bus_address_from_cuda(device_id: u32) -> Option<String> {
let ctx = crate::device::cuda_context(device_id).ok()?;
let domain = ctx
.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID)
.ok()?;
let bus = ctx
.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID)
.ok()?;
let device = ctx
.attribute(CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID)
.ok()?;
Some(format_pci_bus_address(domain, bus, device))
}
/// Read the NUMA node for a PCI device from sysfs.
///
/// Reads `/sys/bus/pci/devices/<pci_address>/numa_node`. Returns `None` if the
......@@ -274,6 +254,210 @@ pub fn pin_thread_to_numa_node(node: NumaNode) -> Result<(), String> {
Ok(())
}
/// Get PCI bus address for a CUDA device via the CUDA driver API.
///
/// Returns a normalized PCI address string like "0000:3b:00.0".
/// The device_id here is a CUDA ordinal (affected by CUDA_VISIBLE_DEVICES).
fn get_pci_bus_address_from_cuda(device_id: u32) -> Option<String> {
// SAFETY: We're calling CUDA driver API functions with valid device ordinals.
// cuDeviceGet and get_attribute are safe as long as CUDA is initialized
// (which CudaContext::new handles).
unsafe {
let mut dev = std::mem::MaybeUninit::uninit();
if cuda_sys::cuDeviceGet(dev.as_mut_ptr(), device_id as i32)
.result()
.is_err()
{
return None;
}
let dev = dev.assume_init();
let domain = cuda_device::get_attribute(
dev,
cuda_sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
)
.ok()?;
let bus = cuda_device::get_attribute(
dev,
cuda_sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
)
.ok()?;
let device = cuda_device::get_attribute(
dev,
cuda_sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
)
.ok()?;
Some(format!("{:04x}:{:02x}:{:02x}.0", domain, bus, device))
}
}
/// GPU info with PCI address and NUMA node, used for CPU set subdivision.
#[derive(Debug, Clone)]
struct GpuTopoInfo {
pci_address: String,
numa_node: Option<u32>,
}
/// Enumerate all GPUs visible to CUDA with their PCI addresses and NUMA nodes.
fn enumerate_cuda_gpus() -> Vec<GpuTopoInfo> {
let count = match cuda_device::get_count() {
Ok(c) => c,
Err(_) => return Vec::new(),
};
(0..count as u32)
.filter_map(|i| {
let pci = get_pci_bus_address_from_cuda(i)?;
let numa = read_numa_node_from_sysfs(&pci).map(|n| n.0);
Some(GpuTopoInfo {
pci_address: pci,
numa_node: numa,
})
})
.collect()
}
/// Enumerate all GPUs on the system, preferring NVML (sees all GPUs)
/// over CUDA driver (only sees CUDA_VISIBLE_DEVICES).
fn enumerate_all_gpus() -> Vec<GpuTopoInfo> {
// Try NVML first — it sees all GPUs regardless of CUDA_VISIBLE_DEVICES
if let Some(nvml) = nvml::try_nvml() {
let nvml_gpus = nvml.enumerate_gpus();
if !nvml_gpus.is_empty() {
tracing::debug!(
"NVML enumerated {} GPUs (ignoring CUDA_VISIBLE_DEVICES)",
nvml_gpus.len()
);
return nvml_gpus
.into_iter()
.map(|g| {
let numa = read_numa_node_from_sysfs(&g.pci_address).map(|n| n.0);
GpuTopoInfo {
pci_address: g.pci_address,
numa_node: numa,
}
})
.collect();
}
}
// Fallback: enumerate via CUDA driver (may miss hidden devices)
tracing::debug!("Falling back to CUDA driver GPU enumeration");
enumerate_cuda_gpus()
}
/// Cached CPU set results per CUDA device ordinal.
static DEVICE_CPU_SETS: OnceLock<HashMap<u32, Option<Vec<usize>>>> = OnceLock::new();
/// Get a deterministic CPU subset for a CUDA device, subdivided among ALL GPUs
/// sharing the same NUMA node (including those hidden by CUDA_VISIBLE_DEVICES).
///
/// # Algorithm
/// 1. Get PCI address + NUMA node for target device (CUDA driver API)
/// 2. Enumerate ALL GPUs on the system:
/// - Try NVML first (sees all GPUs, ignores CUDA_VISIBLE_DEVICES)
/// - Fall back to CUDA driver API (only sees visible devices)
/// 3. For each GPU, get its NUMA node via sysfs (PCI address → /sys/.../numa_node)
/// 4. Group GPUs by NUMA node
/// 5. Sort by PCI address within each group (deterministic)
/// 6. Get full CPU set for the node via topology
/// 7. Divide into N equal slices (N = GPUs on same node)
/// 8. Return the slice for the target device's position
///
/// # Example
/// System: 8 GPUs, 2 NUMA nodes, 4 GPUs per node.
/// CUDA_VISIBLE_DEVICES=0,1 (only 2 visible).
/// NVML sees all 8 → correctly subdivides into 4 slices per node.
///
/// Returns None if NUMA node can't be determined.
pub fn get_device_cpu_set(device_id: u32) -> Option<Vec<usize>> {
DEVICE_CPU_SETS
.get_or_init(compute_all_device_cpu_sets)
.get(&device_id)
.cloned()
.flatten()
}
fn compute_all_device_cpu_sets() -> HashMap<u32, Option<Vec<usize>>> {
let topology = match topology::get_numa_topology() {
Ok(t) => t,
Err(e) => {
tracing::warn!("Cannot subdivide CPU sets: {e}");
return HashMap::new();
}
};
// Get the target device's PCI address and NUMA node
let cuda_count = cuda_device::get_count().unwrap_or(0);
if cuda_count == 0 {
return HashMap::new();
}
// Build info for each visible CUDA device
let mut cuda_devices: Vec<(u32, String, Option<u32>)> = Vec::new();
for i in 0..cuda_count as u32 {
if let Some(pci) = get_pci_bus_address_from_cuda(i) {
let numa = read_numa_node_from_sysfs(&pci).map(|n| n.0);
cuda_devices.push((i, pci, numa));
}
}
// Enumerate ALL GPUs on the system (NVML preferred)
let all_gpus = enumerate_all_gpus();
// Group all GPUs by NUMA node
let mut node_groups: HashMap<u32, Vec<String>> = HashMap::new();
for gpu in &all_gpus {
if let Some(node) = gpu.numa_node {
node_groups
.entry(node)
.or_default()
.push(gpu.pci_address.clone());
}
}
// Sort each group by PCI address for deterministic ordering
for group in node_groups.values_mut() {
group.sort();
}
// For each CUDA device, find its position in its NUMA group and subdivide
let mut results = HashMap::new();
for (device_id, pci_addr, numa_node) in &cuda_devices {
let cpu_set = numa_node.and_then(|node| {
let group = node_groups.get(&node)?;
let position = group.iter().position(|addr| addr == pci_addr)?;
let all_cpus = topology.cpus_for_node(node)?;
if all_cpus.is_empty() || group.is_empty() {
return None;
}
// Divide CPUs into N equal slices
let n = group.len();
let chunk_size = all_cpus.len() / n;
if chunk_size == 0 {
// More GPUs than CPUs on this node — give all CPUs to everyone
return Some(all_cpus.to_vec());
}
let start = position * chunk_size;
let end = if position == n - 1 {
all_cpus.len() // last slice gets remainder
} else {
start + chunk_size
};
Some(all_cpus[start..end].to_vec())
});
results.insert(*device_id, cpu_set);
}
results
}
#[cfg(test)]
mod tests {
use super::*;
......@@ -345,14 +529,6 @@ mod tests {
assert_eq!(node2, node3);
}
#[test]
fn test_format_pci_bus_address() {
assert_eq!(format_pci_bus_address(0, 0, 0), "0000:00:00.0");
assert_eq!(format_pci_bus_address(0, 0x3b, 0), "0000:3b:00.0");
assert_eq!(format_pci_bus_address(0, 0xaf, 0), "0000:af:00.0");
assert_eq!(format_pci_bus_address(0x10, 0x1a, 0x03), "0010:1a:03.0");
}
#[test]
fn test_read_numa_node_from_sysfs_nonexistent() {
assert!(read_numa_node_from_sysfs("ffff:ff:ff.0").is_none());
......
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Minimal NVML FFI via dlopen.
//!
//! Dynamically loads `libnvidia-ml.so.1` to enumerate ALL GPUs on the system,
//! regardless of `CUDA_VISIBLE_DEVICES`. This is critical for CPU set subdivision:
//! when multiple GPUs share a NUMA node, we need to know about ALL siblings to
//! divide CPU cores fairly.
//!
//! If NVML is unavailable (e.g., in containers without the management library),
//! callers fall back to CUDA driver enumeration (which only sees visible devices).
use libloading::{Library, Symbol};
use std::sync::OnceLock;
/// PCI info struct matching NVML's `nvmlPciInfo_t`.
#[repr(C)]
struct NvmlPciInfo {
bus_id_legacy: [u8; 16], // "DDDD:BB:DD.F\0" (legacy, 16 chars)
domain: u32,
bus: u32,
device: u32,
pci_device_id: u32,
pci_subsystem_id: u32,
bus_id: [u8; 32], // "DDDD:BB:DD.F\0" (full, 32 chars)
}
/// GPU info from NVML enumeration.
#[derive(Debug, Clone)]
pub(crate) struct NvmlGpuInfo {
/// PCI bus address, e.g. "0000:3b:00.0"
pub pci_address: String,
}
// NVML return codes
const NVML_SUCCESS: u32 = 0;
/// Minimal NVML handle. Sees ALL GPUs regardless of CUDA_VISIBLE_DEVICES.
pub(crate) struct NvmlHandle {
_lib: Library,
device_get_count: unsafe extern "C" fn(*mut u32) -> u32,
device_get_handle_by_index: unsafe extern "C" fn(u32, *mut u64) -> u32,
device_get_pci_info: unsafe extern "C" fn(u64, *mut NvmlPciInfo) -> u32,
shutdown: unsafe extern "C" fn() -> u32,
}
// SAFETY: NVML functions are thread-safe per NVML documentation
unsafe impl Send for NvmlHandle {}
unsafe impl Sync for NvmlHandle {}
impl NvmlHandle {
/// Try to load NVML. Returns None if libnvidia-ml.so.1 is not available.
pub fn try_load() -> Option<Self> {
// SAFETY: We are loading a well-known system library and resolving documented
// NVML API symbols. The library is kept alive for the lifetime of NvmlHandle.
unsafe {
let lib = Library::new("libnvidia-ml.so.1").ok()?;
// Initialize NVML
let init: Symbol<unsafe extern "C" fn() -> u32> = lib.get(b"nvmlInit_v2\0").ok()?;
if init() != NVML_SUCCESS {
tracing::warn!("nvmlInit_v2 failed");
return None;
}
let device_get_count: Symbol<unsafe extern "C" fn(*mut u32) -> u32> =
lib.get(b"nvmlDeviceGetCount_v2\0").ok()?;
let device_get_handle_by_index: Symbol<unsafe extern "C" fn(u32, *mut u64) -> u32> =
lib.get(b"nvmlDeviceGetHandleByIndex_v2\0").ok()?;
let device_get_pci_info: Symbol<unsafe extern "C" fn(u64, *mut NvmlPciInfo) -> u32> =
lib.get(b"nvmlDeviceGetPciInfo_v3\0").ok()?;
let shutdown: Symbol<unsafe extern "C" fn() -> u32> =
lib.get(b"nvmlShutdown\0").ok()?;
Some(Self {
device_get_count: *device_get_count,
device_get_handle_by_index: *device_get_handle_by_index,
device_get_pci_info: *device_get_pci_info,
shutdown: *shutdown,
_lib: lib,
})
}
}
/// Enumerate ALL GPUs on the system with PCI addresses.
pub fn enumerate_gpus(&self) -> Vec<NvmlGpuInfo> {
let mut count: u32 = 0;
// SAFETY: NVML API call with valid pointer
unsafe {
if (self.device_get_count)(&mut count) != NVML_SUCCESS {
tracing::warn!("nvmlDeviceGetCount_v2 failed");
return Vec::new();
}
}
let mut gpus = Vec::with_capacity(count as usize);
for i in 0..count {
let mut handle: u64 = 0;
// SAFETY: NVML API call with valid index and pointer
unsafe {
if (self.device_get_handle_by_index)(i, &mut handle) != NVML_SUCCESS {
tracing::warn!("nvmlDeviceGetHandleByIndex_v2 failed for index {i}");
continue;
}
}
let mut pci_info = std::mem::MaybeUninit::<NvmlPciInfo>::zeroed();
// SAFETY: NVML API call with valid handle and pointer to zeroed struct
unsafe {
if (self.device_get_pci_info)(handle, pci_info.as_mut_ptr()) != NVML_SUCCESS {
tracing::warn!("nvmlDeviceGetPciInfo_v3 failed for index {i}");
continue;
}
let pci_info = pci_info.assume_init();
// Parse bus_id field: "DDDD:BB:DD.F\0" padded with zeros
let bus_id = &pci_info.bus_id;
let len = bus_id.iter().position(|&b| b == 0).unwrap_or(bus_id.len());
let pci_address = std::str::from_utf8(&bus_id[..len])
.unwrap_or("")
.to_lowercase();
if !pci_address.is_empty() {
gpus.push(NvmlGpuInfo { pci_address });
}
}
}
gpus
}
}
impl Drop for NvmlHandle {
fn drop(&mut self) {
// SAFETY: Matching nvmlInit_v2 call from try_load
unsafe {
(self.shutdown)();
}
}
}
/// Cached NVML load attempt (None = tried and failed).
static NVML: OnceLock<Option<NvmlHandle>> = OnceLock::new();
/// Get a reference to the global NVML handle, if available.
pub(crate) fn try_nvml() -> Option<&'static NvmlHandle> {
NVML.get_or_init(|| {
let handle = NvmlHandle::try_load();
if handle.is_some() {
tracing::debug!("NVML loaded successfully");
} else {
tracing::debug!("NVML not available, will fall back to CUDA driver enumeration");
}
handle
})
.as_ref()
}
......@@ -127,7 +127,7 @@ impl PinnedStorage {
// Try NUMA-aware allocation unless explicitly disabled
#[cfg(target_os = "linux")]
let numa_ptr = if let Some(gpu_id) = device_id {
if !super::numa::is_numa_disabled() {
if super::numa::is_numa_enabled() {
match super::numa::worker_pool::NumaWorkerPool::global()
.allocate_pinned_for_gpu(len, gpu_id)
{
......
......@@ -47,9 +47,9 @@ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "arc-swap"
version = "1.9.0"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6"
checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207"
dependencies = [
"rustversion",
]
......@@ -979,9 +979,9 @@ dependencies = [
[[package]]
name = "fastrand"
version = "2.3.0"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
[[package]]
name = "fiat-crypto"
......
......@@ -955,7 +955,7 @@ pub fn init() {
}
#[cfg(feature = "tokio-console")]
fn setup_logging() {
fn setup_logging() -> Result<(), Box<dyn std::error::Error>> {
let tokio_console_layer = console_subscriber::ConsoleLayer::builder()
.with_default_env()
.server_addr(([0, 0, 0, 0], console_subscriber::Server::DEFAULT_PORT))
......@@ -973,6 +973,7 @@ fn setup_logging() {
.with(l)
.with(tokio_console_layer.with_filter(tokio_console_target))
.init();
Ok(())
}
#[cfg(not(feature = "tokio-console"))]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment