Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
008683d6
Unverified
Commit
008683d6
authored
Apr 08, 2026
by
Ryan Olson
Committed by
GitHub
Apr 08, 2026
Browse files
feat: adding kvbm-engine (#6773)
Signed-off-by:
Ryan Olson
<
rolson@nvidia.com
>
parent
cf79c4fc
Changes
161
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
413 additions
and
237 deletions
+413
-237
lib/kvbm-physical/src/transfer/context.rs
lib/kvbm-physical/src/transfer/context.rs
+1
-1
lib/kvbm-physical/src/transfer/notifications/mod.rs
lib/kvbm-physical/src/transfer/notifications/mod.rs
+1
-1
lib/kvbm-physical/src/transfer/notifications/nixl_events.rs
lib/kvbm-physical/src/transfer/notifications/nixl_events.rs
+2
-2
lib/kvbm-physical/src/transfer/notifications/notification.rs
lib/kvbm-physical/src/transfer/notifications/notification.rs
+1
-1
lib/kvbm-physical/src/transfer/tests/local_transfers.rs
lib/kvbm-physical/src/transfer/tests/local_transfers.rs
+0
-1
lib/kvbm-physical/src/transfer/tests/mod.rs
lib/kvbm-physical/src/transfer/tests/mod.rs
+0
-171
lib/llm/Cargo.toml
lib/llm/Cargo.toml
+4
-2
lib/llm/src/block_manager/controller.rs
lib/llm/src/block_manager/controller.rs
+6
-3
lib/llm/src/block_manager/distributed.rs
lib/llm/src/block_manager/distributed.rs
+1
-4
lib/llm/src/block_manager/numa_allocator.rs
lib/llm/src/block_manager/numa_allocator.rs
+2
-5
lib/llm/src/cuda.rs
lib/llm/src/cuda.rs
+2
-2
lib/memory/.claude/settings.local.json
lib/memory/.claude/settings.local.json
+13
-0
lib/memory/Cargo.toml
lib/memory/Cargo.toml
+1
-0
lib/memory/bin/validate_numa_placement.rs
lib/memory/bin/validate_numa_placement.rs
+1
-1
lib/memory/src/lib.rs
lib/memory/src/lib.rs
+1
-1
lib/memory/src/numa/mod.rs
lib/memory/src/numa/mod.rs
+212
-36
lib/memory/src/numa/nvml.rs
lib/memory/src/numa/nvml.rs
+158
-0
lib/memory/src/pinned.rs
lib/memory/src/pinned.rs
+1
-1
lib/runtime/examples/Cargo.lock
lib/runtime/examples/Cargo.lock
+4
-4
lib/runtime/src/logging.rs
lib/runtime/src/logging.rs
+2
-1
No files found.
lib/kvbm-physical/src/transfer/context.rs
View file @
008683d6
...
...
@@ -14,7 +14,7 @@ use uuid::Uuid;
use
dynamo_memory
::
CudaMemPool
;
use
dynamo_memory
::
nixl
::{
NixlAgent
,
NixlBackendConfig
,
XferRequest
};
use
velo
_events
::
EventManager
;
use
velo
::
EventManager
;
use
crate
::
manager
::
TransferManager
;
...
...
lib/kvbm-physical/src/transfer/notifications/mod.rs
View file @
008683d6
...
...
@@ -15,7 +15,7 @@ use tokio::sync::mpsc;
use
tokio
::
time
::
interval
;
use
tracing
::{
error
,
warn
};
use
uuid
::
Uuid
;
use
velo
_events
::{
EventHandle
,
EventManager
};
use
velo
::{
EventHandle
,
EventManager
};
pub
mod
cuda_event
;
pub
mod
nixl_events
;
...
...
lib/kvbm-physical/src/transfer/notifications/nixl_events.rs
View file @
008683d6
...
...
@@ -12,7 +12,7 @@ use tokio::sync::mpsc;
use
tokio
::
time
::
interval
;
use
tracing
::{
error
,
warn
};
use
uuid
::
Uuid
;
use
velo
_events
::{
EventHandle
,
EventManager
};
use
velo
::{
EventHandle
,
EventManager
};
/// Registration message for NIXL notification-based transfer completion.
pub
struct
RegisterNixlNotification
{
...
...
@@ -276,7 +276,7 @@ mod tests {
use
std
::
collections
::
VecDeque
;
use
std
::
sync
::
Mutex
;
use
tokio
::
task
::
yield_now
;
use
velo
_events
::
EventStatus
;
use
velo
::
EventStatus
;
// ── Mock notification source ────────────────────────────────────
...
...
lib/kvbm-physical/src/transfer/notifications/notification.rs
View file @
008683d6
...
...
@@ -10,7 +10,7 @@ use std::{
sync
::
Arc
,
task
::{
Context
,
Poll
},
};
use
velo
_events
::{
Event
,
EventAwaiter
,
EventManager
};
use
velo
::{
Event
,
EventAwaiter
,
EventManager
};
pub
enum
TransferAwaiter
{
Local
(
EventAwaiter
),
...
...
lib/kvbm-physical/src/transfer/tests/local_transfers.rs
View file @
008683d6
...
...
@@ -9,7 +9,6 @@
//! - Different transfer strategies (Memcpy, CUDA H2D/D2H)
use
super
::
*
;
use
crate
::
transfer
::
TransferCapabilities
;
use
crate
::
transfer
::
executor
::
TransferOptionsInternal
;
use
crate
::
transfer
::
executor
::
execute_transfer
;
use
crate
::
transfer
::{
can_use_whole_block_transfer
,
validate_layout_compatibility
};
...
...
lib/kvbm-physical/src/transfer/tests/mod.rs
View file @
008683d6
...
...
@@ -68,13 +68,8 @@ use crate::{
},
};
use
anyhow
::
Result
;
use
cudarc
::
driver
::
sys
::
CUdevice_attribute_enum
;
use
cudarc
::
driver
::{
CudaContext
,
CudaStream
,
LaunchConfig
,
PushKernelArg
};
use
cudarc
::
nvrtc
::{
CompileOptions
,
compile_ptx_with_opts
};
use
std
::
collections
::
HashMap
;
use
std
::
ops
::
Range
;
use
std
::
sync
::{
Arc
,
OnceLock
};
use
std
::
time
::{
Duration
,
Instant
};
/// Layout kind for parameterized testing.
#[derive(Debug,
Clone,
Copy,
PartialEq,
Eq)]
...
...
@@ -404,169 +399,3 @@ pub fn verify_guard_blocks_unchanged(
Ok
(())
}
/// CUDA sleep kernel source code.
const
SLEEP_KERNEL_SRC
:
&
str
=
r#"
extern "C" __global__ void sleep_kernel(unsigned long long min_cycles) {
const unsigned long long start = clock64();
while ((clock64() - start) < min_cycles) {
asm volatile("");
}
}
"#
;
/// A reusable CUDA sleep utility for tests.
///
/// This struct provides a simple interface to execute GPU sleep operations
/// with calibrated timing. It compiles the sleep kernel once per CUDA context
/// and caches the calibration for reuse.
///
/// The calibration is conservative (prefers longer sleep durations over shorter)
/// to ensure minimum sleep times are met.
pub
struct
CudaSleep
{
function
:
cudarc
::
driver
::
CudaFunction
,
cycles_per_ms
:
f64
,
}
impl
CudaSleep
{
/// Get or create a CudaSleep instance for the given CUDA context.
///
/// This function uses lazy initialization and caches instances per device ID.
/// The first call for each device will compile the kernel and run calibration.
///
/// # Arguments
/// * `cuda_ctx` - The CUDA context to use
///
/// # Returns
/// A shared reference to the CudaSleep instance for this context's device.
pub
fn
for_context
(
cuda_ctx
:
&
Arc
<
CudaContext
>
)
->
Result
<
Arc
<
Self
>>
{
static
INSTANCES
:
OnceLock
<
parking_lot
::
Mutex
<
HashMap
<
usize
,
Arc
<
CudaSleep
>>>>
=
OnceLock
::
new
();
let
instances
=
INSTANCES
.get_or_init
(||
parking_lot
::
Mutex
::
new
(
HashMap
::
new
()));
let
device_ordinal
=
cuda_ctx
.ordinal
();
// Fast path: check if instance already exists
{
let
instances_guard
=
instances
.lock
();
if
let
Some
(
instance
)
=
instances_guard
.get
(
&
device_ordinal
)
{
return
Ok
(
Arc
::
clone
(
instance
));
}
}
// Slow path: create new instance with calibration
let
instance
=
Arc
::
new
(
Self
::
new
(
cuda_ctx
)
?
);
// Store in cache
let
mut
instances_guard
=
instances
.lock
();
instances_guard
.entry
(
device_ordinal
)
.or_insert_with
(||
Arc
::
clone
(
&
instance
));
Ok
(
instance
)
}
/// Create a new CudaSleep instance with calibration.
///
/// This compiles the sleep kernel and runs a calibration loop to determine
/// the relationship between clock cycles and wall-clock time.
fn
new
(
cuda_ctx
:
&
Arc
<
CudaContext
>
)
->
Result
<
Self
>
{
// Get device compute capability
let
major
=
cuda_ctx
.attribute
(
CUdevice_attribute_enum
::
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
)
?
;
let
minor
=
cuda_ctx
.attribute
(
CUdevice_attribute_enum
::
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
)
?
;
// Compile PTX for this device
let
mut
compile_opts
=
CompileOptions
{
name
:
Some
(
"sleep_kernel.cu"
.into
()),
..
Default
::
default
()
};
compile_opts
.options
.push
(
format!
(
"--gpu-architecture=compute_{}{}"
,
major
,
minor
));
let
ptx
=
compile_ptx_with_opts
(
SLEEP_KERNEL_SRC
,
compile_opts
)
?
;
let
module
=
cuda_ctx
.load_module
(
ptx
)
?
;
let
function
=
module
.load_function
(
"sleep_kernel"
)
?
;
// Get device clock rate
let
clock_rate_khz
=
cuda_ctx
.attribute
(
CUdevice_attribute_enum
::
CU_DEVICE_ATTRIBUTE_CLOCK_RATE
)
?
as
u64
;
// Create a temporary stream for calibration
let
stream
=
cuda_ctx
.new_stream
()
?
;
// Warm up to absorb JIT overhead
let
warm_cycles
=
clock_rate_khz
.saturating_mul
(
10
)
.max
(
1
);
Self
::
launch_kernel
(
&
function
,
&
stream
,
warm_cycles
)
?
;
stream
.synchronize
()
?
;
// Run calibration loop
let
desired_delay
=
Duration
::
from_millis
(
600
);
let
mut
target_cycles
=
clock_rate_khz
.saturating_mul
(
50
)
.max
(
1
);
// ~50ms starting point
let
mut
actual_duration
=
Duration
::
ZERO
;
for
_
in
0
..
8
{
let
start
=
Instant
::
now
();
Self
::
launch_kernel
(
&
function
,
&
stream
,
target_cycles
)
?
;
stream
.synchronize
()
?
;
actual_duration
=
start
.elapsed
();
if
actual_duration
>=
desired_delay
{
break
;
}
target_cycles
=
target_cycles
.saturating_mul
(
2
);
}
// Calculate cycles per millisecond with conservative 20% margin
// (prefer longer sleeps over shorter)
let
cycles_per_ms
=
if
actual_duration
.as_millis
()
>
0
{
(
target_cycles
as
f64
/
actual_duration
.as_millis
()
as
f64
)
*
1.2
}
else
{
clock_rate_khz
as
f64
// Fallback to clock rate
};
Ok
(
Self
{
function
,
cycles_per_ms
,
})
}
/// Launch the sleep kernel with the specified number of cycles.
fn
launch_kernel
(
function
:
&
cudarc
::
driver
::
CudaFunction
,
stream
:
&
Arc
<
CudaStream
>
,
cycles
:
u64
,
)
->
Result
<
()
>
{
let
launch_cfg
=
LaunchConfig
{
grid_dim
:
(
1
,
1
,
1
),
block_dim
:
(
1
,
1
,
1
),
shared_mem_bytes
:
0
,
};
let
mut
launch
=
stream
.launch_builder
(
function
);
unsafe
{
launch
.arg
(
&
cycles
);
launch
.launch
(
launch_cfg
)
?
;
}
Ok
(())
}
/// Launch a sleep operation on the given stream.
///
/// This queues a GPU kernel that will sleep for approximately the specified
/// duration. The sleep is conservative and may take longer than requested.
///
/// # Arguments
/// * `duration` - The minimum duration to sleep
/// * `stream` - The CUDA stream to launch the kernel on
///
/// # Returns
/// Ok(()) if the kernel was successfully queued
pub
fn
launch
(
&
self
,
duration
:
Duration
,
stream
:
&
Arc
<
CudaStream
>
)
->
Result
<
()
>
{
let
target_cycles
=
(
duration
.as_millis
()
as
f64
*
self
.cycles_per_ms
)
as
u64
;
Self
::
launch_kernel
(
&
self
.function
,
stream
,
target_cycles
)
}
}
lib/llm/Cargo.toml
View file @
008683d6
...
...
@@ -48,6 +48,7 @@ required-features = ["kv-router-stress"]
[dependencies]
# repo
dynamo-config
=
{
workspace
=
true
}
dynamo-kv-router
=
{
workspace
=
true
,
features
=
[
"metrics"
,
"runtime-protocols"
]
}
dynamo-memory
=
{
workspace
=
true
}
dynamo-mocker
=
{
workspace
=
true
}
...
...
@@ -64,6 +65,7 @@ async-trait = { workspace = true }
async-nats
=
{
workspace
=
true
}
bytes
=
{
workspace
=
true
}
chrono
=
{
workspace
=
true
}
dashmap
=
{
workspace
=
true
}
derive_builder
=
{
workspace
=
true
}
either
=
{
workspace
=
true
}
futures
=
{
workspace
=
true
}
...
...
@@ -89,14 +91,14 @@ uuid = { workspace = true }
xxhash-rust
=
{
workspace
=
true
}
modelexpress-client
=
{
workspace
=
true
}
modelexpress-common
=
{
workspace
=
true
}
bitflags
=
{
version
=
"2.4"
,
features
=
["serde"]
}
blake3
=
{
version
=
"1.8"
,
features
=
[
"mmap"
,
"rayon"
]
}
bytemuck
=
"1.22"
# candle-core = { version = "0.9.1" }
derive-getters
=
"0.5"
offset-allocator
=
"0.2"
rayon
=
"1"
dashmap
=
{
version
=
"5.5.3"
}
bincode
=
{
version
=
"2.0.1"
,
features
=
[
"serde"
,
"derive"
]
}
# lora
...
...
lib/llm/src/block_manager/controller.rs
View file @
008683d6
...
...
@@ -98,7 +98,8 @@ pub enum ResetResponse {
mod
tests
{
use
crate
::
tokens
::
Tokens
;
use
super
::
super
::
tests
::
create_reference_block_manager_with_counts
;
use
super
::
super
::
ReferenceBlockManager
;
use
super
::
super
::
tests
::
create_reference_block_manager_config_with_counts
;
use
super
::
*
;
#[tokio::test]
...
...
@@ -110,9 +111,11 @@ mod tests {
.await
.unwrap
();
let
worker_id
=
drt
.connection_id
();
let
worker_id
=
drt
.connection_id
()
as
i64
;
let
block_manager
=
create_reference_block_manager_with_counts
(
8
,
16
,
0
)
.await
;
let
config
=
create_reference_block_manager_config_with_counts
(
8
,
16
,
0
);
let
block_manager
:
ReferenceBlockManager
=
ReferenceBlockManager
::
new
(
config
)
.await
.unwrap
();
let
component
=
drt
.namespace
(
"test-kvbm"
)
...
...
lib/llm/src/block_manager/distributed.rs
View file @
008683d6
...
...
@@ -60,10 +60,7 @@ mod tests {
use
anyhow
::
Result
;
use
rstest
::
*
;
use
std
::
sync
::{
Arc
,
atomic
::{
AtomicUsize
,
Ordering
},
};
use
std
::
sync
::
Arc
;
use
tokio_util
::
sync
::
CancellationToken
;
use
dynamo_runtime
::
logging
::
init
as
init_logging
;
...
...
lib/llm/src/block_manager/numa_allocator.rs
View file @
008683d6
...
...
@@ -21,13 +21,10 @@ pub use dynamo_memory::numa::*;
)]
pub
fn
is_numa_enabled
()
->
bool
{
// Global kill switch always wins
if
is_numa_disabled
()
{
if
dynamo_memory
::
numa
::
is_numa_disabled
()
{
return
false
;
}
matches!
(
std
::
env
::
var
(
"DYN_KVBM_ENABLE_NUMA"
)
.as_deref
(),
Ok
(
"1"
|
"true"
|
"yes"
)
)
dynamo_config
::
env_is_truthy
(
"DYN_KVBM_ENABLE_NUMA"
)
}
#[cfg(test)]
...
...
lib/llm/src/cuda.rs
View file @
008683d6
...
...
@@ -84,7 +84,7 @@ impl DynamoCudaContextGuard {
/// The caller must ensure the context is valid.
pub
unsafe
fn
new
(
context
:
CUcontext
)
->
Pin
<
Box
<
Self
>>
{
// Push the context onto the CUDA context stack
let
result
=
cuCtxPushCurrent_v2
(
context
);
let
result
=
unsafe
{
cuCtxPushCurrent_v2
(
context
)
}
;
if
result
!=
cudaError_enum
::
CUDA_SUCCESS
{
panic!
(
"Failed to push CUDA context: {:?}"
,
result
);
}
...
...
@@ -192,7 +192,7 @@ impl DynamoCudaContextProvider for CudaContext {
impl
DynamoCudaContextProvider
for
CudaStream
{
unsafe
fn
cu_context
(
&
self
)
->
cudarc
::
driver
::
sys
::
CUcontext
{
self
.context
()
.cu_context
()
unsafe
{
self
.context
()
.cu_context
()
}
}
}
...
...
lib/memory/.claude/settings.local.json
0 → 100644
View file @
008683d6
{
"permissions"
:
{
"allow"
:
[
"Bash(ls:*)"
,
"Bash(rustc --print sysroot:*)"
,
"Bash(cargo tree:*)"
,
"Bash(cargo metadata:*)"
,
"Bash(env:*)"
,
"Bash(ldd:*)"
,
"Bash(cargo test:*)"
]
}
}
lib/memory/Cargo.toml
View file @
008683d6
...
...
@@ -31,6 +31,7 @@ thiserror = { workspace = true }
tracing
=
{
workspace
=
true
}
libc
=
{
version
=
"0.2"
}
libloading
=
"0.8"
nix
=
{
version
=
"0.30"
,
features
=
["fs"]
}
offset-allocator
=
"0.2"
...
...
lib/memory/bin/validate_numa_placement.rs
View file @
008683d6
...
...
@@ -142,7 +142,7 @@ fn main() {
println!
(
"======================="
);
println!
(
"GPUs: {gpus:?}"
);
println!
(
"Alloc size: {size_mib} MiB ({alloc_size} bytes)"
);
println!
(
"NUMA
dis
abled: {}"
,
dynamo_memory
::
is_numa_
dis
abled
());
println!
(
"NUMA
en
abled:
{}"
,
dynamo_memory
::
is_numa_
en
abled
());
println!
();
// Phase 1: Show GPU-to-NUMA mapping
...
...
lib/memory/src/lib.rs
View file @
008683d6
...
...
@@ -43,7 +43,7 @@ pub use device::DeviceStorage;
pub
use
disk
::
DiskStorage
;
pub
use
external
::
ExternalDeviceMemory
;
#[cfg(target_os
=
"linux"
)]
pub
use
numa
::{
NumaNode
,
is_numa_disabled
};
pub
use
numa
::{
NumaNode
,
is_numa_disabled
,
is_numa_enabled
};
pub
use
offset
::
OffsetBuffer
;
pub
use
pinned
::
PinnedStorage
;
pub
use
pool
::{
CudaMemPool
,
CudaMemPoolBuilder
};
...
...
lib/memory/src/numa/mod.rs
View file @
008683d6
...
...
@@ -25,10 +25,11 @@
//! places pages on the correct node. If the GPU's NUMA node cannot be
//! determined, allocation falls back to the non-NUMA path transparently.
pub
(
crate
)
mod
nvml
;
pub
mod
topology
;
pub
mod
worker_pool
;
use
cudarc
::
driver
::
sys
::
CUdevice_attribute_enum
;
use
cudarc
::
driver
::
{
result
::
device
as
cuda_device
,
sys
as
cuda_sys
}
;
use
nix
::
libc
;
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
collections
::
HashMap
;
...
...
@@ -44,8 +45,13 @@ static NUMA_NODE_CACHE: OnceLock<Mutex<HashMap<String, Option<NumaNode>>>> = Onc
///
/// NUMA-aware allocation is enabled by default. Set `DYN_MEMORY_DISABLE_NUMA=1`
/// (or any truthy value) to disable it.
pub
fn
is_numa_enabled
()
->
bool
{
!
crate
::
env_is_truthy
(
"DYN_MEMORY_DISABLE_NUMA"
)
}
/// Convenience inverse of [`is_numa_enabled`].
pub
fn
is_numa_disabled
()
->
bool
{
crate
::
env_is_truthy
(
"DYN_MEMORY_DISABLE_NUMA"
)
!
is_numa_enabled
(
)
}
/// Represents a NUMA node identifier.
...
...
@@ -99,32 +105,6 @@ pub fn get_current_cpu_numa_node() -> NumaNode {
}
}
/// Format a PCI bus address from domain, bus, and device IDs.
///
/// Returns a string in the format `"DDDD:BB:DD.0"` suitable for sysfs lookups.
fn
format_pci_bus_address
(
domain
:
i32
,
bus
:
i32
,
device
:
i32
)
->
String
{
format!
(
"{:04x}:{:02x}:{:02x}.0"
,
domain
,
bus
,
device
)
}
/// Query the PCI bus address for a CUDA device from the CUDA driver API.
///
/// Uses `CudaContext::attribute()` to read PCI domain, bus, and device IDs.
/// This transparently handles `CUDA_VISIBLE_DEVICES` remapping since
/// `CudaContext::new(ordinal)` operates on the process-local device index.
fn
get_pci_bus_address_from_cuda
(
device_id
:
u32
)
->
Option
<
String
>
{
let
ctx
=
crate
::
device
::
cuda_context
(
device_id
)
.ok
()
?
;
let
domain
=
ctx
.attribute
(
CUdevice_attribute_enum
::
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID
)
.ok
()
?
;
let
bus
=
ctx
.attribute
(
CUdevice_attribute_enum
::
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID
)
.ok
()
?
;
let
device
=
ctx
.attribute
(
CUdevice_attribute_enum
::
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID
)
.ok
()
?
;
Some
(
format_pci_bus_address
(
domain
,
bus
,
device
))
}
/// Read the NUMA node for a PCI device from sysfs.
///
/// Reads `/sys/bus/pci/devices/<pci_address>/numa_node`. Returns `None` if the
...
...
@@ -274,6 +254,210 @@ pub fn pin_thread_to_numa_node(node: NumaNode) -> Result<(), String> {
Ok
(())
}
/// Get PCI bus address for a CUDA device via the CUDA driver API.
///
/// Returns a normalized PCI address string like "0000:3b:00.0".
/// The device_id here is a CUDA ordinal (affected by CUDA_VISIBLE_DEVICES).
fn
get_pci_bus_address_from_cuda
(
device_id
:
u32
)
->
Option
<
String
>
{
// SAFETY: We're calling CUDA driver API functions with valid device ordinals.
// cuDeviceGet and get_attribute are safe as long as CUDA is initialized
// (which CudaContext::new handles).
unsafe
{
let
mut
dev
=
std
::
mem
::
MaybeUninit
::
uninit
();
if
cuda_sys
::
cuDeviceGet
(
dev
.as_mut_ptr
(),
device_id
as
i32
)
.result
()
.is_err
()
{
return
None
;
}
let
dev
=
dev
.assume_init
();
let
domain
=
cuda_device
::
get_attribute
(
dev
,
cuda_sys
::
CUdevice_attribute
::
CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID
,
)
.ok
()
?
;
let
bus
=
cuda_device
::
get_attribute
(
dev
,
cuda_sys
::
CUdevice_attribute
::
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID
,
)
.ok
()
?
;
let
device
=
cuda_device
::
get_attribute
(
dev
,
cuda_sys
::
CUdevice_attribute
::
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID
,
)
.ok
()
?
;
Some
(
format!
(
"{:04x}:{:02x}:{:02x}.0"
,
domain
,
bus
,
device
))
}
}
/// GPU info with PCI address and NUMA node, used for CPU set subdivision.
#[derive(Debug,
Clone)]
struct
GpuTopoInfo
{
pci_address
:
String
,
numa_node
:
Option
<
u32
>
,
}
/// Enumerate all GPUs visible to CUDA with their PCI addresses and NUMA nodes.
fn
enumerate_cuda_gpus
()
->
Vec
<
GpuTopoInfo
>
{
let
count
=
match
cuda_device
::
get_count
()
{
Ok
(
c
)
=>
c
,
Err
(
_
)
=>
return
Vec
::
new
(),
};
(
0
..
count
as
u32
)
.filter_map
(|
i
|
{
let
pci
=
get_pci_bus_address_from_cuda
(
i
)
?
;
let
numa
=
read_numa_node_from_sysfs
(
&
pci
)
.map
(|
n
|
n
.0
);
Some
(
GpuTopoInfo
{
pci_address
:
pci
,
numa_node
:
numa
,
})
})
.collect
()
}
/// Enumerate all GPUs on the system, preferring NVML (sees all GPUs)
/// over CUDA driver (only sees CUDA_VISIBLE_DEVICES).
fn
enumerate_all_gpus
()
->
Vec
<
GpuTopoInfo
>
{
// Try NVML first — it sees all GPUs regardless of CUDA_VISIBLE_DEVICES
if
let
Some
(
nvml
)
=
nvml
::
try_nvml
()
{
let
nvml_gpus
=
nvml
.enumerate_gpus
();
if
!
nvml_gpus
.is_empty
()
{
tracing
::
debug!
(
"NVML enumerated {} GPUs (ignoring CUDA_VISIBLE_DEVICES)"
,
nvml_gpus
.len
()
);
return
nvml_gpus
.into_iter
()
.map
(|
g
|
{
let
numa
=
read_numa_node_from_sysfs
(
&
g
.pci_address
)
.map
(|
n
|
n
.0
);
GpuTopoInfo
{
pci_address
:
g
.pci_address
,
numa_node
:
numa
,
}
})
.collect
();
}
}
// Fallback: enumerate via CUDA driver (may miss hidden devices)
tracing
::
debug!
(
"Falling back to CUDA driver GPU enumeration"
);
enumerate_cuda_gpus
()
}
/// Cached CPU set results per CUDA device ordinal.
static
DEVICE_CPU_SETS
:
OnceLock
<
HashMap
<
u32
,
Option
<
Vec
<
usize
>>>>
=
OnceLock
::
new
();
/// Get a deterministic CPU subset for a CUDA device, subdivided among ALL GPUs
/// sharing the same NUMA node (including those hidden by CUDA_VISIBLE_DEVICES).
///
/// # Algorithm
/// 1. Get PCI address + NUMA node for target device (CUDA driver API)
/// 2. Enumerate ALL GPUs on the system:
/// - Try NVML first (sees all GPUs, ignores CUDA_VISIBLE_DEVICES)
/// - Fall back to CUDA driver API (only sees visible devices)
/// 3. For each GPU, get its NUMA node via sysfs (PCI address → /sys/.../numa_node)
/// 4. Group GPUs by NUMA node
/// 5. Sort by PCI address within each group (deterministic)
/// 6. Get full CPU set for the node via topology
/// 7. Divide into N equal slices (N = GPUs on same node)
/// 8. Return the slice for the target device's position
///
/// # Example
/// System: 8 GPUs, 2 NUMA nodes, 4 GPUs per node.
/// CUDA_VISIBLE_DEVICES=0,1 (only 2 visible).
/// NVML sees all 8 → correctly subdivides into 4 slices per node.
///
/// Returns None if NUMA node can't be determined.
pub
fn
get_device_cpu_set
(
device_id
:
u32
)
->
Option
<
Vec
<
usize
>>
{
DEVICE_CPU_SETS
.get_or_init
(
compute_all_device_cpu_sets
)
.get
(
&
device_id
)
.cloned
()
.flatten
()
}
fn
compute_all_device_cpu_sets
()
->
HashMap
<
u32
,
Option
<
Vec
<
usize
>>>
{
let
topology
=
match
topology
::
get_numa_topology
()
{
Ok
(
t
)
=>
t
,
Err
(
e
)
=>
{
tracing
::
warn!
(
"Cannot subdivide CPU sets: {e}"
);
return
HashMap
::
new
();
}
};
// Get the target device's PCI address and NUMA node
let
cuda_count
=
cuda_device
::
get_count
()
.unwrap_or
(
0
);
if
cuda_count
==
0
{
return
HashMap
::
new
();
}
// Build info for each visible CUDA device
let
mut
cuda_devices
:
Vec
<
(
u32
,
String
,
Option
<
u32
>
)
>
=
Vec
::
new
();
for
i
in
0
..
cuda_count
as
u32
{
if
let
Some
(
pci
)
=
get_pci_bus_address_from_cuda
(
i
)
{
let
numa
=
read_numa_node_from_sysfs
(
&
pci
)
.map
(|
n
|
n
.0
);
cuda_devices
.push
((
i
,
pci
,
numa
));
}
}
// Enumerate ALL GPUs on the system (NVML preferred)
let
all_gpus
=
enumerate_all_gpus
();
// Group all GPUs by NUMA node
let
mut
node_groups
:
HashMap
<
u32
,
Vec
<
String
>>
=
HashMap
::
new
();
for
gpu
in
&
all_gpus
{
if
let
Some
(
node
)
=
gpu
.numa_node
{
node_groups
.entry
(
node
)
.or_default
()
.push
(
gpu
.pci_address
.clone
());
}
}
// Sort each group by PCI address for deterministic ordering
for
group
in
node_groups
.values_mut
()
{
group
.sort
();
}
// For each CUDA device, find its position in its NUMA group and subdivide
let
mut
results
=
HashMap
::
new
();
for
(
device_id
,
pci_addr
,
numa_node
)
in
&
cuda_devices
{
let
cpu_set
=
numa_node
.and_then
(|
node
|
{
let
group
=
node_groups
.get
(
&
node
)
?
;
let
position
=
group
.iter
()
.position
(|
addr
|
addr
==
pci_addr
)
?
;
let
all_cpus
=
topology
.cpus_for_node
(
node
)
?
;
if
all_cpus
.is_empty
()
||
group
.is_empty
()
{
return
None
;
}
// Divide CPUs into N equal slices
let
n
=
group
.len
();
let
chunk_size
=
all_cpus
.len
()
/
n
;
if
chunk_size
==
0
{
// More GPUs than CPUs on this node — give all CPUs to everyone
return
Some
(
all_cpus
.to_vec
());
}
let
start
=
position
*
chunk_size
;
let
end
=
if
position
==
n
-
1
{
all_cpus
.len
()
// last slice gets remainder
}
else
{
start
+
chunk_size
};
Some
(
all_cpus
[
start
..
end
]
.to_vec
())
});
results
.insert
(
*
device_id
,
cpu_set
);
}
results
}
#[cfg(test)]
mod
tests
{
use
super
::
*
;
...
...
@@ -345,14 +529,6 @@ mod tests {
assert_eq!
(
node2
,
node3
);
}
#[test]
fn
test_format_pci_bus_address
()
{
assert_eq!
(
format_pci_bus_address
(
0
,
0
,
0
),
"0000:00:00.0"
);
assert_eq!
(
format_pci_bus_address
(
0
,
0x3b
,
0
),
"0000:3b:00.0"
);
assert_eq!
(
format_pci_bus_address
(
0
,
0xaf
,
0
),
"0000:af:00.0"
);
assert_eq!
(
format_pci_bus_address
(
0x10
,
0x1a
,
0x03
),
"0010:1a:03.0"
);
}
#[test]
fn
test_read_numa_node_from_sysfs_nonexistent
()
{
assert
!
(
read_numa_node_from_sysfs
(
"ffff:ff:ff.0"
)
.is_none
());
...
...
lib/memory/src/numa/nvml.rs
0 → 100644
View file @
008683d6
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Minimal NVML FFI via dlopen.
//!
//! Dynamically loads `libnvidia-ml.so.1` to enumerate ALL GPUs on the system,
//! regardless of `CUDA_VISIBLE_DEVICES`. This is critical for CPU set subdivision:
//! when multiple GPUs share a NUMA node, we need to know about ALL siblings to
//! divide CPU cores fairly.
//!
//! If NVML is unavailable (e.g., in containers without the management library),
//! callers fall back to CUDA driver enumeration (which only sees visible devices).
use
libloading
::{
Library
,
Symbol
};
use
std
::
sync
::
OnceLock
;
/// PCI info struct matching NVML's `nvmlPciInfo_t`.
#[repr(C)]
struct
NvmlPciInfo
{
bus_id_legacy
:
[
u8
;
16
],
// "DDDD:BB:DD.F\0" (legacy, 16 chars)
domain
:
u32
,
bus
:
u32
,
device
:
u32
,
pci_device_id
:
u32
,
pci_subsystem_id
:
u32
,
bus_id
:
[
u8
;
32
],
// "DDDD:BB:DD.F\0" (full, 32 chars)
}
/// GPU info from NVML enumeration.
#[derive(Debug,
Clone)]
pub
(
crate
)
struct
NvmlGpuInfo
{
/// PCI bus address, e.g. "0000:3b:00.0"
pub
pci_address
:
String
,
}
// NVML return codes
const
NVML_SUCCESS
:
u32
=
0
;
/// Minimal NVML handle. Sees ALL GPUs regardless of CUDA_VISIBLE_DEVICES.
pub
(
crate
)
struct
NvmlHandle
{
_
lib
:
Library
,
device_get_count
:
unsafe
extern
"C"
fn
(
*
mut
u32
)
->
u32
,
device_get_handle_by_index
:
unsafe
extern
"C"
fn
(
u32
,
*
mut
u64
)
->
u32
,
device_get_pci_info
:
unsafe
extern
"C"
fn
(
u64
,
*
mut
NvmlPciInfo
)
->
u32
,
shutdown
:
unsafe
extern
"C"
fn
()
->
u32
,
}
// SAFETY: NVML functions are thread-safe per NVML documentation
unsafe
impl
Send
for
NvmlHandle
{}
unsafe
impl
Sync
for
NvmlHandle
{}
impl
NvmlHandle
{
/// Try to load NVML. Returns None if libnvidia-ml.so.1 is not available.
pub
fn
try_load
()
->
Option
<
Self
>
{
// SAFETY: We are loading a well-known system library and resolving documented
// NVML API symbols. The library is kept alive for the lifetime of NvmlHandle.
unsafe
{
let
lib
=
Library
::
new
(
"libnvidia-ml.so.1"
)
.ok
()
?
;
// Initialize NVML
let
init
:
Symbol
<
unsafe
extern
"C"
fn
()
->
u32
>
=
lib
.get
(
b
"nvmlInit_v2
\0
"
)
.ok
()
?
;
if
init
()
!=
NVML_SUCCESS
{
tracing
::
warn!
(
"nvmlInit_v2 failed"
);
return
None
;
}
let
device_get_count
:
Symbol
<
unsafe
extern
"C"
fn
(
*
mut
u32
)
->
u32
>
=
lib
.get
(
b
"nvmlDeviceGetCount_v2
\0
"
)
.ok
()
?
;
let
device_get_handle_by_index
:
Symbol
<
unsafe
extern
"C"
fn
(
u32
,
*
mut
u64
)
->
u32
>
=
lib
.get
(
b
"nvmlDeviceGetHandleByIndex_v2
\0
"
)
.ok
()
?
;
let
device_get_pci_info
:
Symbol
<
unsafe
extern
"C"
fn
(
u64
,
*
mut
NvmlPciInfo
)
->
u32
>
=
lib
.get
(
b
"nvmlDeviceGetPciInfo_v3
\0
"
)
.ok
()
?
;
let
shutdown
:
Symbol
<
unsafe
extern
"C"
fn
()
->
u32
>
=
lib
.get
(
b
"nvmlShutdown
\0
"
)
.ok
()
?
;
Some
(
Self
{
device_get_count
:
*
device_get_count
,
device_get_handle_by_index
:
*
device_get_handle_by_index
,
device_get_pci_info
:
*
device_get_pci_info
,
shutdown
:
*
shutdown
,
_
lib
:
lib
,
})
}
}
/// Enumerate ALL GPUs on the system with PCI addresses.
pub
fn
enumerate_gpus
(
&
self
)
->
Vec
<
NvmlGpuInfo
>
{
let
mut
count
:
u32
=
0
;
// SAFETY: NVML API call with valid pointer
unsafe
{
if
(
self
.device_get_count
)(
&
mut
count
)
!=
NVML_SUCCESS
{
tracing
::
warn!
(
"nvmlDeviceGetCount_v2 failed"
);
return
Vec
::
new
();
}
}
let
mut
gpus
=
Vec
::
with_capacity
(
count
as
usize
);
for
i
in
0
..
count
{
let
mut
handle
:
u64
=
0
;
// SAFETY: NVML API call with valid index and pointer
unsafe
{
if
(
self
.device_get_handle_by_index
)(
i
,
&
mut
handle
)
!=
NVML_SUCCESS
{
tracing
::
warn!
(
"nvmlDeviceGetHandleByIndex_v2 failed for index {i}"
);
continue
;
}
}
let
mut
pci_info
=
std
::
mem
::
MaybeUninit
::
<
NvmlPciInfo
>
::
zeroed
();
// SAFETY: NVML API call with valid handle and pointer to zeroed struct
unsafe
{
if
(
self
.device_get_pci_info
)(
handle
,
pci_info
.as_mut_ptr
())
!=
NVML_SUCCESS
{
tracing
::
warn!
(
"nvmlDeviceGetPciInfo_v3 failed for index {i}"
);
continue
;
}
let
pci_info
=
pci_info
.assume_init
();
// Parse bus_id field: "DDDD:BB:DD.F\0" padded with zeros
let
bus_id
=
&
pci_info
.bus_id
;
let
len
=
bus_id
.iter
()
.position
(|
&
b
|
b
==
0
)
.unwrap_or
(
bus_id
.len
());
let
pci_address
=
std
::
str
::
from_utf8
(
&
bus_id
[
..
len
])
.unwrap_or
(
""
)
.to_lowercase
();
if
!
pci_address
.is_empty
()
{
gpus
.push
(
NvmlGpuInfo
{
pci_address
});
}
}
}
gpus
}
}
impl
Drop
for
NvmlHandle
{
fn
drop
(
&
mut
self
)
{
// SAFETY: Matching nvmlInit_v2 call from try_load
unsafe
{
(
self
.shutdown
)();
}
}
}
/// Cached NVML load attempt (None = tried and failed).
static
NVML
:
OnceLock
<
Option
<
NvmlHandle
>>
=
OnceLock
::
new
();
/// Get a reference to the global NVML handle, if available.
pub
(
crate
)
fn
try_nvml
()
->
Option
<&
'static
NvmlHandle
>
{
NVML
.get_or_init
(||
{
let
handle
=
NvmlHandle
::
try_load
();
if
handle
.is_some
()
{
tracing
::
debug!
(
"NVML loaded successfully"
);
}
else
{
tracing
::
debug!
(
"NVML not available, will fall back to CUDA driver enumeration"
);
}
handle
})
.as_ref
()
}
lib/memory/src/pinned.rs
View file @
008683d6
...
...
@@ -127,7 +127,7 @@ impl PinnedStorage {
// Try NUMA-aware allocation unless explicitly disabled
#[cfg(target_os
=
"linux"
)]
let
numa_ptr
=
if
let
Some
(
gpu_id
)
=
device_id
{
if
!
super
::
numa
::
is_numa_
dis
abled
()
{
if
super
::
numa
::
is_numa_
en
abled
()
{
match
super
::
numa
::
worker_pool
::
NumaWorkerPool
::
global
()
.allocate_pinned_for_gpu
(
len
,
gpu_id
)
{
...
...
lib/runtime/examples/Cargo.lock
View file @
008683d6
...
...
@@ -47,9 +47,9 @@ checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
[[package]]
name = "arc-swap"
version = "1.9.
0
"
version = "1.9.
1
"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "
a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6
"
checksum = "
6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207
"
dependencies = [
"rustversion",
]
...
...
@@ -979,9 +979,9 @@ dependencies = [
[[package]]
name = "fastrand"
version = "2.
3.0
"
version = "2.
4.1
"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "
37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c4
3a
6
c9
c0355411be
"
checksum = "
9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d569
3a
a
c9
b4dd6defd8d6
"
[[package]]
name = "fiat-crypto"
...
...
lib/runtime/src/logging.rs
View file @
008683d6
...
...
@@ -955,7 +955,7 @@ pub fn init() {
}
#[cfg(feature
=
"tokio-console"
)]
fn
setup_logging
()
{
fn
setup_logging
()
->
Result
<
(),
Box
<
dyn
std
::
error
::
Error
>>
{
let
tokio_console_layer
=
console_subscriber
::
ConsoleLayer
::
builder
()
.with_default_env
()
.server_addr
(([
0
,
0
,
0
,
0
],
console_subscriber
::
Server
::
DEFAULT_PORT
))
...
...
@@ -973,6 +973,7 @@ fn setup_logging() {
.with
(
l
)
.with
(
tokio_console_layer
.with_filter
(
tokio_console_target
))
.init
();
Ok
(())
}
#[cfg(not(feature
=
"tokio-console"
))]
...
...
Prev
1
2
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment