Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
0853129a
Unverified
Commit
0853129a
authored
Mar 05, 2026
by
Ziqi Fan
Committed by
GitHub
Mar 05, 2026
Browse files
fix: make KVBM respect CUDA_VISIBLE_DEVICES for NUMA binding (#6931)
Signed-off-by:
Ziqi Fan
<
ziqif@nvidia.com
>
parent
f6b4da08
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
52 additions
and
10 deletions
+52
-10
lib/memory/src/numa/mod.rs
lib/memory/src/numa/mod.rs
+52
-10
No files found.
lib/memory/src/numa/mod.rs
View file @
0853129a
...
@@ -92,32 +92,69 @@ pub fn get_current_cpu_numa_node() -> NumaNode {
...
@@ -92,32 +92,69 @@ pub fn get_current_cpu_numa_node() -> NumaNode {
}
}
}
}
/// Translate a process-local CUDA device index into the identifier to hand to `nvidia-smi -i`.
///
/// With `CUDA_VISIBLE_DEVICES` set, the process works in a remapped device space (for example,
/// only GPU 2 visible, exposed as device 0), whereas `nvidia-smi -i` wants the *physical* index
/// or UUID. This helper looks up the `device_id`-th entry of `CUDA_VISIBLE_DEVICES` and returns
/// it verbatim.
///
/// Behaviour:
/// * Variable unset, unreadable, or blank after trimming: returns `device_id` as a decimal string.
/// * Otherwise the value is split on `,`; each entry is trimmed and empty entries are discarded.
///   The entry at position `device_id` (index or UUID, e.g. `"2"` or `"GPU-uuid"`) is returned.
/// * `device_id` beyond the parsed list: a warning is logged and `device_id` is returned as a
///   decimal string.
///
/// NOTE(review): numeric entries are passed through unchanged; this presumably relies on CUDA and
/// nvidia-smi enumerating GPUs in the same order (see `CUDA_DEVICE_ORDER`) — confirm.
fn cuda_device_id_to_nvidia_smi_id(device_id: u32) -> String {
    // A missing or non-unicode variable collapses to "", which is handled like a blank value.
    let raw = std::env::var("CUDA_VISIBLE_DEVICES").unwrap_or_default();
    if raw.trim().is_empty() {
        // No remapping: identity
        return device_id.to_string();
    }

    // Comma-separated entries; accepts "0,1,2", "2,3", "GPU-uuid", "2,GPU-uuid", etc.
    let entries: Vec<&str> = raw
        .split(',')
        .map(str::trim)
        .filter(|entry| !entry.is_empty())
        .collect();

    match entries.get(device_id as usize) {
        Some(entry) => entry.to_string(),
        None => {
            tracing::warn!(
                "device_id {} out of range for CUDA_VISIBLE_DEVICES ({} devices), using identity",
                device_id,
                entries.len()
            );
            device_id.to_string()
        }
    }
}
/// Get NUMA node for a GPU device.
/// Get NUMA node for a GPU device.
///
///
/// For GPU memory, the NUMA affinity depends on which PCIe bus the GPU is attached to.
/// For GPU memory, the NUMA affinity depends on which PCIe bus the GPU is attached to.
/// This is queried via nvidia-smi. Falls back to a heuristic (device_id % 2) if nvidia-smi
/// This is queried via nvidia-smi. Falls back to a heuristic (device_id % 2) if nvidia-smi
/// is unavailable.
/// is unavailable.
///
///
/// When `CUDA_VISIBLE_DEVICES` is set, the process-local `device_id` is correctly mapped
/// to the physical GPU identifier before querying nvidia-smi, so NUMA attribution is accurate.
///
/// # Arguments
/// # Arguments
/// * `device_id` - CUDA device index (0, 1, 2, ...)
/// * `device_id` - CUDA device index (0, 1, 2, ...)
as seen by the process
///
///
/// # Returns
/// # Returns
/// The NUMA node closest to the specified GPU, or a heuristic fallback.
/// The NUMA node closest to the specified GPU, or a heuristic fallback.
pub
fn
get_device_numa_node
(
device_id
:
u32
)
->
NumaNode
{
pub
fn
get_device_numa_node
(
device_id
:
u32
)
->
NumaNode
{
let
nvidia_smi_id
=
cuda_device_id_to_nvidia_smi_id
(
device_id
);
// Use nvidia-smi topo to get NUMA ID of nearest CPU
// Use nvidia-smi topo to get NUMA ID of nearest CPU
//
This directly returns the NUMA no
de
//
-i must be physical device index or UUID, not process-local in
de
x
let
output
=
match
Command
::
new
(
"nvidia-smi"
)
let
output
=
match
Command
::
new
(
"nvidia-smi"
)
.args
([
.args
([
"topo"
,
"--get-numa-id-of-nearby-cpu"
,
"-i"
,
&
nvidia_smi_id
])
"topo"
,
"--get-numa-id-of-nearby-cpu"
,
"-i"
,
&
device_id
.to_string
(),
])
.output
()
.output
()
{
{
Ok
(
out
)
if
out
.status
.success
()
=>
out
,
Ok
(
out
)
if
out
.status
.success
()
=>
out
,
_
=>
{
_
=>
{
tracing
::
warn!
(
"nvidia-smi failed for GPU {}, using heuristic"
,
device_id
);
tracing
::
warn!
(
"nvidia-smi failed for GPU {} (nvidia-smi -i {}), using heuristic"
,
device_id
,
nvidia_smi_id
);
return
NumaNode
(
device_id
%
2
);
return
NumaNode
(
device_id
%
2
);
}
}
};
};
...
@@ -127,7 +164,12 @@ pub fn get_device_numa_node(device_id: u32) -> NumaNode {
...
@@ -127,7 +164,12 @@ pub fn get_device_numa_node(device_id: u32) -> NumaNode {
&&
let
Some
(
numa_str
)
=
line
.split
(
':'
)
.nth
(
1
)
&&
let
Some
(
numa_str
)
=
line
.split
(
':'
)
.nth
(
1
)
&&
let
Ok
(
node
)
=
numa_str
.trim
()
.parse
::
<
u32
>
()
&&
let
Ok
(
node
)
=
numa_str
.trim
()
.parse
::
<
u32
>
()
{
{
tracing
::
trace!
(
"GPU {} on NUMA node {}"
,
device_id
,
node
);
tracing
::
trace!
(
"GPU {} (physical {}) on NUMA node {}"
,
device_id
,
nvidia_smi_id
,
node
);
return
NumaNode
(
node
);
return
NumaNode
(
node
);
}
}
tracing
::
warn!
(
"Failed to get NUMA node for GPU {}"
,
device_id
);
tracing
::
warn!
(
"Failed to get NUMA node for GPU {}"
,
device_id
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment