Unverified commit ed733802, authored by Qidong Su and committed by GitHub
Browse files

Fix NUMA binding on non-CDMM Grace-Blackwell systems (#39361)


Signed-off-by: Qidong Su <soodoshll@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 8a34c508
...@@ -661,7 +661,18 @@ class NvmlCudaPlatform(CudaPlatformBase): ...@@ -661,7 +661,18 @@ class NvmlCudaPlatform(CudaPlatformBase):
handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
try: try:
return pynvml.nvmlDeviceGetNumaNodeId(handle) numa_node = pynvml.nvmlDeviceGetNumaNodeId(handle)
if cls._numa_node_has_cpus(numa_node):
return numa_node
# On non-CDMM Grace-Blackwell systems (e.g. GB200), each GPU's HBM
# is a separate NUMA node with no CPUs. Fall through to
# CPU-affinity-based detection to find the nearest CPU node.
logger.debug(
"NUMA node %d for GPU %d has no CPUs (non-CDMM topology), "
"falling back to CPU-affinity-based detection",
numa_node,
device_id,
)
except Exception: except Exception:
pass pass
...@@ -681,6 +692,17 @@ class NvmlCudaPlatform(CudaPlatformBase): ...@@ -681,6 +692,17 @@ class NvmlCudaPlatform(CudaPlatformBase):
return None return None
@classmethod
def _numa_node_has_cpus(cls, node_id: int) -> bool:
    """Report whether the given NUMA node lists at least one CPU.

    The answer comes from the node's ``cpulist`` file under sysfs: a node
    counts as having CPUs when that file contains anything other than
    whitespace.

    Args:
        node_id: NUMA node number to look up.

    Returns:
        True if the node's ``cpulist`` is non-blank; False if it is blank
        or the file cannot be read.
    """
    from pathlib import Path

    sysfs_entry = Path(f"/sys/devices/system/node/node{node_id}/cpulist")
    try:
        listed_cpus = sysfs_entry.read_text()
    except (OSError, ValueError):
        # An unreadable or undecodable entry is treated the same as a node
        # with no CPUs, so the caller moves on to its other detection path.
        return False
    return bool(listed_cpus.strip())
@classmethod @classmethod
def _get_device_cpu_affinity(cls, handle) -> list[int]: def _get_device_cpu_affinity(cls, handle) -> list[int]:
"""Get the list of CPU IDs associated with a GPU via NVML.""" """Get the list of CPU IDs associated with a GPU via NVML."""
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment