Unverified commit 0098db9e, authored by pschlan-amd and committed by GitHub
Browse files

[ROCm] Implement GPU-to-NUMA-node detection (#40015)


Signed-off-by: Patrick Schlangen <pschlan@amd.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
parent 53ecc807
......@@ -155,9 +155,9 @@ switch to `--physcpubind=<cpu-list> --membind=<node>`.
These `--numa-bind*` options only apply to GPU execution processes. They do not
configure the CPU backend's separate thread-affinity controls. Automatic
GPU-to-NUMA detection is currently implemented for CUDA/NVML-based platforms;
other GPU backends must provide explicit binding lists if they use these
options.
GPU-to-NUMA detection is currently implemented for CUDA/NVML-based and
ROCm-based platforms; other GPU backends must provide explicit binding lists if
they use these options.
`--numa-bind-nodes` takes one non-negative NUMA node index per visible GPU, in
the same order as the GPU indices.
......
......@@ -33,6 +33,7 @@ try:
amdsmi_init,
amdsmi_shut_down,
amdsmi_topo_get_link_type,
amdsmi_topo_get_numa_node_number,
)
except ImportError as e:
logger.warning("Failed to import from amdsmi with %r", e)
......@@ -955,3 +956,30 @@ class RocmPlatform(Platform):
rms_norm = default
return IrOpPriorityConfig.with_default(default, rms_norm=rms_norm)
@classmethod
@with_amdsmi_context
def get_all_device_numa_nodes(cls) -> list[int] | None:
    """Look up the NUMA node of every visible GPU via amdsmi.

    Returns:
        One NUMA node number per visible device, in logical device-index
        order, or ``None`` if detection fails for any device or any other
        error occurs. Every failure is reported with a warning log entry
        instead of being raised to the caller.
    """
    try:
        processor_handles = amdsmi_get_processor_handles()
        nodes_by_device = []
        for logical_id in range(cls.device_count()):
            # amdsmi handles are indexed by physical device id, so the
            # logical (visible) index has to be translated first.
            physical_id = cls.device_id_to_physical_device_id(logical_id)
            try:
                node = amdsmi_topo_get_numa_node_number(
                    processor_handles[physical_id]
                )
            except AmdSmiException as smi_error:
                # A single undetectable device invalidates the whole
                # mapping; give up rather than return a partial list.
                logger.warning(
                    "Could not detect NUMA node for GPU %d, "
                    "disabling automatic NUMA binding: %s",
                    logical_id,
                    smi_error,
                )
                return None
            nodes_by_device.append(node)
    except Exception as error:
        # Best-effort detection: any unexpected failure (bad index, missing
        # amdsmi symbols, ...) just disables the automatic lookup.
        logger.warning("Failed to get NUMA nodes for GPUs: %s", error)
        return None
    return nodes_by_device
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment