gpu_topology.py 2.25 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""GPU topology utilities."""

import json
import re

from superbench.common.utils.process import run_command


def _validate_numa_node_list(value):
    """Validate a numactl NUMA node list."""
    value = str(value)
    if not value:
        raise ValueError('empty NUMA node list')
    for item in value.split(','):
        if re.fullmatch(r'\d+', item):
            continue
        match = re.fullmatch(r'(\d+)-(\d+)', item)
        if match and int(match.group(1)) <= int(match.group(2)):
            continue
        raise ValueError('invalid NUMA node list: {}'.format(value))


def get_gpu_numa_map():
    """Query hy-smi and build the NUMA topology of every local GPU.

    Runs ``hy-smi --showtoponuma --json`` and parses its stdout as JSON. Only
    top-level entries whose key has the form ``card<N>`` are used; other keys
    are skipped.

    Returns:
        dict: Maps each integer GPU id ``N`` to a dict with two keys:
            ``numa_node`` (the ``(Topology) Numa Node`` value) and
            ``numa_affinity`` (the ``(Topology) Numa Affinity`` value, or the
            NUMA node when that field is absent). Both are stored exactly as
            they appear in the JSON, without conversion.

    Raises:
        RuntimeError: If hy-smi exits with a non-zero code, or if its output
            cannot be parsed, fails validation, or contains no card entry.
    """
    result = run_command('hy-smi --showtoponuma --json', quiet=True)
    if result.returncode != 0:
        raise RuntimeError('Failed to get GPU NUMA topology from hy-smi - message: {}'.format(result.stdout))

    topology_by_gpu = {}
    try:
        for name, entry in json.loads(result.stdout).items():
            card_match = re.fullmatch(r'card(\d+)', name)
            if card_match is None:
                # Not a per-card entry; ignore it.
                continue
            index = int(card_match.group(1))
            node = entry['(Topology) Numa Node']
            affinity = entry.get('(Topology) Numa Affinity', node)
            # Validation only: the values are stored as reported, not converted.
            int(node)
            _validate_numa_node_list(affinity)
            topology_by_gpu[index] = {'numa_node': node, 'numa_affinity': affinity}
        if not topology_by_gpu:
            raise ValueError('no card topology found')
    except Exception as e:
        # Any parsing or validation failure is reported as one RuntimeError.
        raise RuntimeError('Failed to parse GPU NUMA topology from hy-smi - message: {}'.format(e))

    return topology_by_gpu


def get_gpu_numa_affinity(gpu_id):
    """Get NUMA affinity for a GPU.

    Args:
        gpu_id (int): GPU id; anything accepted by ``int()`` is converted first.

    Returns:
        str: GPU NUMA affinity.

    Raises:
        RuntimeError: If ``gpu_id`` cannot be converted to an integer, if the
            GPU NUMA topology cannot be obtained, or if the topology has no
            entry for ``gpu_id``. The underlying error is attached as the cause.
    """
    try:
        gpu_id = int(gpu_id)
        gpu_numa_map = get_gpu_numa_map()
        if gpu_id not in gpu_numa_map:
            # A bare KeyError stringifies to just the key, which would give the
            # uninformative "gpu_id: 5, message: 5"; say what is wrong instead.
            raise ValueError(
                'GPU id not found in NUMA topology, available GPU ids: {}'.format(sorted(gpu_numa_map))
            )
        return gpu_numa_map[gpu_id]['numa_affinity']
    except Exception as e:
        # Chain explicitly so the original failure stays visible in tracebacks.
        raise RuntimeError('Failed to get GPU NUMA affinity - gpu_id: {}, message: {}'.format(gpu_id, e)) from e