# numactl.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Build numactl command fragments for runner modes."""

from omegaconf import ListConfig

# Config keyword (matched case-insensitively) that asks for a NUMA-node binding
# to be taken from the GPU's NUMA affinity instead of a literal value.
GPU_AFFINITY = 'gpu_affinity'
# Name of the shell variable that the setup command assigns and that the
# numactl arguments reference as ${...} when GPU affinity is requested.
GPU_NUMA_AFFINITY_ENV = 'SB_GPU_NUMA_AFFINITY'


def _format_template_value(value, mode):
    """Format a mode template value."""
    if isinstance(value, str):
        return value.format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)
    if isinstance(value, (list, tuple, ListConfig)):
        return ','.join(_format_template_value(item, mode) for item in value)
    return str(value)


def _is_disabled_value(value):
    """Return whether a config value disables the corresponding option."""
    return value is None or value is False or (isinstance(value, str) and value.lower() in ['none', 'null', 'false'])


def _resolve_node_value(value, mode):
    """Turn a NUMA-node binding config value into a numactl argument.

    Args:
        value: numactl node binding config value.
        mode (DictConfig): Runner mode.

    Returns:
        tuple[str | None, bool]: The argument text (None when the option is
        disabled) and a flag telling whether it refers to the GPU NUMA
        affinity shell variable.
    """
    if _is_disabled_value(value):
        return None, False

    wants_gpu_affinity = isinstance(value, str) and value.lower() == GPU_AFFINITY
    if not wants_gpu_affinity:
        return _format_template_value(value, mode), False

    # Emit a shell variable reference; the value is filled in at run time.
    return '${' + GPU_NUMA_AFFINITY_ENV + '}', True


def _resolve_cpu_value(value, mode):
    """Turn a CPU-list config value into a numactl argument.

    Args:
        value: numactl CPU binding config value.
        mode (DictConfig): Runner mode.

    Returns:
        str | None: The rendered CPU list, or None when the option is disabled.

    Raises:
        ValueError: If the value is the ``gpu_affinity`` keyword (any letter
            case), which is rejected for CPU lists.
    """
    if _is_disabled_value(value):
        return None

    is_gpu_keyword = isinstance(value, str) and value.lower() == GPU_AFFINITY
    if is_gpu_keyword:
        raise ValueError('gpu_affinity is not supported for numactl.physcpubind.')

    return _format_template_value(value, mode)


def get_local_numactl_command(mode):
    """Get setup and numactl command fragments for local mode.

    The ``numactl`` section of the mode may provide ``cpunodebind``,
    ``membind``, ``physcpubind`` and ``gpu_id``. When either node binding is
    set to ``gpu_affinity``, a setup command is produced that stores the
    output of ``sb node topo --get gpu-numa-affinity`` in a shell variable,
    and the numactl arguments reference that variable.

    Args:
        mode (DictConfig): Runner mode.

    Returns:
        tuple[str, str]: Setup command and numactl command. Both are empty
        strings when the ``numactl`` section is missing or disabled, or when
        none of the three bindings is enabled.

    Raises:
        ValueError: If ``physcpubind`` is set to ``gpu_affinity``.
    """
    if 'numactl' not in mode:
        return '', ''

    numactl_config = mode.numactl
    # Accept the same "disabled" spellings for the whole section as for the
    # individual options (None, False, 'none', 'null', 'false'); without this
    # a value such as `numactl: false` would fail on the .get() calls below.
    if _is_disabled_value(numactl_config):
        return '', ''

    cpunodebind, cpunodebind_uses_gpu = _resolve_node_value(numactl_config.get('cpunodebind', None), mode)
    membind, membind_uses_gpu = _resolve_node_value(numactl_config.get('membind', None), mode)
    physcpubind = _resolve_cpu_value(numactl_config.get('physcpubind', None), mode)
    if cpunodebind is None and membind is None and physcpubind is None:
        return '', ''

    setup_command = ''
    if cpunodebind_uses_gpu or membind_uses_gpu:
        # Treat an explicitly null gpu_id like a missing one, so the command
        # never ends up with the literal text "None" as the GPU id.
        gpu_id_template = numactl_config.get('gpu_id', None)
        if gpu_id_template is None:
            gpu_id_template = '{proc_rank}'
        gpu_id = _format_template_value(gpu_id_template, mode)
        setup_command = '{}=$(sb node topo --get gpu-numa-affinity --gpu-id {})'.format(
            GPU_NUMA_AFFINITY_ENV,
            gpu_id,
        )

    # Emit only the enabled bindings, in a fixed -N, -m, -C order.
    numactl_parts = ['numactl']
    for flag, binding in (('-N', cpunodebind), ('-m', membind), ('-C', physcpubind)):
        if binding is not None:
            numactl_parts.extend([flag, binding])

    return setup_command, ' '.join(numactl_parts)