ompmultiprocessing.py 11.4 KB
Newer Older
1
2
3
4
5
6
7
8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""OMP Aware Multiprocessing manager for running multiprocessing.Process()
Copyright (c) 2026 Red Hat Inc
Copyright (c) 2026 Cambridge Greys Ltd
"""

import os
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from collections.abc import Callable
from contextlib import contextmanager
from typing import TYPE_CHECKING

import vllm.utils.cpu_resource_utils as cr_utils
from vllm import envs
from vllm.logger import init_logger
from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.cpu_resource_utils import LogicalCPUInfo

if TYPE_CHECKING:
    from vllm.config import VllmConfig

logger = init_logger(__name__)


25
class OMPProcessManager:
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
    def __init__(self, config: "VllmConfig"):
        """Collect the settings used to bind OpenMP threads of local workers.

        On a non-CPU platform this returns immediately and sets no
        attributes.

        Args:
            config: vLLM configuration. ``parallel_config`` supplies the
                local world size, the local DP rank and the API process
                count; a truthy ``kv_transfer_config`` adds one reserved
                core.
        """
        if not current_platform.is_cpu():
            return

        parallel_config = config.parallel_config
        self.local_world_size = parallel_config.local_world_size
        self.local_dp_rank = parallel_config.data_parallel_rank_local
        # This is a bit tricky because the internal DP size
        # is always 1 for non-MoE models
        self.internal_dp_size = parallel_config._api_process_count

        # Any value other than "0" turns on multi-NUMA simulation.
        sim_flag = os.getenv("VLLM_CPU_SIM_MULTI_NUMA", "0")
        self.simulate_multi_node = sim_flag != "0"

        # The OpenMP runtime in use is detected from the LD_PRELOAD string.
        preloaded = os.environ.get("LD_PRELOAD", "")
        self.use_iomp = any(lib in preloaded for lib in ("libiomp", "libomp"))
        self.use_gomp = "libgomp" in preloaded

        # Both runtimes preloaded at once is not supported.
        assert not self.use_iomp or not self.use_gomp

        # Reserve at least 1 core (local_world_size cores on ARM) for the
        # scheduler proc, as the MP executor is always used.
        # TODO: make scheduler proc sleep when idle
        if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
            self.reserve_cpu_num = self.local_world_size
        else:
            self.reserve_cpu_num = 1
        # Reserve one more core for nixl_connector under the P/D case.
        if config.kv_transfer_config:
            self.reserve_cpu_num += 1

        # An explicit user setting always wins; only warn when it is
        # below the computed minimum.
        user_reserved = envs.VLLM_CPU_NUM_OF_RESERVED_CPU
        if user_reserved is not None:
            if self.reserve_cpu_num > user_reserved:
                warning_text = (
                    "VLLM_CPU_NUM_OF_RESERVED_CPU is less than "
                    "the minimum requirement"
                    f": {self.reserve_cpu_num} cores"
                )
                logger.warning(warning_text)
            self.reserve_cpu_num = user_reserved

        self._parse_omp_threads_bind_env()

        # Multi-NUMA simulation is only valid together with "auto" binding.
        assert self.auto_setup or not self.simulate_multi_node

    @contextmanager
    def configure_omp_envs(self, rank: int, local_rank: int):
        """Temporarily export OpenMP binding variables for one local rank.

        While the ``with`` body runs, ``os.environ`` carries
        ``OMP_NUM_THREADS`` plus the affinity variables of the detected
        runtime (IOMP, GOMP, or generic OpenMP). On exit every touched
        variable is restored to its previous value, or removed if it was
        unset before.

        Yields without touching the environment when the platform is not
        CPU or binding is skipped.

        Args:
            rank: not referenced by this method.
            local_rank: index into ``self.cpu_lists`` selecting the CPU ids
                to bind to.
        """
        if not current_platform.is_cpu() or self.skip_setup:
            yield
            return

        core_ids = [str(cpu_id) for cpu_id in self.cpu_lists[local_rank]]
        overrides = {"OMP_NUM_THREADS": str(len(core_ids))}

        if self.use_iomp:
            # IOMP: explicit proclist such as '[0,1,2,...]'
            proclist = ",".join(core_ids)
            overrides.update(
                {
                    "KMP_AFFINITY": f"granularity=fine,explicit,proclist=[{proclist}]",
                    # The time (milliseconds) that a thread should wait
                    # after completing the execution of a parallel region,
                    # before sleeping.
                    "KMP_BLOCKTIME": "1",
                    # Prevents the CPU from entering a low performance state
                    "KMP_TPAUSE": "0",
                    # Provides fine granularity parallelism
                    "KMP_FORKJOIN_BARRIER_PATTERN": "dist,dist",
                    "KMP_PLAIN_BARRIER_PATTERN": "dist,dist",
                    "KMP_REDUCTION_BARRIER_PATTERN": "dist,dist",
                }
            )
        elif self.use_gomp:
            # GOMP: space separated list such as '0 1 2 ...'
            overrides["GOMP_CPU_AFFINITY"] = " ".join(core_ids)
        else:
            # Generic OpenMP: a single place such as '{0,1,2,...}'
            overrides["OMP_PLACES"] = "{" + ",".join(core_ids) + "}"
            overrides["OMP_PROC_BIND"] = "true"

        # Snapshot the current values (None means "was not set").
        saved = {name: os.environ.get(name) for name in overrides}

        try:
            os.environ.update(overrides)
            yield
        finally:
            # Put the environment back exactly as it was.
            for name, previous in saved.items():
                if previous is None:
                    os.environ.pop(name, None)
                else:
                    os.environ[name] = previous

    def _parse_omp_threads_bind_env(self):
        """Resolve ``VLLM_CPU_OMP_THREADS_BIND`` into per-rank CPU id lists.

        Attributes set:
            skip_setup: True when the variable equals "nobind".
            auto_setup: True when the variable equals "auto".
            cpu_lists: one list of CPU ids per local rank; left empty in
                "nobind" mode.
            reserved_cpu_list: CPU ids held back for other processes; only
                filled in "auto" mode.

        Any other value is treated as user-defined lists separated by "|",
        one entry per rank. When ``self.local_dp_rank`` is not None, only
        the slice belonging to that DP rank is used.

        Raises:
            RuntimeError: in "auto" mode on a CPU architecture that has no
                selection rule here.
            AssertionError: when the number of user-defined lists does not
                match ``self.local_world_size``.
        """
        vllm_mask = envs.VLLM_CPU_OMP_THREADS_BIND
        self.skip_setup = vllm_mask == "nobind"
        self.auto_setup = vllm_mask == "auto"
        self.reserved_cpu_list = []
        self.cpu_lists = []

        if self.auto_setup:
            # Pick the rule that chooses logical CPUs within one physical
            # core; the input list is sorted by logical CPU id.
            cpu_arch = current_platform.get_cpu_architecture()
            if cpu_arch == CpuArchEnum.POWERPC:
                # For POWERPC SMT-8/4/2
                def cpu_selector(cpus):
                    return [cpu for cpu in cpus if cpu.id % 8 < 4]
            elif cpu_arch in (CpuArchEnum.X86, CpuArchEnum.S390X):
                # For x86/S390X SMT-2, use 1 logical CPU per physical core
                def cpu_selector(cpus):
                    return cpus[-1:]
            elif cpu_arch == CpuArchEnum.ARM:
                # For AArch64, no SMT, use all logical CPU
                def cpu_selector(cpus):
                    return cpus
            else:
                raise RuntimeError(f"{cpu_arch} doesn't support auto CPU binding.")

            rank_cpu_infos, reserved_infos = self._get_autobind_cpu_ids(cpu_selector)
            self.cpu_lists = [[cpu.id for cpu in infos] for infos in rank_cpu_infos]
            self.reserved_cpu_list = [cpu.id for cpu in reserved_infos]
        elif not self.skip_setup:
            # user defined CPU lists
            omp_cpuids_list = vllm_mask.split("|")
            if self.local_dp_rank is not None:
                # Rank mapping [DP, PP, TP]
                begin = self.local_dp_rank * self.local_world_size
                omp_cpuids_list = omp_cpuids_list[
                    begin : begin + self.local_world_size
                ]

            assert len(omp_cpuids_list) == self.local_world_size, (
                "Given "
                f"number of CPU id list {omp_cpuids_list} doesn't match "
                f"local world size {self.local_world_size}."
            )

            # parse CPU list strings like "5,2-4" to [5, 2, 3, 4]
            self.cpu_lists = [cr_utils.parse_id_list(s) for s in omp_cpuids_list]
        # else: "nobind" -- cpu_lists intentionally stays empty

        # Iterate over the lists that actually exist rather than over
        # range(local_world_size): in "nobind" mode cpu_lists is empty and
        # indexing it per rank would raise IndexError.
        report = ["OpenMP thread binding info: "]
        report.extend(
            f"\tlocal_rank={local_rank}, core ids={core_ids}"
            for local_rank, core_ids in enumerate(self.cpu_lists)
        )
        report.append(f"\treserved_cpus={self.reserved_cpu_list}")
        logger.info("\n".join(report))

    def _get_autobind_cpu_ids(
        self, cpu_selector: Callable[[list[LogicalCPUInfo]], list[LogicalCPUInfo]]
    ) -> tuple[list[list[LogicalCPUInfo]], list[LogicalCPUInfo]]:
        """
        Build the CPU list each local rank binds to, plus the CPUs held
        back for other processes.

        Without multi-node simulation, rank N only receives CPUs located on
        the N-th entry of the visible NUMA node list. With simulation, the
        allowed CPUs are ordered by NUMA node and cut into equally sized
        chunks, one per rank across all DP ranks.

        Args:
            cpu_selector: a callable object to select CPUs from a CPU list
            of a physical core. The input is a LogicalCPUInfo list contains
            logical CPUs of a physical CPU, sorted by the LogicalCPUInfo.id.
            A selected LogicalCPUInfo list should be returned.

        Returns:
            ``(cpu_lists_of_ranks, reserved_cpu_list)``. The reserved list
            is empty when the selected CPU count does not exceed
            ``self.reserve_cpu_num`` (a warning is logged in that case).
        """

        # this memory node list has been sliced for DP offset
        numa_nodes = cr_utils.get_visible_memory_node()
        allowed_cpus = cr_utils.get_allowed_cpu_list()

        num_ranks = self.local_world_size
        assert self.simulate_multi_node or len(numa_nodes) >= num_ranks, (
            f"Not enough allowed NUMA nodes to bind threads of "
            f"{num_ranks} local CPUWorkers. "
            f"Allowed NUMA nodes are {numa_nodes}. "
            "Please try to bind threads manually or decrease DP/TP/PP."
        )

        # Generate OMP CPU list for each rank
        per_rank_cpus = []
        selected_total = 0
        for local_rank in range(num_ranks):
            if self.simulate_multi_node:
                # Pretend every rank (across DP) owns its own node by
                # slicing the node-ordered CPU list into equal chunks.
                ranks_across_dp = num_ranks * self.internal_dp_size
                assert len(allowed_cpus) >= ranks_across_dp
                node_ordered = sorted(allowed_cpus, key=lambda x: x.numa_node)
                chunk_size = len(node_ordered) // ranks_across_dp
                assert self.local_dp_rank is not None
                global_slot = local_rank + num_ranks * self.local_dp_rank
                begin = global_slot * chunk_size
                candidates = node_ordered[begin : begin + chunk_size]
            else:
                node = numa_nodes[local_rank]
                candidates = [cpu for cpu in allowed_cpus if cpu.numa_node == node]

            # Group logical CPUs by physical core, then let cpu_selector
            # decide which siblings of each core are used.
            siblings_by_core: dict[int, list[LogicalCPUInfo]] = {}
            for cpu in candidates:
                siblings_by_core.setdefault(cpu.physical_core, []).append(cpu)

            chosen = []
            for siblings in siblings_by_core.values():
                siblings.sort(key=lambda x: x.id)
                chosen.extend(cpu_selector(siblings))

            # sort selected cores based on core id
            chosen.sort(key=lambda x: x.id)

            per_rank_cpus.append(chosen)
            selected_total += len(chosen)

        # Reserve CPUs for other processes
        if selected_total <= self.reserve_cpu_num:
            logger.warning(
                "Selected CPU core number (%s) "
                "should be greater than reserved CPU core "
                "number (%s).",
                selected_total,
                self.reserve_cpu_num,
            )
            return per_rank_cpus, []

        # Spread the reservation evenly; the remainder is taken from the
        # highest ranks (last rank first).
        base, remainder = divmod(self.reserve_cpu_num, num_ranks)
        reserve_counts = [
            base + 1 if rank >= num_ranks - remainder else base
            for rank in range(num_ranks)
        ]

        reserved = []
        for rank, count in enumerate(reserve_counts):
            if count > 0:
                # The highest-id CPUs of each rank are the ones given up.
                reserved.extend(per_rank_cpus[rank][-count:])
                per_rank_cpus[rank] = per_rank_cpus[rank][:-count]

        return per_rank_cpus, reserved