Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d02421a7
Unverified
Commit
d02421a7
authored
Apr 17, 2026
by
Li, Jiang
Committed by
GitHub
Apr 17, 2026
Browse files
[CPU] Refactor CPU affinity and memory management (#39781)
Signed-off-by:
jiang1.li
<
jiang1.li@intel.com
>
parent
b1dc87a0
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
723 additions
and
425 deletions
+723
-425
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
...ite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+19
-19
.github/workflows/macos-smoke-test.yml
.github/workflows/macos-smoke-test.yml
+1
-0
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+15
-14
csrc/cpu/torch_bindings.cpp
csrc/cpu/torch_bindings.cpp
+4
-0
csrc/cpu/utils.cpp
csrc/cpu/utils.cpp
+73
-6
docker/Dockerfile.cpu
docker/Dockerfile.cpu
+2
-1
tests/models/language/generation/test_common.py
tests/models/language/generation/test_common.py
+1
-6
vllm/config/cache.py
vllm/config/cache.py
+0
-3
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+51
-157
vllm/utils/cpu_resource_utils.py
vllm/utils/cpu_resource_utils.py
+173
-0
vllm/utils/ompmultiprocessing.py
vllm/utils/ompmultiprocessing.py
+271
-187
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+6
-15
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+1
-15
vllm/v1/worker/cpu_worker.py
vllm/v1/worker/cpu_worker.py
+106
-2
No files found.
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
View file @
d02421a7
...
...
@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then
exit
1
fi
#
echo "--- DP+TP"
#
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
#
server_pid=$!
#
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
#
vllm bench serve \
#
--backend vllm \
#
--dataset-name random \
#
--model meta-llama/Llama-3.2-3B-Instruct \
#
--num-prompts 20 \
#
--result-dir ./test_results \
#
--result-filename dp_pp.json \
#
--save-result \
#
--endpoint /v1/completions
#
kill -s SIGTERM $server_pid; wait $server_pid || true
#
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
#
if [ "$failed_req" -ne 0 ]; then
#
echo "Some requests were failed!"
#
exit 1
#
fi
echo
"--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-dp
=
2
--max-model-len
=
4096 &
server_pid
=
$!
timeout
600 bash
-c
"until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done"
||
exit
1
vllm bench serve
\
--backend
vllm
\
--dataset-name
random
\
--model
meta-llama/Llama-3.2-3B-Instruct
\
--num-prompts
20
\
--result-dir
./test_results
\
--result-filename
dp_pp.json
\
--save-result
\
--endpoint
/v1/completions
kill
-s
SIGTERM
$server_pid
;
wait
$server_pid
||
true
failed_req
=
$(
jq
'.failed'
./test_results/dp_pp.json
)
if
[
"
$failed_req
"
-ne
0
]
;
then
echo
"Some requests were failed!"
exit
1
fi
.github/workflows/macos-smoke-test.yml
View file @
d02421a7
...
...
@@ -45,6 +45,7 @@ jobs:
-
name
:
Smoke test vllm serve
run
:
|
# Start server in background
VLLM_CPU_KVCACHE_SPACE=1 \
vllm serve Qwen/Qwen3-0.6B \
--max-model-len=2K \
--load-format=dummy \
...
...
cmake/cpu_extension.cmake
View file @
d02421a7
...
...
@@ -30,6 +30,21 @@ else()
list
(
APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-DVLLM_CPU_EXTENSION"
)
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim
(
VLLM_TORCH_GOMP_SHIM_DIR
)
find_library
(
OPEN_MP
NAMES gomp
PATHS
${
VLLM_TORCH_GOMP_SHIM_DIR
}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if
(
OPEN_MP
)
set
(
ENV{LD_LIBRARY_PATH}
"
${
VLLM_TORCH_GOMP_SHIM_DIR
}
:$ENV{LD_LIBRARY_PATH}"
)
endif
()
endif
()
if
(
NOT MACOSX_FOUND
)
...
...
@@ -175,20 +190,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
if
(
NOT NPROC
)
set
(
NPROC 4
)
endif
()
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim
(
VLLM_TORCH_GOMP_SHIM_DIR
)
find_library
(
OPEN_MP
NAMES gomp
PATHS
${
VLLM_TORCH_GOMP_SHIM_DIR
}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if
(
OPEN_MP
)
set
(
ENV{LD_LIBRARY_PATH}
"
${
VLLM_TORCH_GOMP_SHIM_DIR
}
:$ENV{LD_LIBRARY_PATH}"
)
endif
()
# Fetch and populate ACL
if
(
DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY
"$ENV{ACL_ROOT_DIR}"
)
...
...
csrc/cpu/torch_bindings.cpp
View file @
d02421a7
...
...
@@ -141,6 +141,8 @@ void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc,
torch
::
Tensor
slot_mapping
,
const
int64_t
block_size
);
void
init_cpu_memory_env
(
std
::
vector
<
int64_t
>
node_ids
);
namespace
cpu_utils
{
void
eagle_prepare_inputs_padded_kernel_impl
(
const
torch
::
Tensor
&
cu_num_draft_tokens
,
...
...
@@ -431,6 +433,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"block_size) -> ()"
,
&
compute_slot_mapping_kernel_impl
);
ops
.
def
(
"init_cpu_memory_env(SymInt[] node_ids) -> ()"
,
&
init_cpu_memory_env
);
// Speculative decoding kernels
ops
.
def
(
"eagle_prepare_inputs_padded_kernel_impl(Tensor cu_num_draft_tokens, "
...
...
csrc/cpu/utils.cpp
View file @
d02421a7
...
...
@@ -13,13 +13,80 @@
#include "cpu/utils.hpp"
#ifdef VLLM_NUMA_DISABLED
std
::
string
init_cpu_threads_env
(
const
std
::
string
&
cpu_ids
)
{
return
std
::
string
(
"Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
"no effect to setup thread affinity."
);
}
void
init_cpu_memory_env
(
std
::
vector
<
int64_t
>
node_ids
)
{}
#else
void
init_cpu_memory_env
(
std
::
vector
<
int64_t
>
node_ids
)
{
// Memory node binding
if
(
numa_available
()
!=
-
1
)
{
// Concatenate all node_ids into a single comma-separated string
if
(
!
node_ids
.
empty
())
{
std
::
string
node_ids_str
;
for
(
const
int
node_id
:
node_ids
)
{
if
(
!
node_ids_str
.
empty
())
{
node_ids_str
+=
","
;
}
node_ids_str
+=
std
::
to_string
(
node_id
);
}
#endif
bitmask
*
mask
=
numa_parse_nodestring
(
node_ids_str
.
c_str
());
bitmask
*
src_mask
=
numa_get_mems_allowed
();
int
pid
=
getpid
();
if
(
mask
&&
src_mask
)
{
// move all existing pages to the specified numa node.
*
(
src_mask
->
maskp
)
=
*
(
src_mask
->
maskp
)
^
*
(
mask
->
maskp
);
int
page_num
=
numa_migrate_pages
(
pid
,
src_mask
,
mask
);
if
(
page_num
==
-
1
)
{
TORCH_WARN
(
"numa_migrate_pages failed. errno: "
+
std
::
to_string
(
errno
));
}
// Restrict memory allocation to the selected NUMA node(s).
// Enhances memory locality for the threads bound to those NUMA CPUs.
if
(
node_ids
.
size
()
>
1
)
{
errno
=
0
;
numa_set_interleave_mask
(
mask
);
if
(
errno
!=
0
)
{
TORCH_WARN
(
"numa_set_interleave_mask failed. errno: "
+
std
::
to_string
(
errno
));
}
else
{
TORCH_WARN
(
"NUMA binding: Using INTERLEAVE policy for memory "
"allocation across multiple NUMA nodes (nodes: "
+
node_ids_str
+
"). Memory allocations will be "
"interleaved across the specified NUMA nodes."
);
}
}
else
{
errno
=
0
;
numa_set_membind
(
mask
);
if
(
errno
!=
0
)
{
TORCH_WARN
(
"numa_set_membind failed. errno: "
+
std
::
to_string
(
errno
));
}
else
{
TORCH_WARN
(
"NUMA binding: Using MEMBIND policy for memory "
"allocation on the NUMA nodes ("
+
node_ids_str
+
"). Memory allocations will be "
"strictly bound to these NUMA nodes."
);
}
}
numa_set_strict
(
1
);
numa_free_nodemask
(
mask
);
numa_free_nodemask
(
src_mask
);
}
else
{
TORCH_WARN
(
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: "
+
std
::
to_string
(
errno
));
}
}
}
}
#endif // VLLM_NUMA_DISABLED
namespace
cpu_utils
{
ScratchPadManager
::
ScratchPadManager
()
:
size_
(
0
),
ptr_
(
nullptr
)
{
...
...
docker/Dockerfile.cpu
View file @
d02421a7
...
...
@@ -173,7 +173,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY --from=vllm-test-deps /vllm-workspace/requirements/test/cpu.txt requirements/test/cpu.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt && \
uv pip install -r requirements/lint.txt && \
uv pip install -r requirements/test/cpu.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"]
...
...
tests/models/language/generation/test_common.py
View file @
d02421a7
...
...
@@ -46,7 +46,7 @@ AITER_MODEL_LIST = [
),
pytest
.
param
(
"openai-community/gpt2"
,
# gpt2
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
marks
=
[
pytest
.
mark
.
core_model
],
),
pytest
.
param
(
"Milos/slovak-gpt-j-405M"
),
# gptj
pytest
.
param
(
"bigcode/tiny_starcoder_py"
),
# gpt_bigcode
...
...
@@ -143,11 +143,6 @@ def test_models(
# in parts of the operators
pytest
.
skip
(
f
"Skipping '
{
model
}
' model test with AITER kernel."
)
if
current_platform
.
is_cpu
()
and
model
in
(
"openai-community/gpt2"
,):
# These models are sensitive to the rounding error
# Fuse ops to reduce rounding
monkeypatch
.
setenv
(
"VLLM_CPU_CI_ENV"
,
"0"
)
with
hf_runner
(
model
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
...
...
vllm/config/cache.py
View file @
d02421a7
...
...
@@ -101,8 +101,6 @@ class CacheConfig:
kv_cache_dtype_skip_layers
:
list
[
str
]
=
field
(
default_factory
=
list
)
"""Layer patterns to skip KV cache quantization. Accepts layer indices
(e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window')."""
cpu_kvcache_space_bytes
:
int
|
None
=
None
"""(CPU backend only) CPU key-value cache space."""
mamba_page_size_padded
:
int
|
None
=
None
""" Optional override for mamba page size; used by hybrid mamba/attention
models to ensure exact alignment with attention page size."""
...
...
@@ -183,7 +181,6 @@ class CacheConfig:
"num_gpu_blocks_override"
,
"enable_prefix_caching"
,
"prefix_caching_hash_algo"
,
"cpu_kvcache_space_bytes"
,
"mamba_page_size_padded"
,
"user_specified_block_size"
,
"user_specified_mamba_block_size"
,
...
...
vllm/platforms/cpu.py
View file @
d02421a7
...
...
@@ -6,15 +6,16 @@ import os
import
platform
import
subprocess
import
sys
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
import
psutil
import
torch
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.utils.ompmultiprocessing
import
OMPProcessManager
from
vllm.utils.cpu_resource_utils
import
(
DEVICE_CONTROL_ENV_VAR
,
get_memory_node_info
,
)
from
vllm.utils.mem_constants
import
GiB_bytes
from
vllm.utils.torch_utils
import
is_quantized_kv_cache
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
...
...
@@ -38,49 +39,13 @@ def get_max_threads(pid=0):
raise
NotImplementedError
(
"Unsupported OS"
)
@
dataclass
class
LogicalCPUInfo
:
id
:
int
=
-
1
physical_core
:
int
=
-
1
numa_node
:
int
=
-
1
@
classmethod
def
_int
(
cls
,
value
:
str
)
->
int
:
try
:
int_value
=
int
(
value
)
except
Exception
:
int_value
=
-
1
return
int_value
@
staticmethod
def
json_decoder
(
obj_dict
:
dict
):
id
=
obj_dict
.
get
(
"cpu"
)
physical_core
=
obj_dict
.
get
(
"core"
)
numa_node
=
obj_dict
.
get
(
"node"
)
if
not
(
id
is
None
or
physical_core
is
None
or
numa_node
is
None
):
return
LogicalCPUInfo
(
id
=
LogicalCPUInfo
.
_int
(
id
),
physical_core
=
LogicalCPUInfo
.
_int
(
physical_core
),
numa_node
=
LogicalCPUInfo
.
_int
(
numa_node
),
)
else
:
return
obj_dict
class
CpuPlatform
(
Platform
):
_enum
=
PlatformEnum
.
CPU
device_name
:
str
=
"cpu"
device_type
:
str
=
"cpu"
dispatch_key
:
str
=
"CPU"
dist_backend
:
str
=
"gloo"
device_control_env_var
=
"CPU_VISIBLE_MEMORY_NODES"
omp_process_manager
=
None
# Simultaneous Multithreading (SMT) level for OpenMP:
# 4 on PowerPC, 1 on non-PowerPC architectures
smt
=
1
global_cpu_mask
=
None
simulate_numa
=
int
(
os
.
environ
.
get
(
"_SIM_MULTI_NUMA"
,
0
))
device_control_env_var
=
DEVICE_CONTROL_ENV_VAR
@
property
def
supported_dtypes
(
self
)
->
list
[
torch
.
dtype
]:
...
...
@@ -123,29 +88,9 @@ class CpuPlatform(Platform):
@
classmethod
def
get_device_total_memory
(
cls
,
device_id
:
int
=
0
)
->
int
:
from
vllm.utils.mem_constants
import
GiB_bytes
from
vllm.utils.mem_utils
import
format_gib
kv_cache_space
=
envs
.
VLLM_CPU_KVCACHE_SPACE
node_dir
=
"/sys/devices/system/node"
if
kv_cache_space
is
None
:
nodes
=
(
[
d
for
d
in
os
.
listdir
(
node_dir
)
if
d
.
startswith
(
"node"
)]
if
os
.
path
.
exists
(
node_dir
)
else
[]
)
num_numa_nodes
=
len
(
nodes
)
or
1
free_cpu_memory
=
psutil
.
virtual_memory
().
total
//
num_numa_nodes
DEFAULT_CPU_MEM_UTILIZATION
=
0.5
kv_cache_space
=
int
(
free_cpu_memory
*
DEFAULT_CPU_MEM_UTILIZATION
)
logger
.
warning_once
(
"VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache."
,
format_gib
(
kv_cache_space
),
)
else
:
kv_cache_space
*=
GiB_bytes
meminfo
=
get_memory_node_info
(
device_id
)
return
kv_cache_space
return
meminfo
.
total_memory
@
classmethod
def
set_device
(
cls
,
device
:
torch
.
device
)
->
None
:
...
...
@@ -180,6 +125,12 @@ class CpuPlatform(Platform):
"otherwise the performance is not optimized."
)
# Lagecy setting
env_key
=
"VLLM_CPU_KVCACHE_SPACE"
if
env_key
in
os
.
environ
and
os
.
environ
[
env_key
]
!=
""
:
kv_cache_space
=
int
(
os
.
environ
[
env_key
])
cache_config
.
kv_cache_memory_bytes
=
kv_cache_space
*
GiB_bytes
scheduler_config
=
vllm_config
.
scheduler_config
# async scheduling is not required on CPU
scheduler_config
.
async_scheduling
=
False
...
...
@@ -198,8 +149,6 @@ class CpuPlatform(Platform):
)
cache_config
.
cache_dtype
=
"auto"
cache_config
.
cpu_kvcache_space_bytes
=
CpuPlatform
.
get_device_total_memory
()
parallel_config
=
vllm_config
.
parallel_config
# OMP requires the MP executor to function correctly, UniProc is not
# supported as it is not possible to set the OMP environment correctly
...
...
@@ -278,21 +227,45 @@ class CpuPlatform(Platform):
os
.
environ
[
"TORCHINDUCTOR_CPP_DYNAMIC_THREADS"
]
=
"1"
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
# Intel and CLANG OpenMP setting
if
"libiomp5.so"
in
ld_preload_str
or
"libomp5"
in
ld_preload_str
:
# The time(milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
os
.
environ
[
"KMP_BLOCKTIME"
]
=
"1"
# Prevents the CPU to run into low performance state
os
.
environ
[
"KMP_TPAUSE"
]
=
"0"
# Provides fine granularity parallelism
os
.
environ
[
"KMP_FORKJOIN_BARRIER_PATTERN"
]
=
"dist,dist"
os
.
environ
[
"KMP_PLAIN_BARRIER_PATTERN"
]
=
"dist,dist"
os
.
environ
[
"KMP_REDUCTION_BARRIER_PATTERN"
]
=
"dist,dist"
cpu_architecture
=
Platform
.
get_cpu_architecture
()
if
(
platform
.
system
()
==
"Linux"
and
cpu_architecture
in
(
CpuArchEnum
.
ARM
,
CpuArchEnum
.
POWERPC
,
CpuArchEnum
.
X86
)
and
not
(
"libomp"
in
ld_preload_str
or
"libgomp"
in
ld_preload_str
or
"libiomp"
in
ld_preload_str
)
):
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
# one core will be properly utilized when we thread-bind
# See: https://github.com/vllm-project/vllm/issues/27369
# TODO: Remove once:
# https://github.com/pytorch/pytorch/issues/166087 is fixed
# We need to find the location of PyTorch's libgomp
torch_pkg
=
os
.
path
.
dirname
(
torch
.
__file__
)
site_root
=
os
.
path
.
dirname
(
torch_pkg
)
# Search both torch.libs and torch/lib - See:
# https://github.com/vllm-project/vllm/issues/30470
torch_libs_paths
=
[
os
.
path
.
join
(
site_root
,
"torch.libs"
),
os
.
path
.
join
(
torch_pkg
,
"lib"
),
]
pytorch_libgomp_so_candidates
=
[]
for
torch_libs
in
torch_libs_paths
:
pytorch_libgomp_so_candidates
.
extend
(
glob
.
glob
(
os
.
path
.
join
(
torch_libs
,
"libgomp*.so*"
))
)
if
pytorch_libgomp_so_candidates
:
pytorch_libgomp_so
=
pytorch_libgomp_so_candidates
[
0
]
if
ld_preload_str
:
ld_preload_str
+=
":"
ld_preload_str
+=
pytorch_libgomp_so
os
.
environ
[
"LD_PRELOAD"
]
=
ld_preload_str
# LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce
# memory allocation overhead
if
(
...
...
@@ -331,13 +304,6 @@ class CpuPlatform(Platform):
vllm_config
.
model_config
.
max_model_len
,
vllm_config
.
scheduler_config
.
DEFAULT_MAX_NUM_BATCHED_TOKENS
,
)
# CI specific "quick" NUMA simulation - split all available CPUs
# into a fake NUMA topology
if
os
.
environ
.
get
(
"VLLM_CPU_SIM_MULTI_NUMA"
,
None
)
is
not
None
:
os
.
environ
[
"_SIM_MULTI_NUMA"
]
=
str
(
vllm_config
.
parallel_config
.
world_size
*
vllm_config
.
parallel_config
.
_api_process_count
)
@
classmethod
def
update_block_size_for_backend
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
...
...
@@ -345,78 +311,6 @@ class CpuPlatform(Platform):
# Move that logic here so block_size is chosen by the backend.
pass
@
classmethod
def
get_omp_manager
(
cls
)
->
OMPProcessManager
:
# initialise the OMP resource management if need be and return the manager
if
cls
.
omp_process_manager
is
None
:
if
cls
.
get_cpu_architecture
()
==
CpuArchEnum
.
POWERPC
:
cls
.
smt
=
4
cls
.
omp_process_manager
=
OMPProcessManager
(
affinity
=
cls
.
get_global_cpu_mask
(),
smt
=
cls
.
smt
)
# we need to fix up the topology returned by the OMP Manager for
# simulated NUMA environments in CI
if
cls
.
simulate_numa
>
0
:
logger
.
info
(
"Adjusting numa topology to resemble at least %d nodes"
,
int
(
cls
.
simulate_numa
),
)
om
=
cls
.
omp_process_manager
while
len
(
om
.
omp_places
)
<
cls
.
simulate_numa
:
new_omp_places
=
[]
touched
=
False
for
omp_place
in
om
.
omp_places
:
if
len
(
omp_place
[
"mask"
])
>
1
:
touched
=
True
cpu_list
=
sorted
(
list
(
omp_place
[
"mask"
]))
new_omp_places
.
append
(
{
"mask"
:
set
(
cpu_list
[
0
:
int
(
len
(
cpu_list
)
/
2
)]),
"available"
:
True
,
}
)
new_omp_places
.
append
(
{
"mask"
:
set
(
cpu_list
[
int
(
len
(
cpu_list
)
/
2
)
:]),
"available"
:
True
,
}
)
if
touched
:
om
.
omp_places
=
new_omp_places
else
:
raise
ValueError
(
"Cannot split the existing NUMA topology to match "
"simulation requirements"
)
return
cls
.
omp_process_manager
@
classmethod
def
get_global_cpu_mask
(
cls
)
->
set
[
int
]:
# get global cpu mask
if
cls
.
global_cpu_mask
is
None
:
if
hasattr
(
os
,
"sched_getaffinity"
):
cls
.
global_cpu_mask
=
os
.
sched_getaffinity
(
0
)
else
:
# macOS does not support sched_getaffinity
cpu_count
=
os
.
cpu_count
()
or
1
cls
.
global_cpu_mask
=
set
(
range
(
cpu_count
))
return
cls
.
global_cpu_mask
@
classmethod
def
reserve_cpus
(
cls
,
reserve
:
set
[
int
])
->
bool
:
# remove CPUs from global mask, for now there is no "release" mechanism
if
cls
.
omp_process_manager
is
not
None
:
for
place
in
cls
.
omp_process_manager
.
omp_places
:
if
not
place
[
"available"
]:
return
False
cls
.
global_cpu_mask
=
cls
.
get_global_cpu_mask
()
-
reserve
# reinitialize OMP resource management
cls
.
omp_process_manager
=
OMPProcessManager
(
affinity
=
cls
.
global_cpu_mask
,
smt
=
cls
.
smt
)
return
True
@
classmethod
def
discover_numa_topology
(
cls
)
->
list
[
list
[
int
]]:
"""
...
...
vllm/utils/cpu_resource_utils.py
0 → 100644
View file @
d02421a7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
os
import
platform
import
subprocess
from
dataclasses
import
dataclass
from
functools
import
cache
import
psutil
import
regex
as
re
DEVICE_CONTROL_ENV_VAR
=
"CPU_VISIBLE_MEMORY_NODES"
@
dataclass
class
LogicalCPUInfo
:
id
:
int
=
-
1
physical_core
:
int
=
-
1
numa_node
:
int
=
-
1
@
classmethod
def
_int
(
cls
,
value
:
str
)
->
int
:
try
:
int_value
=
int
(
value
)
except
Exception
:
int_value
=
-
1
return
int_value
@
staticmethod
def
json_decoder
(
obj_dict
:
dict
):
id
=
obj_dict
.
get
(
"cpu"
)
physical_core
=
obj_dict
.
get
(
"core"
)
numa_node
=
obj_dict
.
get
(
"node"
)
if
not
(
id
is
None
or
physical_core
is
None
or
numa_node
is
None
):
return
LogicalCPUInfo
(
id
=
LogicalCPUInfo
.
_int
(
id
),
physical_core
=
LogicalCPUInfo
.
_int
(
physical_core
),
numa_node
=
LogicalCPUInfo
.
_int
(
numa_node
),
)
else
:
return
obj_dict
@
dataclass
class
MemoryNodeInfo
:
total_memory
:
int
=
-
1
available_memory
:
int
=
-
1
def
get_memory_affinity
(
pid
:
int
=
0
)
->
list
[
int
]:
pid
=
os
.
getpid
()
if
pid
==
0
else
pid
path
=
f
"/proc/
{
pid
}
/status"
with
open
(
path
)
as
f
:
for
line
in
f
:
if
line
.
startswith
(
"Mems_allowed_list:"
):
# Extract the string part (e.g., "0-1,3")
raw_list
=
line
.
split
(
":"
)[
1
].
strip
()
return
parse_id_list
(
raw_list
)
return
[]
def
parse_id_list
(
raw_str
:
str
)
->
list
[
int
]:
"""Parses strings like '0-2,4,7-8' into [0, 1, 2, 4, 7, 8]"""
result
:
list
[
int
]
=
[]
if
not
raw_str
:
return
result
for
part
in
raw_str
.
split
(
","
):
if
"-"
in
part
:
start
,
end
=
map
(
int
,
part
.
split
(
"-"
))
result
.
extend
(
range
(
start
,
end
+
1
))
else
:
result
.
append
(
int
(
part
))
return
sorted
(
list
(
set
(
result
)))
def
get_memory_node_info
(
node_id
:
int
=
0
)
->
MemoryNodeInfo
:
if
platform
.
system
()
==
"Darwin"
:
# MacOS has no memory node
return
MemoryNodeInfo
(
total_memory
=
psutil
.
virtual_memory
().
total
,
available_memory
=
psutil
.
virtual_memory
().
available
,
)
meminfo_path
=
f
"/sys/devices/system/node/node
{
node_id
}
/meminfo"
if
not
os
.
path
.
exists
(
meminfo_path
):
raise
RuntimeError
(
f
"
{
meminfo_path
}
doesn't exit."
)
meminfo
=
{}
with
open
(
meminfo_path
)
as
f
:
for
line
in
f
:
# Each line looks like: "Node 0 MemTotal: 97421888 kB"
parts
=
line
.
split
()
key
=
parts
[
2
].
rstrip
(
":"
)
# convert to Bytes
value
=
int
(
parts
[
3
])
*
1024
meminfo
[
key
]
=
value
total_memory
=
meminfo
[
"MemTotal"
]
free_memory
=
meminfo
[
"MemFree"
]
active_file_memory
=
meminfo
[
"Active(file)"
]
inactive_file_memory
=
meminfo
[
"Inactive(file)"
]
reclaimable_memory
=
meminfo
[
"SReclaimable"
]
available_memory
=
(
free_memory
+
active_file_memory
+
inactive_file_memory
+
reclaimable_memory
)
return
MemoryNodeInfo
(
total_memory
=
total_memory
,
available_memory
=
available_memory
,
)
def
get_allowed_cpu_list
()
->
list
[
LogicalCPUInfo
]:
cpu_list
=
_get_cpu_list
()
if
platform
.
system
()
==
"Darwin"
:
return
cpu_list
global_allowed_cpu_id_list
=
os
.
sched_getaffinity
(
0
)
logical_cpu_list
=
[
x
for
x
in
cpu_list
if
x
.
id
in
global_allowed_cpu_id_list
]
return
logical_cpu_list
def
get_visible_memory_node
()
->
list
[
int
]:
if
platform
.
system
()
==
"Darwin"
:
return
[
0
]
allowed_memory_node_list
=
get_memory_affinity
()
env_key
=
DEVICE_CONTROL_ENV_VAR
if
(
(
"VLLM_CPU_SIM_MULTI_NUMA"
not
in
os
.
environ
)
and
env_key
in
os
.
environ
and
os
.
environ
[
env_key
]
!=
""
):
visible_nodes
=
[
int
(
s
)
for
s
in
os
.
environ
[
env_key
].
split
(
","
)]
visible_nodes
=
[
node
for
node
in
visible_nodes
if
node
in
allowed_memory_node_list
]
return
visible_nodes
return
allowed_memory_node_list
@
cache
def
_get_cpu_list
()
->
list
[
LogicalCPUInfo
]:
if
platform
.
system
()
==
"Darwin"
:
# For MacOS, no user-level CPU affinity and SMT, return all CPUs
cpu_count
=
os
.
cpu_count
()
assert
cpu_count
return
[
LogicalCPUInfo
(
i
,
i
,
0
)
for
i
in
range
(
cpu_count
)]
lscpu_output
=
subprocess
.
check_output
(
"lscpu -J -e=CPU,CORE,NODE"
,
shell
=
True
,
text
=
True
)
# For platform without NUMA, replace '-' to '0'
lscpu_output
=
re
.
sub
(
r
'"node":\s*-\s*(,|\n)'
,
r
'"node": 0\1'
,
lscpu_output
)
logical_cpu_list
:
list
[
LogicalCPUInfo
]
=
json
.
loads
(
lscpu_output
,
object_hook
=
LogicalCPUInfo
.
json_decoder
)[
"cpus"
]
# Filter CPUs with invalid attributes
logical_cpu_list
=
[
x
for
x
in
logical_cpu_list
if
-
1
not
in
(
x
.
id
,
x
.
physical_core
,
x
.
numa_node
)
]
return
logical_cpu_list
vllm/utils/ompmultiprocessing.py
View file @
d02421a7
...
...
@@ -5,196 +5,280 @@ Copyright (c) 2026 Red Hat Inc
Copyright (c) 2026 Cambridge Greys Ltd
"""
import
json
import
os
import
platform
import
subprocess
def
_int
(
arg
):
"""Relaxed parsing of ints which handles a - instead of a number.
The lscpu json may contain that for nodes in some cases. If that
is the case we parse it to zero
"""
try
:
if
int
(
arg
)
>=
0
:
return
int
(
arg
)
except
ValueError
:
pass
return
0
def
parse_mask
(
mask
):
"""Expand a X-Y,Z list"""
result
=
[]
for
token
in
mask
.
split
(
","
):
try
:
start
,
finish
=
token
.
split
(
"-"
)
if
int
(
start
)
>
int
(
finish
):
raise
IndexError
(
"Invalid Indexes for cpu ranges"
)
for
cpu
in
range
(
int
(
start
),
int
(
finish
)
+
1
):
result
.
append
(
cpu
)
except
ValueError
:
result
.
append
(
int
(
token
))
return
set
(
result
)
def
_get_default_affinity
()
->
set
[
int
]:
"""Get the set of CPUs the process is allowed to run on."""
if
hasattr
(
os
,
"sched_getaffinity"
):
return
os
.
sched_getaffinity
(
0
)
# macOS does not support sched_getaffinity; fall back to cpu_count
cpu_count
=
os
.
cpu_count
()
or
1
return
set
(
range
(
cpu_count
))
def
_get_cpu_topology_json
()
->
bytes
:
"""Get CPU topology as JSON.
On Linux this uses ``lscpu -Je``. On other platforms (e.g. macOS) we
synthesize a simple topology where every logical CPU is its own core
on NUMA node 0, which is sufficient for the OMP place-list builder.
"""
if
platform
.
system
()
==
"Linux"
:
return
subprocess
.
run
([
"lscpu"
,
"-Je"
],
check
=
True
,
capture_output
=
True
).
stdout
# Fallback for non-Linux (macOS, etc.)
cpu_count
=
os
.
cpu_count
()
or
1
cpus
=
[]
for
i
in
range
(
cpu_count
):
cpus
.
append
({
"cpu"
:
str
(
i
),
"core"
:
str
(
i
),
"node"
:
"0"
})
return
json
.
dumps
({
"cpus"
:
cpus
}).
encode
()
def
enumerate_resources
(
resource_map
,
mask
=
None
,
allowed
=
None
):
"""Enumerate system resources"""
if
allowed
is
None
:
allowed
=
_get_default_affinity
()
if
mask
is
not
None
:
allowed
=
allowed
&
mask
try
:
allowed_nodes
=
parse_mask
(
os
.
environ
[
"CPU_VISIBLE_MEMORY_NODES"
])
except
KeyError
:
allowed_nodes
=
None
lscpu
:
dict
[
str
,
dict
]
=
{
"cpus"
:
{},
"cores"
:
{},
"nodes"
:
{}}
for
cpu
in
resource_map
[
"cpus"
]:
cpunum
=
int
(
cpu
[
"cpu"
])
if
(
cpunum
in
allowed
and
cpunum
>=
0
and
(
allowed_nodes
is
None
or
_int
(
cpu
[
"node"
])
in
allowed_nodes
)
):
lscpu
[
"cpus"
][
cpunum
]
=
[
cpu
]
core
=
_int
(
cpu
[
"core"
])
if
lscpu
[
"cores"
].
get
(
core
,
None
)
is
None
:
lscpu
[
"cores"
][
core
]
=
[
cpu
]
else
:
lscpu
[
"cores"
][
core
].
append
(
cpu
)
node
=
_int
(
cpu
[
"node"
])
if
lscpu
[
"nodes"
].
get
(
node
,
None
)
is
None
:
lscpu
[
"nodes"
][
node
]
=
[
cpu
]
else
:
lscpu
[
"nodes"
][
node
].
append
(
cpu
)
return
lscpu
def
produce_cpu_list
(
cpus
,
smt
=
1
):
"""Produce a CPU list with/without SMT pairs - main cpu list case"""
mask
:
list
[
int
]
=
[]
for
key
,
value
in
cpus
.
items
():
exists
=
0
for
cpu
in
mask
:
if
cpu
==
value
[
0
][
"core"
]:
exists
+=
1
break
if
exists
<
smt
:
mask
.
append
(
int
(
key
))
return
{
"mask"
:
set
(
mask
),
"available"
:
True
}
def
produce_cpu_sublist
(
scpus
,
smt
=
1
):
"""Produce a CPU list with/without SMT pairs - resource leaf case"""
cpu_list
:
list
[
dict
]
=
[]
for
value
in
scpus
:
exists
=
0
for
cpu
in
cpu_list
:
if
int
(
cpu
[
"core"
])
==
int
(
value
[
"core"
]):
exists
+=
1
break
if
exists
<
smt
:
cpu_list
.
append
(
value
)
mask
=
[]
for
cpu
in
cpu_list
:
mask
.
append
(
int
(
cpu
[
"cpu"
]))
return
{
"mask"
:
set
(
mask
),
"available"
:
True
}
def
create_omp_places
(
resources
,
strategy
,
smt
=
True
):
"""Parse CPU topology and generate possible CPU masks"""
omp_places
=
[]
if
strategy
==
"all"
:
omp_places
.
append
(
produce_cpu_list
(
resources
[
"cpus"
],
smt
))
elif
strategy
==
"cores"
:
for
value
in
resources
[
"cores"
].
values
():
omp_places
.
append
(
produce_cpu_sublist
(
value
,
smt
))
elif
strategy
==
"nodes"
:
for
value
in
resources
[
"nodes"
].
values
():
omp_places
.
append
(
produce_cpu_sublist
(
value
,
smt
))
else
:
raise
NotImplementedError
(
"Unknown strategy"
)
return
omp_places
# pylint: disable=too-few-public-methods
from
collections.abc
import
Callable
from
contextlib
import
contextmanager
from
typing
import
TYPE_CHECKING
import
vllm.utils.cpu_resource_utils
as
cr_utils
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.platforms
import
CpuArchEnum
,
current_platform
from
vllm.utils.cpu_resource_utils
import
LogicalCPUInfo
if
TYPE_CHECKING
:
from
vllm.config
import
VllmConfig
logger
=
init_logger
(
__name__
)
class
OMPProcessManager
:
"""OMP aware wrapper to run mp Process()"""
def
__init__
(
self
,
strategy
=
"nodes"
,
smt
=
1
,
mock
=
None
,
affinity
=
None
):
self
.
strategy
=
strategy
self
.
smt
=
smt
self
.
omp_places
=
[]
vllm_mask
=
os
.
environ
.
get
(
"VLLM_CPU_OMP_THREADS_BIND"
,
None
)
self
.
setup_omp
=
vllm_mask
!=
"nobind"
if
self
.
setup_omp
:
omp_places
=
[]
if
vllm_mask
is
not
None
:
masks
=
[]
for
spec
in
vllm_mask
.
split
(
"|"
):
masks
.
append
(
parse_mask
(
spec
))
def
__init__
(
self
,
config
:
"VllmConfig"
):
if
not
current_platform
.
is_cpu
():
return
self
.
local_world_size
=
config
.
parallel_config
.
local_world_size
self
.
local_dp_rank
=
config
.
parallel_config
.
data_parallel_rank_local
# This is a bit tricky because the internal DP size
# is always 1 for non-MoE models
self
.
internal_dp_size
=
config
.
parallel_config
.
_api_process_count
self
.
simulate_multi_node
=
os
.
environ
.
get
(
"VLLM_CPU_SIM_MULTI_NUMA"
,
"0"
)
!=
"0"
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
self
.
use_iomp
=
"libiomp"
in
ld_preload_str
or
"libomp"
in
ld_preload_str
self
.
use_gomp
=
"libgomp"
in
ld_preload_str
assert
not
(
self
.
use_iomp
and
self
.
use_gomp
)
# at least reserve 1/local_world_size(for ARM) core for scheduler
# proc as always use MP executor
# TODO: make scheduler proc sleep when idle
self
.
reserve_cpu_num
=
(
self
.
local_world_size
if
current_platform
.
get_cpu_architecture
()
==
CpuArchEnum
.
ARM
else
1
)
# reserve at one more core for nixl_connector under p/d case
if
config
.
kv_transfer_config
:
self
.
reserve_cpu_num
+=
1
if
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
is
not
None
:
if
self
.
reserve_cpu_num
>
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
:
msg
=
(
f
"VLLM_CPU_NUM_OF_RESERVED_CPU is less than "
"the minimum requirement"
f
":
{
self
.
reserve_cpu_num
}
cores"
)
logger
.
warning
(
msg
=
msg
)
self
.
reserve_cpu_num
=
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
self
.
_parse_omp_threads_bind_env
()
assert
not
self
.
simulate_multi_node
or
self
.
auto_setup
@
contextmanager
def
configure_omp_envs
(
self
,
rank
:
int
,
local_rank
:
int
):
if
not
current_platform
.
is_cpu
()
or
self
.
skip_setup
:
yield
return
envs_dict
=
{}
cpu_list
=
[
str
(
i
)
for
i
in
self
.
cpu_lists
[
local_rank
]]
envs_dict
[
"OMP_NUM_THREADS"
]
=
str
(
len
(
cpu_list
))
if
self
.
use_iomp
:
# set IOMP envs
cpu_list_str
=
","
.
join
(
cpu_list
)
envs_dict
[
"KMP_AFFINITY"
]
=
(
f
"granularity=fine,explicit,proclist=[
{
cpu_list_str
}
]"
)
# The time(milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
envs_dict
[
"KMP_BLOCKTIME"
]
=
"1"
# Prevents the CPU to run into low performance state
envs_dict
[
"KMP_TPAUSE"
]
=
"0"
# Provides fine granularity parallelism
envs_dict
[
"KMP_FORKJOIN_BARRIER_PATTERN"
]
=
"dist,dist"
envs_dict
[
"KMP_PLAIN_BARRIER_PATTERN"
]
=
"dist,dist"
envs_dict
[
"KMP_REDUCTION_BARRIER_PATTERN"
]
=
"dist,dist"
elif
self
.
use_gomp
:
# set GOMP envs
# likes '0 1 2 ...'
cpu_list_str
=
" "
.
join
(
cpu_list
)
envs_dict
[
"GOMP_CPU_AFFINITY"
]
=
cpu_list_str
else
:
# set OMP envs
# likes '{0,1,2,...}'
cpu_list_str
=
","
.
join
(
cpu_list
)
envs_dict
[
"OMP_PLACES"
]
=
f
"{{
{
cpu_list_str
}
}}"
envs_dict
[
"OMP_PROC_BIND"
]
=
"true"
# backup envs
old_envs_dict
=
{}
for
k
in
envs_dict
:
old_envs_dict
[
k
]
=
os
.
environ
.
get
(
k
)
try
:
# set envs
for
k
,
v
in
envs_dict
.
items
():
os
.
environ
[
k
]
=
v
yield
finally
:
# restore old envs
for
k
,
v
in
old_envs_dict
.
items
():
# type: ignore
if
v
is
None
:
os
.
environ
.
pop
(
k
,
None
)
else
:
os
.
environ
[
k
]
=
v
def
_parse_omp_threads_bind_env
(
self
):
vllm_mask
=
envs
.
VLLM_CPU_OMP_THREADS_BIND
self
.
skip_setup
=
vllm_mask
==
"nobind"
self
.
auto_setup
=
vllm_mask
==
"auto"
self
.
reserved_cpu_list
=
[]
self
.
cpu_lists
=
[]
if
self
.
auto_setup
:
# auto generate CPU lists
cpu_arch
=
current_platform
.
get_cpu_architecture
()
if
cpu_arch
==
CpuArchEnum
.
POWERPC
:
# For POWERPC SMT-8/4/2
cpu_list
,
reserve_list
=
self
.
_get_autobind_cpu_ids
(
lambda
cpus
:
[
cpu
for
cpu
in
cpus
if
cpu
.
id
%
8
<
4
]
)
elif
cpu_arch
in
(
CpuArchEnum
.
X86
,
CpuArchEnum
.
S390X
):
# For x86/S390X SMT-2, use 1 logical CPU per physical core
cpu_list
,
reserve_list
=
self
.
_get_autobind_cpu_ids
(
lambda
cpus
:
cpus
[
-
1
:]
)
elif
cpu_arch
==
CpuArchEnum
.
ARM
:
# For AArch64, no SMT, use all logical CPU
cpu_list
,
reserve_list
=
self
.
_get_autobind_cpu_ids
(
lambda
cpus
:
cpus
)
else
:
masks
=
[
None
]
if
mock
is
None
:
data
=
_get_cpu_topology_json
()
cpu_list
,
reserve_list
=
[],
[]
raise
RuntimeError
(
f
"
{
cpu_arch
}
doesn't support auto CPU binding."
)
for
item
in
cpu_list
:
self
.
cpu_lists
.
append
([
x
.
id
for
x
in
item
])
self
.
reserved_cpu_list
=
[
x
.
id
for
x
in
reserve_list
]
elif
not
self
.
skip_setup
:
# user defined CPU lists
omp_cpuids_list
=
vllm_mask
.
split
(
"|"
)
if
self
.
local_dp_rank
is
not
None
:
local_dp_rank
=
self
.
local_dp_rank
world_size
=
self
.
local_world_size
# Rank mapping [DP, PP, TP]
omp_cpuids_list
=
omp_cpuids_list
[
local_dp_rank
*
world_size
:
(
local_dp_rank
+
1
)
*
world_size
]
assert
len
(
omp_cpuids_list
)
==
self
.
local_world_size
,
(
"Given "
f
"number of CPU id list
{
omp_cpuids_list
}
doesn't match "
f
"local world size
{
self
.
local_world_size
}
."
)
# parse CPU list strings like "5,2-4" to [5, 2, 3, 4]
self
.
cpu_lists
=
[
cr_utils
.
parse_id_list
(
s
)
for
s
in
omp_cpuids_list
]
else
:
# skip
self
.
cpu_lists
=
[]
msg
=
"OpenMP thread binding info:
\n
"
for
i
in
range
(
self
.
local_world_size
):
msg
+=
f
"
\t
local_rank=
{
i
}
, core ids=
{
self
.
cpu_lists
[
i
]
}
\n
"
msg
+=
f
"
\t
reserved_cpus=
{
self
.
reserved_cpu_list
}
"
logger
.
info
(
msg
)
def
_get_autobind_cpu_ids
(
self
,
cpu_selector
:
Callable
[[
list
[
LogicalCPUInfo
]],
list
[
LogicalCPUInfo
]]
)
->
tuple
[
list
[
list
[
LogicalCPUInfo
]],
list
[
LogicalCPUInfo
]]:
"""
Return CPU ids to bind based on NUMA nodes, and CPU ids reserved for
other processes.
Currently for rank N, only CPU ids on the N-th node in available NUMA
node list will be selected.
Args:
cpu_selector: a callable object to select CPUs from a CPU list
of a physical core. The input is a LogicalCPUInfo list contains
logical CPUs of a physical CPU, sorted by the LogicalCPUInfo.id.
A selected LogicalCPUInfo list should be returned.
"""
# this memory node list has been sliced for DP offset
allowed_numa_nodes
=
cr_utils
.
get_visible_memory_node
()
logical_cpu_list
=
cr_utils
.
get_allowed_cpu_list
()
local_world_size
=
self
.
local_world_size
assert
(
len
(
allowed_numa_nodes
)
>=
local_world_size
or
self
.
simulate_multi_node
),
(
f
"Not enough allowed NUMA nodes to bind threads of "
f
"
{
local_world_size
}
local CPUWorkers. "
f
"Allowed NUMA nodes are
{
allowed_numa_nodes
}
. "
"Please try to bind threads manually or decrease DP/TP/PP."
)
# Generate OMP CPU list for each rank
cpu_lists_of_ranks
=
[]
reserved_cpu_list
=
[]
total_cpu_num
=
0
for
local_rank
in
range
(
self
.
local_world_size
):
if
not
self
.
simulate_multi_node
:
selected_numa_node
=
allowed_numa_nodes
[
local_rank
]
selected_logical_cpu_list
=
[
x
for
x
in
logical_cpu_list
if
x
.
numa_node
==
selected_numa_node
]
else
:
with
open
(
mock
,
mode
=
"rb"
)
as
jf
:
data
=
jf
.
read
()
lscpu
=
json
.
loads
(
data
)
for
mask
in
masks
:
resources
=
enumerate_resources
(
lscpu
,
mask
,
affinity
)
omp_places
.
extend
(
create_omp_places
(
resources
,
strategy
,
smt
))
self
.
omp_places
=
sorted
(
omp_places
,
key
=
lambda
p
:
"{:04d}-{:04d}"
.
format
(
len
(
p
[
"mask"
]),
max
(
p
[
"mask"
])),
reverse
=
True
,
world_size_across_dp
=
self
.
local_world_size
*
self
.
internal_dp_size
assert
len
(
logical_cpu_list
)
>=
world_size_across_dp
selected_logical_cpu_list
=
sorted
(
logical_cpu_list
,
key
=
lambda
x
:
x
.
numa_node
)
sim_cpu_num_per_node
=
(
len
(
selected_logical_cpu_list
)
//
world_size_across_dp
)
assert
self
.
local_dp_rank
is
not
None
start_idx
=
(
local_rank
+
self
.
local_world_size
*
self
.
local_dp_rank
)
*
sim_cpu_num_per_node
selected_logical_cpu_list
=
selected_logical_cpu_list
[
start_idx
:
(
start_idx
+
sim_cpu_num_per_node
)
]
# Select logical CPUs on same physical cores via cpu_selector
core_to_cpus
:
dict
[
int
,
list
[
LogicalCPUInfo
]]
=
{}
for
cpu_info
in
selected_logical_cpu_list
:
if
cpu_info
.
physical_core
not
in
core_to_cpus
:
core_to_cpus
[
cpu_info
.
physical_core
]
=
[]
core_to_cpus
[
cpu_info
.
physical_core
].
append
(
cpu_info
)
selected_logical_cpu_list
=
[]
for
cpu_list
in
core_to_cpus
.
values
():
cpu_list
=
sorted
(
cpu_list
,
key
=
lambda
x
:
x
.
id
)
selected_logical_cpu_list
.
extend
(
cpu_selector
(
cpu_list
))
# sort selected cores based on core id
selected_logical_cpu_list
=
sorted
(
selected_logical_cpu_list
,
key
=
lambda
x
:
x
.
id
)
def
run
(
self
,
what
,
*
args
,
**
kwargs
):
"""Run arg with correct OMP environment"""
if
self
.
setup_omp
:
for
place
in
self
.
omp_places
:
if
place
[
"available"
]:
reserve
=
int
(
os
.
environ
.
get
(
"VLLM_CPU_NUM_OF_RESERVED_CPU"
,
0
))
place
[
"available"
]
=
False
# pylint: disable=consider-using-f-string
os
.
environ
[
"OMP_PLACES"
]
=
"{}"
.
format
(
place
[
"mask"
])
os
.
environ
[
"OMP_NUM_THREADS"
]
=
"{}"
.
format
(
len
(
place
[
"mask"
])
-
reserve
)
os
.
environ
[
"OMP_PROC_BIND"
]
=
"TRUE"
return
what
(
*
args
,
**
kwargs
)
raise
IndexError
(
"Out of OMP places"
)
return
what
(
*
args
,
**
kwargs
)
cpu_lists_of_ranks
.
append
(
selected_logical_cpu_list
)
total_cpu_num
+=
len
(
selected_logical_cpu_list
)
# Reserve CPUs for other processes
if
total_cpu_num
<=
self
.
reserve_cpu_num
:
logger
.
warning
(
"Selected CPU core number (%s) "
"should be greater than reserved CPU core "
"number (%s)."
,
total_cpu_num
,
self
.
reserve_cpu_num
,
)
return
cpu_lists_of_ranks
,
[]
reserve_num_per_rank
=
[
self
.
reserve_cpu_num
//
self
.
local_world_size
]
*
self
.
local_world_size
# last rank first
for
i
in
range
(
self
.
local_world_size
-
1
,
self
.
local_world_size
-
1
-
self
.
reserve_cpu_num
%
self
.
local_world_size
,
-
1
,
):
reserve_num_per_rank
[
i
]
+=
1
for
i
in
range
(
self
.
local_world_size
):
num
=
reserve_num_per_rank
[
i
]
if
num
>
0
:
reserved_cpu_list
.
extend
(
cpu_lists_of_ranks
[
i
][
-
num
:])
cpu_lists_of_ranks
[
i
]
=
cpu_lists_of_ranks
[
i
][:
-
num
]
return
cpu_lists_of_ranks
,
reserved_cpu_list
vllm/v1/executor/multiproc_executor.py
View file @
d02421a7
...
...
@@ -51,6 +51,7 @@ from vllm.utils.network_utils import (
get_loopback_ip
,
get_open_port
,
)
from
vllm.utils.ompmultiprocessing
import
OMPProcessManager
from
vllm.utils.system_utils
import
(
_maybe_force_spawn
,
decorate_logs
,
...
...
@@ -169,24 +170,14 @@ class MultiprocExecutor(Executor):
[]
if
context
.
get_start_method
()
==
"fork"
else
None
)
# For CPU backend only, to setup OpenMP threads affinity
cpu_omp_manager
=
OMPProcessManager
(
self
.
vllm_config
)
for
local_rank
in
range
(
self
.
local_world_size
):
global_rank
=
global_start_rank
+
local_rank
is_driver_worker
=
self
.
_is_driver_worker
(
global_rank
)
if
current_platform
.
is_cpu
():
om
=
current_platform
.
get_omp_manager
()
logger
.
info
(
"Configured OMP PLACES %s"
,
str
(
om
.
omp_places
))
unready_worker_handle
=
om
.
run
(
WorkerProc
.
make_worker_process
,
vllm_config
=
self
.
vllm_config
,
local_rank
=
local_rank
,
rank
=
global_rank
,
distributed_init_method
=
distributed_init_method
,
input_shm_handle
=
scheduler_output_handle
,
shared_worker_lock
=
shared_worker_lock
,
is_driver_worker
=
is_driver_worker
,
inherited_fds
=
inherited_fds
,
)
else
:
with
cpu_omp_manager
.
configure_omp_envs
(
rank
=
global_rank
,
local_rank
=
local_rank
):
unready_worker_handle
=
WorkerProc
.
make_worker_process
(
vllm_config
=
self
.
vllm_config
,
local_rank
=
local_rank
,
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
d02421a7
...
...
@@ -116,21 +116,7 @@ class CPUModelRunner(GPUModelRunner):
logger
.
info
(
"Warming up model for the compilation..."
)
# Only generate graph for the generic shape
with
_set_global_compilation_settings
(
self
.
vllm_config
):
self
.
_dummy_run
(
min
(
max
(
16
,
self
.
max_num_reqs
),
self
.
scheduler_config
.
max_num_batched_tokens
,
)
)
# Warm up drafter for speculative decoding
if
self
.
speculative_config
and
(
self
.
speculative_config
.
uses_draft_model
()):
from
vllm.v1.spec_decode.draft_model
import
DraftModelProposer
if
isinstance
(
self
.
drafter
,
(
DraftModelProposer
)):
logger
.
info
(
"Warming up drafter model..."
)
self
.
drafter
.
dummy_run
(
max
(
16
,
self
.
max_num_reqs
))
self
.
profile_run
()
logger
.
info
(
"Warming up done."
)
def
initialize_kv_cache
(
...
...
vllm/v1/worker/cpu_worker.py
View file @
d02421a7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
import
os
import
sys
from
typing
import
Any
import
psutil
import
torch
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.platforms
import
CpuArchEnum
,
current_platform
from
vllm.profiler.wrapper
import
TorchProfilerWrapper
from
vllm.utils.cpu_resource_utils
import
(
get_allowed_cpu_list
,
get_memory_node_info
,
get_visible_memory_node
,
)
from
vllm.utils.mem_utils
import
format_gib
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.cpu_model_runner
import
CPUModelRunner
from
vllm.v1.worker.gpu_worker
import
Worker
,
init_worker_distributed_environment
...
...
@@ -27,6 +35,46 @@ class CPUWorker(Worker):
distributed_init_method
:
str
,
is_driver_worker
:
bool
=
False
,
):
# TODO: use numactl for process setup
# TODO: optimize for `interleaved` policy
# Bind memory node
allowed_memory_nodes
=
get_visible_memory_node
()
allowed_cpu_list
=
get_allowed_cpu_list
()
cpu_core
=
allowed_cpu_list
[
0
]
# TODO: some CI hosts are not correctly set, change to assertion
# after fix
if
cpu_core
.
numa_node
not
in
allowed_memory_nodes
:
logger
.
warning
(
"Node %s is not in available memory nodes %s."
,
cpu_core
.
numa_node
,
allowed_memory_nodes
,
)
torch
.
ops
.
_C
.
init_cpu_memory_env
([
cpu_core
.
numa_node
])
memory_status
=
get_memory_node_info
(
cpu_core
.
numa_node
)
memory_fraction
=
vllm_config
.
cache_config
.
gpu_memory_utilization
self
.
requested_cpu_memory
=
math
.
ceil
(
memory_status
.
total_memory
*
memory_fraction
)
available_memory
=
memory_status
.
available_memory
if
(
vllm_config
.
cache_config
.
kv_cache_memory_bytes
is
None
and
self
.
requested_cpu_memory
>
available_memory
):
raise
ValueError
(
f
"Available memory on node
{
cpu_core
.
numa_node
}
"
f
"(
{
format_gib
(
available_memory
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
GiB) on startup "
f
"is less than desired CPU memory utilization "
f
"(
{
vllm_config
.
cache_config
.
gpu_memory_utilization
}
, "
f
"
{
format_gib
(
self
.
requested_cpu_memory
)
}
GiB). "
"Decrease --gpu-memory-utilization"
f
" or reduce CPU memory used by other processes."
)
super
().
__init__
(
vllm_config
,
local_rank
,
...
...
@@ -103,13 +151,69 @@ class CPUWorker(Worker):
pass
def
determine_available_memory
(
self
)
->
int
:
return
self
.
cache_config
.
cpu_kvcache_space_bytes
or
0
self
.
model_runner
.
warming_up_model
()
allowed_cpu_list
=
get_allowed_cpu_list
()
cpu_core
=
allowed_cpu_list
[
0
]
memory_status
=
get_memory_node_info
(
cpu_core
.
numa_node
)
available_memory
=
memory_status
.
available_memory
explicit_kv_cache_size
=
self
.
cache_config
.
kv_cache_memory_bytes
kv_cache_size
=
None
msg
=
None
if
explicit_kv_cache_size
is
not
None
:
if
explicit_kv_cache_size
>
available_memory
:
raise
ValueError
(
f
"Available memory on node
{
cpu_core
.
numa_node
}
"
f
"(
{
format_gib
(
available_memory
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
GiB) on kv cache"
f
" allocation is less than requested memory for kv "
f
"(
{
format_gib
(
explicit_kv_cache_size
)
}
GiB). "
"Decrease --kv-cache-memory-bytes, VLLM_CPU_KVCACHE_SPACE, "
"or reduce CPU memory used by other processes."
)
kv_cache_size
=
explicit_kv_cache_size
msg
=
(
f
"Explicitly set (
{
format_gib
(
kv_cache_size
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
) GiB for KV cache "
f
"on node
{
cpu_core
.
numa_node
}
."
)
else
:
consumed_memory
=
psutil
.
Process
(
os
.
getpid
()).
memory_info
().
rss
requested_memory_for_kv
=
int
(
self
.
requested_cpu_memory
-
consumed_memory
)
if
(
requested_memory_for_kv
<=
0
or
requested_memory_for_kv
>
available_memory
):
raise
ValueError
(
f
"Available memory on node
{
cpu_core
.
numa_node
}
"
f
"(
{
format_gib
(
available_memory
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
GiB) on kv cache"
f
" allocation is less than requested memory for kv "
f
"(
{
format_gib
(
requested_memory_for_kv
)
}
/"
f
"
{
format_gib
(
self
.
requested_cpu_memory
)
}
GiB). "
"Reduce CPU memory used by other processes."
)
kv_cache_size
=
requested_memory_for_kv
msg
=
(
f
"Auto set (
{
format_gib
(
kv_cache_size
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
) GiB for KV cache "
f
"on node
{
cpu_core
.
numa_node
}
, with "
f
"
{
format_gib
(
self
.
requested_cpu_memory
)
}
GiB requested memory"
f
" for the worker.
{
format_gib
(
consumed_memory
)
}
GiB"
f
" memory was consumed by non-kv usages."
)
logger
.
info
(
msg
)
return
kv_cache_size
def
compile_or_warm_up_model
(
self
)
->
CompilationTimes
:
# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
set_random_seed
(
self
.
model_config
.
seed
)
self
.
model_runner
.
warming_up_model
()
# Note: the model has been compiled in determine_available_memory
()
return
CompilationTimes
(
language_model
=
self
.
compilation_config
.
compilation_time
,
encoder
=
self
.
compilation_config
.
encoder_compilation_time
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment