Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d02421a7
Unverified
Commit
d02421a7
authored
Apr 17, 2026
by
Li, Jiang
Committed by
GitHub
Apr 17, 2026
Browse files
[CPU] Refactor CPU affinity and memory management (#39781)
Signed-off-by:
jiang1.li
<
jiang1.li@intel.com
>
parent
b1dc87a0
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
723 additions
and
425 deletions
+723
-425
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
...ite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+19
-19
.github/workflows/macos-smoke-test.yml
.github/workflows/macos-smoke-test.yml
+1
-0
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+15
-14
csrc/cpu/torch_bindings.cpp
csrc/cpu/torch_bindings.cpp
+4
-0
csrc/cpu/utils.cpp
csrc/cpu/utils.cpp
+73
-6
docker/Dockerfile.cpu
docker/Dockerfile.cpu
+2
-1
tests/models/language/generation/test_common.py
tests/models/language/generation/test_common.py
+1
-6
vllm/config/cache.py
vllm/config/cache.py
+0
-3
vllm/platforms/cpu.py
vllm/platforms/cpu.py
+51
-157
vllm/utils/cpu_resource_utils.py
vllm/utils/cpu_resource_utils.py
+173
-0
vllm/utils/ompmultiprocessing.py
vllm/utils/ompmultiprocessing.py
+271
-187
vllm/v1/executor/multiproc_executor.py
vllm/v1/executor/multiproc_executor.py
+6
-15
vllm/v1/worker/cpu_model_runner.py
vllm/v1/worker/cpu_model_runner.py
+1
-15
vllm/v1/worker/cpu_worker.py
vllm/v1/worker/cpu_worker.py
+106
-2
No files found.
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
View file @
d02421a7
...
@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then
...
@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then
exit
1
exit
1
fi
fi
#
echo "--- DP+TP"
echo
"--- DP+TP"
#
vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-dp
=
2
--max-model-len
=
4096 &
#
server_pid=$!
server_pid
=
$!
#
timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
timeout
600 bash
-c
"until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done"
||
exit
1
#
vllm bench serve \
vllm bench serve
\
#
--backend vllm \
--backend
vllm
\
#
--dataset-name random \
--dataset-name
random
\
#
--model meta-llama/Llama-3.2-3B-Instruct \
--model
meta-llama/Llama-3.2-3B-Instruct
\
#
--num-prompts 20 \
--num-prompts
20
\
#
--result-dir ./test_results \
--result-dir
./test_results
\
#
--result-filename dp_pp.json \
--result-filename
dp_pp.json
\
#
--save-result \
--save-result
\
#
--endpoint /v1/completions
--endpoint
/v1/completions
#
kill -s SIGTERM $server_pid; wait $server_pid || true
kill
-s
SIGTERM
$server_pid
;
wait
$server_pid
||
true
#
failed_req=$(jq '.failed' ./test_results/dp_pp.json)
failed_req
=
$(
jq
'.failed'
./test_results/dp_pp.json
)
#
if [ "$failed_req" -ne 0 ]; then
if
[
"
$failed_req
"
-ne
0
]
;
then
#
echo "Some requests were failed!"
echo
"Some requests were failed!"
#
exit 1
exit
1
#
fi
fi
.github/workflows/macos-smoke-test.yml
View file @
d02421a7
...
@@ -45,6 +45,7 @@ jobs:
...
@@ -45,6 +45,7 @@ jobs:
-
name
:
Smoke test vllm serve
-
name
:
Smoke test vllm serve
run
:
|
run
:
|
# Start server in background
# Start server in background
VLLM_CPU_KVCACHE_SPACE=1 \
vllm serve Qwen/Qwen3-0.6B \
vllm serve Qwen/Qwen3-0.6B \
--max-model-len=2K \
--max-model-len=2K \
--load-format=dummy \
--load-format=dummy \
...
...
cmake/cpu_extension.cmake
View file @
d02421a7
...
@@ -30,6 +30,21 @@ else()
...
@@ -30,6 +30,21 @@ else()
list
(
APPEND CXX_COMPILE_FLAGS
list
(
APPEND CXX_COMPILE_FLAGS
"-fopenmp"
"-fopenmp"
"-DVLLM_CPU_EXTENSION"
)
"-DVLLM_CPU_EXTENSION"
)
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim
(
VLLM_TORCH_GOMP_SHIM_DIR
)
find_library
(
OPEN_MP
NAMES gomp
PATHS
${
VLLM_TORCH_GOMP_SHIM_DIR
}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if
(
OPEN_MP
)
set
(
ENV{LD_LIBRARY_PATH}
"
${
VLLM_TORCH_GOMP_SHIM_DIR
}
:$ENV{LD_LIBRARY_PATH}"
)
endif
()
endif
()
endif
()
if
(
NOT MACOSX_FOUND
)
if
(
NOT MACOSX_FOUND
)
...
@@ -175,20 +190,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
...
@@ -175,20 +190,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
if
(
NOT NPROC
)
if
(
NOT NPROC
)
set
(
NPROC 4
)
set
(
NPROC 4
)
endif
()
endif
()
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim
(
VLLM_TORCH_GOMP_SHIM_DIR
)
find_library
(
OPEN_MP
NAMES gomp
PATHS
${
VLLM_TORCH_GOMP_SHIM_DIR
}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if
(
OPEN_MP
)
set
(
ENV{LD_LIBRARY_PATH}
"
${
VLLM_TORCH_GOMP_SHIM_DIR
}
:$ENV{LD_LIBRARY_PATH}"
)
endif
()
# Fetch and populate ACL
# Fetch and populate ACL
if
(
DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY
"$ENV{ACL_ROOT_DIR}"
)
if
(
DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY
"$ENV{ACL_ROOT_DIR}"
)
...
...
csrc/cpu/torch_bindings.cpp
View file @
d02421a7
...
@@ -141,6 +141,8 @@ void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc,
...
@@ -141,6 +141,8 @@ void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc,
torch
::
Tensor
slot_mapping
,
torch
::
Tensor
slot_mapping
,
const
int64_t
block_size
);
const
int64_t
block_size
);
void
init_cpu_memory_env
(
std
::
vector
<
int64_t
>
node_ids
);
namespace
cpu_utils
{
namespace
cpu_utils
{
void
eagle_prepare_inputs_padded_kernel_impl
(
void
eagle_prepare_inputs_padded_kernel_impl
(
const
torch
::
Tensor
&
cu_num_draft_tokens
,
const
torch
::
Tensor
&
cu_num_draft_tokens
,
...
@@ -431,6 +433,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
...
@@ -431,6 +433,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"block_size) -> ()"
,
"block_size) -> ()"
,
&
compute_slot_mapping_kernel_impl
);
&
compute_slot_mapping_kernel_impl
);
ops
.
def
(
"init_cpu_memory_env(SymInt[] node_ids) -> ()"
,
&
init_cpu_memory_env
);
// Speculative decoding kernels
// Speculative decoding kernels
ops
.
def
(
ops
.
def
(
"eagle_prepare_inputs_padded_kernel_impl(Tensor cu_num_draft_tokens, "
"eagle_prepare_inputs_padded_kernel_impl(Tensor cu_num_draft_tokens, "
...
...
csrc/cpu/utils.cpp
View file @
d02421a7
...
@@ -13,13 +13,80 @@
...
@@ -13,13 +13,80 @@
#include "cpu/utils.hpp"
#include "cpu/utils.hpp"
#ifdef VLLM_NUMA_DISABLED
#ifdef VLLM_NUMA_DISABLED
std
::
string
init_cpu_threads_env
(
const
std
::
string
&
cpu_ids
)
{
void
init_cpu_memory_env
(
std
::
vector
<
int64_t
>
node_ids
)
{}
return
std
::
string
(
#else
"Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
void
init_cpu_memory_env
(
std
::
vector
<
int64_t
>
node_ids
)
{
"no effect to setup thread affinity."
);
// Memory node binding
}
if
(
numa_available
()
!=
-
1
)
{
// Concatenate all node_ids into a single comma-separated string
if
(
!
node_ids
.
empty
())
{
std
::
string
node_ids_str
;
for
(
const
int
node_id
:
node_ids
)
{
if
(
!
node_ids_str
.
empty
())
{
node_ids_str
+=
","
;
}
node_ids_str
+=
std
::
to_string
(
node_id
);
}
#endif
bitmask
*
mask
=
numa_parse_nodestring
(
node_ids_str
.
c_str
());
bitmask
*
src_mask
=
numa_get_mems_allowed
();
int
pid
=
getpid
();
if
(
mask
&&
src_mask
)
{
// move all existing pages to the specified numa node.
*
(
src_mask
->
maskp
)
=
*
(
src_mask
->
maskp
)
^
*
(
mask
->
maskp
);
int
page_num
=
numa_migrate_pages
(
pid
,
src_mask
,
mask
);
if
(
page_num
==
-
1
)
{
TORCH_WARN
(
"numa_migrate_pages failed. errno: "
+
std
::
to_string
(
errno
));
}
// Restrict memory allocation to the selected NUMA node(s).
// Enhances memory locality for the threads bound to those NUMA CPUs.
if
(
node_ids
.
size
()
>
1
)
{
errno
=
0
;
numa_set_interleave_mask
(
mask
);
if
(
errno
!=
0
)
{
TORCH_WARN
(
"numa_set_interleave_mask failed. errno: "
+
std
::
to_string
(
errno
));
}
else
{
TORCH_WARN
(
"NUMA binding: Using INTERLEAVE policy for memory "
"allocation across multiple NUMA nodes (nodes: "
+
node_ids_str
+
"). Memory allocations will be "
"interleaved across the specified NUMA nodes."
);
}
}
else
{
errno
=
0
;
numa_set_membind
(
mask
);
if
(
errno
!=
0
)
{
TORCH_WARN
(
"numa_set_membind failed. errno: "
+
std
::
to_string
(
errno
));
}
else
{
TORCH_WARN
(
"NUMA binding: Using MEMBIND policy for memory "
"allocation on the NUMA nodes ("
+
node_ids_str
+
"). Memory allocations will be "
"strictly bound to these NUMA nodes."
);
}
}
numa_set_strict
(
1
);
numa_free_nodemask
(
mask
);
numa_free_nodemask
(
src_mask
);
}
else
{
TORCH_WARN
(
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: "
+
std
::
to_string
(
errno
));
}
}
}
}
#endif // VLLM_NUMA_DISABLED
namespace
cpu_utils
{
namespace
cpu_utils
{
ScratchPadManager
::
ScratchPadManager
()
:
size_
(
0
),
ptr_
(
nullptr
)
{
ScratchPadManager
::
ScratchPadManager
()
:
size_
(
0
),
ptr_
(
nullptr
)
{
...
...
docker/Dockerfile.cpu
View file @
d02421a7
...
@@ -173,7 +173,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
...
@@ -173,7 +173,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY --from=vllm-test-deps /vllm-workspace/requirements/test/cpu.txt requirements/test/cpu.txt
COPY --from=vllm-test-deps /vllm-workspace/requirements/test/cpu.txt requirements/test/cpu.txt
RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt && \
uv pip install -r requirements/lint.txt && \
uv pip install -r requirements/test/cpu.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg
pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"]
ENTRYPOINT ["bash"]
...
...
tests/models/language/generation/test_common.py
View file @
d02421a7
...
@@ -46,7 +46,7 @@ AITER_MODEL_LIST = [
...
@@ -46,7 +46,7 @@ AITER_MODEL_LIST = [
),
),
pytest
.
param
(
pytest
.
param
(
"openai-community/gpt2"
,
# gpt2
"openai-community/gpt2"
,
# gpt2
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
marks
=
[
pytest
.
mark
.
core_model
],
),
),
pytest
.
param
(
"Milos/slovak-gpt-j-405M"
),
# gptj
pytest
.
param
(
"Milos/slovak-gpt-j-405M"
),
# gptj
pytest
.
param
(
"bigcode/tiny_starcoder_py"
),
# gpt_bigcode
pytest
.
param
(
"bigcode/tiny_starcoder_py"
),
# gpt_bigcode
...
@@ -143,11 +143,6 @@ def test_models(
...
@@ -143,11 +143,6 @@ def test_models(
# in parts of the operators
# in parts of the operators
pytest
.
skip
(
f
"Skipping '
{
model
}
' model test with AITER kernel."
)
pytest
.
skip
(
f
"Skipping '
{
model
}
' model test with AITER kernel."
)
if
current_platform
.
is_cpu
()
and
model
in
(
"openai-community/gpt2"
,):
# These models are sensitive to the rounding error
# Fuse ops to reduce rounding
monkeypatch
.
setenv
(
"VLLM_CPU_CI_ENV"
,
"0"
)
with
hf_runner
(
model
)
as
hf_model
:
with
hf_runner
(
model
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
example_prompts
,
max_tokens
,
num_logprobs
...
...
vllm/config/cache.py
View file @
d02421a7
...
@@ -101,8 +101,6 @@ class CacheConfig:
...
@@ -101,8 +101,6 @@ class CacheConfig:
kv_cache_dtype_skip_layers
:
list
[
str
]
=
field
(
default_factory
=
list
)
kv_cache_dtype_skip_layers
:
list
[
str
]
=
field
(
default_factory
=
list
)
"""Layer patterns to skip KV cache quantization. Accepts layer indices
"""Layer patterns to skip KV cache quantization. Accepts layer indices
(e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window')."""
(e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window')."""
cpu_kvcache_space_bytes
:
int
|
None
=
None
"""(CPU backend only) CPU key-value cache space."""
mamba_page_size_padded
:
int
|
None
=
None
mamba_page_size_padded
:
int
|
None
=
None
""" Optional override for mamba page size; used by hybrid mamba/attention
""" Optional override for mamba page size; used by hybrid mamba/attention
models to ensure exact alignment with attention page size."""
models to ensure exact alignment with attention page size."""
...
@@ -183,7 +181,6 @@ class CacheConfig:
...
@@ -183,7 +181,6 @@ class CacheConfig:
"num_gpu_blocks_override"
,
"num_gpu_blocks_override"
,
"enable_prefix_caching"
,
"enable_prefix_caching"
,
"prefix_caching_hash_algo"
,
"prefix_caching_hash_algo"
,
"cpu_kvcache_space_bytes"
,
"mamba_page_size_padded"
,
"mamba_page_size_padded"
,
"user_specified_block_size"
,
"user_specified_block_size"
,
"user_specified_mamba_block_size"
,
"user_specified_mamba_block_size"
,
...
...
vllm/platforms/cpu.py
View file @
d02421a7
...
@@ -6,15 +6,16 @@ import os
...
@@ -6,15 +6,16 @@ import os
import
platform
import
platform
import
subprocess
import
subprocess
import
sys
import
sys
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
from
typing
import
TYPE_CHECKING
import
psutil
import
torch
import
torch
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils.ompmultiprocessing
import
OMPProcessManager
from
vllm.utils.cpu_resource_utils
import
(
DEVICE_CONTROL_ENV_VAR
,
get_memory_node_info
,
)
from
vllm.utils.mem_constants
import
GiB_bytes
from
vllm.utils.torch_utils
import
is_quantized_kv_cache
from
vllm.utils.torch_utils
import
is_quantized_kv_cache
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
from
vllm.v1.attention.backends.registry
import
AttentionBackendEnum
...
@@ -38,49 +39,13 @@ def get_max_threads(pid=0):
...
@@ -38,49 +39,13 @@ def get_max_threads(pid=0):
raise
NotImplementedError
(
"Unsupported OS"
)
raise
NotImplementedError
(
"Unsupported OS"
)
@
dataclass
class
LogicalCPUInfo
:
id
:
int
=
-
1
physical_core
:
int
=
-
1
numa_node
:
int
=
-
1
@
classmethod
def
_int
(
cls
,
value
:
str
)
->
int
:
try
:
int_value
=
int
(
value
)
except
Exception
:
int_value
=
-
1
return
int_value
@
staticmethod
def
json_decoder
(
obj_dict
:
dict
):
id
=
obj_dict
.
get
(
"cpu"
)
physical_core
=
obj_dict
.
get
(
"core"
)
numa_node
=
obj_dict
.
get
(
"node"
)
if
not
(
id
is
None
or
physical_core
is
None
or
numa_node
is
None
):
return
LogicalCPUInfo
(
id
=
LogicalCPUInfo
.
_int
(
id
),
physical_core
=
LogicalCPUInfo
.
_int
(
physical_core
),
numa_node
=
LogicalCPUInfo
.
_int
(
numa_node
),
)
else
:
return
obj_dict
class
CpuPlatform
(
Platform
):
class
CpuPlatform
(
Platform
):
_enum
=
PlatformEnum
.
CPU
_enum
=
PlatformEnum
.
CPU
device_name
:
str
=
"cpu"
device_name
:
str
=
"cpu"
device_type
:
str
=
"cpu"
device_type
:
str
=
"cpu"
dispatch_key
:
str
=
"CPU"
dispatch_key
:
str
=
"CPU"
dist_backend
:
str
=
"gloo"
dist_backend
:
str
=
"gloo"
device_control_env_var
=
"CPU_VISIBLE_MEMORY_NODES"
device_control_env_var
=
DEVICE_CONTROL_ENV_VAR
omp_process_manager
=
None
# Simultaneous Multithreading (SMT) level for OpenMP:
# 4 on PowerPC, 1 on non-PowerPC architectures
smt
=
1
global_cpu_mask
=
None
simulate_numa
=
int
(
os
.
environ
.
get
(
"_SIM_MULTI_NUMA"
,
0
))
@
property
@
property
def
supported_dtypes
(
self
)
->
list
[
torch
.
dtype
]:
def
supported_dtypes
(
self
)
->
list
[
torch
.
dtype
]:
...
@@ -123,29 +88,9 @@ class CpuPlatform(Platform):
...
@@ -123,29 +88,9 @@ class CpuPlatform(Platform):
@
classmethod
@
classmethod
def
get_device_total_memory
(
cls
,
device_id
:
int
=
0
)
->
int
:
def
get_device_total_memory
(
cls
,
device_id
:
int
=
0
)
->
int
:
from
vllm.utils.mem_constants
import
GiB_bytes
meminfo
=
get_memory_node_info
(
device_id
)
from
vllm.utils.mem_utils
import
format_gib
kv_cache_space
=
envs
.
VLLM_CPU_KVCACHE_SPACE
node_dir
=
"/sys/devices/system/node"
if
kv_cache_space
is
None
:
nodes
=
(
[
d
for
d
in
os
.
listdir
(
node_dir
)
if
d
.
startswith
(
"node"
)]
if
os
.
path
.
exists
(
node_dir
)
else
[]
)
num_numa_nodes
=
len
(
nodes
)
or
1
free_cpu_memory
=
psutil
.
virtual_memory
().
total
//
num_numa_nodes
DEFAULT_CPU_MEM_UTILIZATION
=
0.5
kv_cache_space
=
int
(
free_cpu_memory
*
DEFAULT_CPU_MEM_UTILIZATION
)
logger
.
warning_once
(
"VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache."
,
format_gib
(
kv_cache_space
),
)
else
:
kv_cache_space
*=
GiB_bytes
return
kv_cache_space
return
meminfo
.
total_memory
@
classmethod
@
classmethod
def
set_device
(
cls
,
device
:
torch
.
device
)
->
None
:
def
set_device
(
cls
,
device
:
torch
.
device
)
->
None
:
...
@@ -180,6 +125,12 @@ class CpuPlatform(Platform):
...
@@ -180,6 +125,12 @@ class CpuPlatform(Platform):
"otherwise the performance is not optimized."
"otherwise the performance is not optimized."
)
)
# Lagecy setting
env_key
=
"VLLM_CPU_KVCACHE_SPACE"
if
env_key
in
os
.
environ
and
os
.
environ
[
env_key
]
!=
""
:
kv_cache_space
=
int
(
os
.
environ
[
env_key
])
cache_config
.
kv_cache_memory_bytes
=
kv_cache_space
*
GiB_bytes
scheduler_config
=
vllm_config
.
scheduler_config
scheduler_config
=
vllm_config
.
scheduler_config
# async scheduling is not required on CPU
# async scheduling is not required on CPU
scheduler_config
.
async_scheduling
=
False
scheduler_config
.
async_scheduling
=
False
...
@@ -198,8 +149,6 @@ class CpuPlatform(Platform):
...
@@ -198,8 +149,6 @@ class CpuPlatform(Platform):
)
)
cache_config
.
cache_dtype
=
"auto"
cache_config
.
cache_dtype
=
"auto"
cache_config
.
cpu_kvcache_space_bytes
=
CpuPlatform
.
get_device_total_memory
()
parallel_config
=
vllm_config
.
parallel_config
parallel_config
=
vllm_config
.
parallel_config
# OMP requires the MP executor to function correctly, UniProc is not
# OMP requires the MP executor to function correctly, UniProc is not
# supported as it is not possible to set the OMP environment correctly
# supported as it is not possible to set the OMP environment correctly
...
@@ -278,21 +227,45 @@ class CpuPlatform(Platform):
...
@@ -278,21 +227,45 @@ class CpuPlatform(Platform):
os
.
environ
[
"TORCHINDUCTOR_CPP_DYNAMIC_THREADS"
]
=
"1"
os
.
environ
[
"TORCHINDUCTOR_CPP_DYNAMIC_THREADS"
]
=
"1"
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
# Intel and CLANG OpenMP setting
if
"libiomp5.so"
in
ld_preload_str
or
"libomp5"
in
ld_preload_str
:
# The time(milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
os
.
environ
[
"KMP_BLOCKTIME"
]
=
"1"
# Prevents the CPU to run into low performance state
os
.
environ
[
"KMP_TPAUSE"
]
=
"0"
# Provides fine granularity parallelism
os
.
environ
[
"KMP_FORKJOIN_BARRIER_PATTERN"
]
=
"dist,dist"
os
.
environ
[
"KMP_PLAIN_BARRIER_PATTERN"
]
=
"dist,dist"
os
.
environ
[
"KMP_REDUCTION_BARRIER_PATTERN"
]
=
"dist,dist"
cpu_architecture
=
Platform
.
get_cpu_architecture
()
cpu_architecture
=
Platform
.
get_cpu_architecture
()
if
(
platform
.
system
()
==
"Linux"
and
cpu_architecture
in
(
CpuArchEnum
.
ARM
,
CpuArchEnum
.
POWERPC
,
CpuArchEnum
.
X86
)
and
not
(
"libomp"
in
ld_preload_str
or
"libgomp"
in
ld_preload_str
or
"libiomp"
in
ld_preload_str
)
):
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
# one core will be properly utilized when we thread-bind
# See: https://github.com/vllm-project/vllm/issues/27369
# TODO: Remove once:
# https://github.com/pytorch/pytorch/issues/166087 is fixed
# We need to find the location of PyTorch's libgomp
torch_pkg
=
os
.
path
.
dirname
(
torch
.
__file__
)
site_root
=
os
.
path
.
dirname
(
torch_pkg
)
# Search both torch.libs and torch/lib - See:
# https://github.com/vllm-project/vllm/issues/30470
torch_libs_paths
=
[
os
.
path
.
join
(
site_root
,
"torch.libs"
),
os
.
path
.
join
(
torch_pkg
,
"lib"
),
]
pytorch_libgomp_so_candidates
=
[]
for
torch_libs
in
torch_libs_paths
:
pytorch_libgomp_so_candidates
.
extend
(
glob
.
glob
(
os
.
path
.
join
(
torch_libs
,
"libgomp*.so*"
))
)
if
pytorch_libgomp_so_candidates
:
pytorch_libgomp_so
=
pytorch_libgomp_so_candidates
[
0
]
if
ld_preload_str
:
ld_preload_str
+=
":"
ld_preload_str
+=
pytorch_libgomp_so
os
.
environ
[
"LD_PRELOAD"
]
=
ld_preload_str
# LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce
# LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce
# memory allocation overhead
# memory allocation overhead
if
(
if
(
...
@@ -331,13 +304,6 @@ class CpuPlatform(Platform):
...
@@ -331,13 +304,6 @@ class CpuPlatform(Platform):
vllm_config
.
model_config
.
max_model_len
,
vllm_config
.
model_config
.
max_model_len
,
vllm_config
.
scheduler_config
.
DEFAULT_MAX_NUM_BATCHED_TOKENS
,
vllm_config
.
scheduler_config
.
DEFAULT_MAX_NUM_BATCHED_TOKENS
,
)
)
# CI specific "quick" NUMA simulation - split all available CPUs
# into a fake NUMA topology
if
os
.
environ
.
get
(
"VLLM_CPU_SIM_MULTI_NUMA"
,
None
)
is
not
None
:
os
.
environ
[
"_SIM_MULTI_NUMA"
]
=
str
(
vllm_config
.
parallel_config
.
world_size
*
vllm_config
.
parallel_config
.
_api_process_count
)
@
classmethod
@
classmethod
def
update_block_size_for_backend
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
def
update_block_size_for_backend
(
cls
,
vllm_config
:
"VllmConfig"
)
->
None
:
...
@@ -345,78 +311,6 @@ class CpuPlatform(Platform):
...
@@ -345,78 +311,6 @@ class CpuPlatform(Platform):
# Move that logic here so block_size is chosen by the backend.
# Move that logic here so block_size is chosen by the backend.
pass
pass
@
classmethod
def
get_omp_manager
(
cls
)
->
OMPProcessManager
:
# initialise the OMP resource management if need be and return the manager
if
cls
.
omp_process_manager
is
None
:
if
cls
.
get_cpu_architecture
()
==
CpuArchEnum
.
POWERPC
:
cls
.
smt
=
4
cls
.
omp_process_manager
=
OMPProcessManager
(
affinity
=
cls
.
get_global_cpu_mask
(),
smt
=
cls
.
smt
)
# we need to fix up the topology returned by the OMP Manager for
# simulated NUMA environments in CI
if
cls
.
simulate_numa
>
0
:
logger
.
info
(
"Adjusting numa topology to resemble at least %d nodes"
,
int
(
cls
.
simulate_numa
),
)
om
=
cls
.
omp_process_manager
while
len
(
om
.
omp_places
)
<
cls
.
simulate_numa
:
new_omp_places
=
[]
touched
=
False
for
omp_place
in
om
.
omp_places
:
if
len
(
omp_place
[
"mask"
])
>
1
:
touched
=
True
cpu_list
=
sorted
(
list
(
omp_place
[
"mask"
]))
new_omp_places
.
append
(
{
"mask"
:
set
(
cpu_list
[
0
:
int
(
len
(
cpu_list
)
/
2
)]),
"available"
:
True
,
}
)
new_omp_places
.
append
(
{
"mask"
:
set
(
cpu_list
[
int
(
len
(
cpu_list
)
/
2
)
:]),
"available"
:
True
,
}
)
if
touched
:
om
.
omp_places
=
new_omp_places
else
:
raise
ValueError
(
"Cannot split the existing NUMA topology to match "
"simulation requirements"
)
return
cls
.
omp_process_manager
@
classmethod
def
get_global_cpu_mask
(
cls
)
->
set
[
int
]:
# get global cpu mask
if
cls
.
global_cpu_mask
is
None
:
if
hasattr
(
os
,
"sched_getaffinity"
):
cls
.
global_cpu_mask
=
os
.
sched_getaffinity
(
0
)
else
:
# macOS does not support sched_getaffinity
cpu_count
=
os
.
cpu_count
()
or
1
cls
.
global_cpu_mask
=
set
(
range
(
cpu_count
))
return
cls
.
global_cpu_mask
@
classmethod
def
reserve_cpus
(
cls
,
reserve
:
set
[
int
])
->
bool
:
# remove CPUs from global mask, for now there is no "release" mechanism
if
cls
.
omp_process_manager
is
not
None
:
for
place
in
cls
.
omp_process_manager
.
omp_places
:
if
not
place
[
"available"
]:
return
False
cls
.
global_cpu_mask
=
cls
.
get_global_cpu_mask
()
-
reserve
# reinitialize OMP resource management
cls
.
omp_process_manager
=
OMPProcessManager
(
affinity
=
cls
.
global_cpu_mask
,
smt
=
cls
.
smt
)
return
True
@
classmethod
@
classmethod
def
discover_numa_topology
(
cls
)
->
list
[
list
[
int
]]:
def
discover_numa_topology
(
cls
)
->
list
[
list
[
int
]]:
"""
"""
...
...
vllm/utils/cpu_resource_utils.py
0 → 100644
View file @
d02421a7
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
os
import
platform
import
subprocess
from
dataclasses
import
dataclass
from
functools
import
cache
import
psutil
import
regex
as
re
DEVICE_CONTROL_ENV_VAR
=
"CPU_VISIBLE_MEMORY_NODES"
@
dataclass
class
LogicalCPUInfo
:
id
:
int
=
-
1
physical_core
:
int
=
-
1
numa_node
:
int
=
-
1
@
classmethod
def
_int
(
cls
,
value
:
str
)
->
int
:
try
:
int_value
=
int
(
value
)
except
Exception
:
int_value
=
-
1
return
int_value
@
staticmethod
def
json_decoder
(
obj_dict
:
dict
):
id
=
obj_dict
.
get
(
"cpu"
)
physical_core
=
obj_dict
.
get
(
"core"
)
numa_node
=
obj_dict
.
get
(
"node"
)
if
not
(
id
is
None
or
physical_core
is
None
or
numa_node
is
None
):
return
LogicalCPUInfo
(
id
=
LogicalCPUInfo
.
_int
(
id
),
physical_core
=
LogicalCPUInfo
.
_int
(
physical_core
),
numa_node
=
LogicalCPUInfo
.
_int
(
numa_node
),
)
else
:
return
obj_dict
@
dataclass
class
MemoryNodeInfo
:
total_memory
:
int
=
-
1
available_memory
:
int
=
-
1
def
get_memory_affinity
(
pid
:
int
=
0
)
->
list
[
int
]:
pid
=
os
.
getpid
()
if
pid
==
0
else
pid
path
=
f
"/proc/
{
pid
}
/status"
with
open
(
path
)
as
f
:
for
line
in
f
:
if
line
.
startswith
(
"Mems_allowed_list:"
):
# Extract the string part (e.g., "0-1,3")
raw_list
=
line
.
split
(
":"
)[
1
].
strip
()
return
parse_id_list
(
raw_list
)
return
[]
def
parse_id_list
(
raw_str
:
str
)
->
list
[
int
]:
"""Parses strings like '0-2,4,7-8' into [0, 1, 2, 4, 7, 8]"""
result
:
list
[
int
]
=
[]
if
not
raw_str
:
return
result
for
part
in
raw_str
.
split
(
","
):
if
"-"
in
part
:
start
,
end
=
map
(
int
,
part
.
split
(
"-"
))
result
.
extend
(
range
(
start
,
end
+
1
))
else
:
result
.
append
(
int
(
part
))
return
sorted
(
list
(
set
(
result
)))
def
get_memory_node_info
(
node_id
:
int
=
0
)
->
MemoryNodeInfo
:
if
platform
.
system
()
==
"Darwin"
:
# MacOS has no memory node
return
MemoryNodeInfo
(
total_memory
=
psutil
.
virtual_memory
().
total
,
available_memory
=
psutil
.
virtual_memory
().
available
,
)
meminfo_path
=
f
"/sys/devices/system/node/node
{
node_id
}
/meminfo"
if
not
os
.
path
.
exists
(
meminfo_path
):
raise
RuntimeError
(
f
"
{
meminfo_path
}
doesn't exit."
)
meminfo
=
{}
with
open
(
meminfo_path
)
as
f
:
for
line
in
f
:
# Each line looks like: "Node 0 MemTotal: 97421888 kB"
parts
=
line
.
split
()
key
=
parts
[
2
].
rstrip
(
":"
)
# convert to Bytes
value
=
int
(
parts
[
3
])
*
1024
meminfo
[
key
]
=
value
total_memory
=
meminfo
[
"MemTotal"
]
free_memory
=
meminfo
[
"MemFree"
]
active_file_memory
=
meminfo
[
"Active(file)"
]
inactive_file_memory
=
meminfo
[
"Inactive(file)"
]
reclaimable_memory
=
meminfo
[
"SReclaimable"
]
available_memory
=
(
free_memory
+
active_file_memory
+
inactive_file_memory
+
reclaimable_memory
)
return
MemoryNodeInfo
(
total_memory
=
total_memory
,
available_memory
=
available_memory
,
)
def
get_allowed_cpu_list
()
->
list
[
LogicalCPUInfo
]:
cpu_list
=
_get_cpu_list
()
if
platform
.
system
()
==
"Darwin"
:
return
cpu_list
global_allowed_cpu_id_list
=
os
.
sched_getaffinity
(
0
)
logical_cpu_list
=
[
x
for
x
in
cpu_list
if
x
.
id
in
global_allowed_cpu_id_list
]
return
logical_cpu_list
def
get_visible_memory_node
()
->
list
[
int
]:
if
platform
.
system
()
==
"Darwin"
:
return
[
0
]
allowed_memory_node_list
=
get_memory_affinity
()
env_key
=
DEVICE_CONTROL_ENV_VAR
if
(
(
"VLLM_CPU_SIM_MULTI_NUMA"
not
in
os
.
environ
)
and
env_key
in
os
.
environ
and
os
.
environ
[
env_key
]
!=
""
):
visible_nodes
=
[
int
(
s
)
for
s
in
os
.
environ
[
env_key
].
split
(
","
)]
visible_nodes
=
[
node
for
node
in
visible_nodes
if
node
in
allowed_memory_node_list
]
return
visible_nodes
return
allowed_memory_node_list
@
cache
def
_get_cpu_list
()
->
list
[
LogicalCPUInfo
]:
if
platform
.
system
()
==
"Darwin"
:
# For MacOS, no user-level CPU affinity and SMT, return all CPUs
cpu_count
=
os
.
cpu_count
()
assert
cpu_count
return
[
LogicalCPUInfo
(
i
,
i
,
0
)
for
i
in
range
(
cpu_count
)]
lscpu_output
=
subprocess
.
check_output
(
"lscpu -J -e=CPU,CORE,NODE"
,
shell
=
True
,
text
=
True
)
# For platform without NUMA, replace '-' to '0'
lscpu_output
=
re
.
sub
(
r
'"node":\s*-\s*(,|\n)'
,
r
'"node": 0\1'
,
lscpu_output
)
logical_cpu_list
:
list
[
LogicalCPUInfo
]
=
json
.
loads
(
lscpu_output
,
object_hook
=
LogicalCPUInfo
.
json_decoder
)[
"cpus"
]
# Filter CPUs with invalid attributes
logical_cpu_list
=
[
x
for
x
in
logical_cpu_list
if
-
1
not
in
(
x
.
id
,
x
.
physical_core
,
x
.
numa_node
)
]
return
logical_cpu_list
vllm/utils/ompmultiprocessing.py
View file @
d02421a7
...
@@ -5,196 +5,280 @@ Copyright (c) 2026 Red Hat Inc
...
@@ -5,196 +5,280 @@ Copyright (c) 2026 Red Hat Inc
Copyright (c) 2026 Cambridge Greys Ltd
Copyright (c) 2026 Cambridge Greys Ltd
"""
"""
import
json
import
os
import
os
import
platform
from
collections.abc
import
Callable
import
subprocess
from
contextlib
import
contextmanager
from
typing
import
TYPE_CHECKING
def
_int
(
arg
):
import
vllm.utils.cpu_resource_utils
as
cr_utils
"""Relaxed parsing of ints which handles a - instead of a number.
from
vllm
import
envs
The lscpu json may contain that for nodes in some cases. If that
from
vllm.logger
import
init_logger
is the case we parse it to zero
from
vllm.platforms
import
CpuArchEnum
,
current_platform
"""
from
vllm.utils.cpu_resource_utils
import
LogicalCPUInfo
try
:
if
int
(
arg
)
>=
0
:
if
TYPE_CHECKING
:
return
int
(
arg
)
from
vllm.config
import
VllmConfig
except
ValueError
:
pass
logger
=
init_logger
(
__name__
)
return
0
def
parse_mask
(
mask
):
"""Expand a X-Y,Z list"""
result
=
[]
for
token
in
mask
.
split
(
","
):
try
:
start
,
finish
=
token
.
split
(
"-"
)
if
int
(
start
)
>
int
(
finish
):
raise
IndexError
(
"Invalid Indexes for cpu ranges"
)
for
cpu
in
range
(
int
(
start
),
int
(
finish
)
+
1
):
result
.
append
(
cpu
)
except
ValueError
:
result
.
append
(
int
(
token
))
return
set
(
result
)
def
_get_default_affinity
()
->
set
[
int
]:
"""Get the set of CPUs the process is allowed to run on."""
if
hasattr
(
os
,
"sched_getaffinity"
):
return
os
.
sched_getaffinity
(
0
)
# macOS does not support sched_getaffinity; fall back to cpu_count
cpu_count
=
os
.
cpu_count
()
or
1
return
set
(
range
(
cpu_count
))
def
_get_cpu_topology_json
()
->
bytes
:
"""Get CPU topology as JSON.
On Linux this uses ``lscpu -Je``. On other platforms (e.g. macOS) we
synthesize a simple topology where every logical CPU is its own core
on NUMA node 0, which is sufficient for the OMP place-list builder.
"""
if
platform
.
system
()
==
"Linux"
:
return
subprocess
.
run
([
"lscpu"
,
"-Je"
],
check
=
True
,
capture_output
=
True
).
stdout
# Fallback for non-Linux (macOS, etc.)
cpu_count
=
os
.
cpu_count
()
or
1
cpus
=
[]
for
i
in
range
(
cpu_count
):
cpus
.
append
({
"cpu"
:
str
(
i
),
"core"
:
str
(
i
),
"node"
:
"0"
})
return
json
.
dumps
({
"cpus"
:
cpus
}).
encode
()
def
enumerate_resources
(
resource_map
,
mask
=
None
,
allowed
=
None
):
"""Enumerate system resources"""
if
allowed
is
None
:
allowed
=
_get_default_affinity
()
if
mask
is
not
None
:
allowed
=
allowed
&
mask
try
:
allowed_nodes
=
parse_mask
(
os
.
environ
[
"CPU_VISIBLE_MEMORY_NODES"
])
except
KeyError
:
allowed_nodes
=
None
lscpu
:
dict
[
str
,
dict
]
=
{
"cpus"
:
{},
"cores"
:
{},
"nodes"
:
{}}
for
cpu
in
resource_map
[
"cpus"
]:
cpunum
=
int
(
cpu
[
"cpu"
])
if
(
cpunum
in
allowed
and
cpunum
>=
0
and
(
allowed_nodes
is
None
or
_int
(
cpu
[
"node"
])
in
allowed_nodes
)
):
lscpu
[
"cpus"
][
cpunum
]
=
[
cpu
]
core
=
_int
(
cpu
[
"core"
])
if
lscpu
[
"cores"
].
get
(
core
,
None
)
is
None
:
lscpu
[
"cores"
][
core
]
=
[
cpu
]
else
:
lscpu
[
"cores"
][
core
].
append
(
cpu
)
node
=
_int
(
cpu
[
"node"
])
if
lscpu
[
"nodes"
].
get
(
node
,
None
)
is
None
:
lscpu
[
"nodes"
][
node
]
=
[
cpu
]
else
:
lscpu
[
"nodes"
][
node
].
append
(
cpu
)
return
lscpu
def
produce_cpu_list
(
cpus
,
smt
=
1
):
"""Produce a CPU list with/without SMT pairs - main cpu list case"""
mask
:
list
[
int
]
=
[]
for
key
,
value
in
cpus
.
items
():
exists
=
0
for
cpu
in
mask
:
if
cpu
==
value
[
0
][
"core"
]:
exists
+=
1
break
if
exists
<
smt
:
mask
.
append
(
int
(
key
))
return
{
"mask"
:
set
(
mask
),
"available"
:
True
}
def
produce_cpu_sublist
(
scpus
,
smt
=
1
):
"""Produce a CPU list with/without SMT pairs - resource leaf case"""
cpu_list
:
list
[
dict
]
=
[]
for
value
in
scpus
:
exists
=
0
for
cpu
in
cpu_list
:
if
int
(
cpu
[
"core"
])
==
int
(
value
[
"core"
]):
exists
+=
1
break
if
exists
<
smt
:
cpu_list
.
append
(
value
)
mask
=
[]
for
cpu
in
cpu_list
:
mask
.
append
(
int
(
cpu
[
"cpu"
]))
return
{
"mask"
:
set
(
mask
),
"available"
:
True
}
def
create_omp_places
(
resources
,
strategy
,
smt
=
True
):
"""Parse CPU topology and generate possible CPU masks"""
omp_places
=
[]
if
strategy
==
"all"
:
omp_places
.
append
(
produce_cpu_list
(
resources
[
"cpus"
],
smt
))
elif
strategy
==
"cores"
:
for
value
in
resources
[
"cores"
].
values
():
omp_places
.
append
(
produce_cpu_sublist
(
value
,
smt
))
elif
strategy
==
"nodes"
:
for
value
in
resources
[
"nodes"
].
values
():
omp_places
.
append
(
produce_cpu_sublist
(
value
,
smt
))
else
:
raise
NotImplementedError
(
"Unknown strategy"
)
return
omp_places
# pylint: disable=too-few-public-methods
class
OMPProcessManager
:
class
OMPProcessManager
:
"""OMP aware wrapper to run mp Process()"""
def
__init__
(
self
,
config
:
"VllmConfig"
):
if
not
current_platform
.
is_cpu
():
def
__init__
(
self
,
strategy
=
"nodes"
,
smt
=
1
,
mock
=
None
,
affinity
=
None
):
return
self
.
strategy
=
strategy
self
.
smt
=
smt
self
.
local_world_size
=
config
.
parallel_config
.
local_world_size
self
.
omp_places
=
[]
self
.
local_dp_rank
=
config
.
parallel_config
.
data_parallel_rank_local
vllm_mask
=
os
.
environ
.
get
(
"VLLM_CPU_OMP_THREADS_BIND"
,
None
)
# This is a bit tricky because the internal DP size
self
.
setup_omp
=
vllm_mask
!=
"nobind"
# is always 1 for non-MoE models
if
self
.
setup_omp
:
self
.
internal_dp_size
=
config
.
parallel_config
.
_api_process_count
omp_places
=
[]
if
vllm_mask
is
not
None
:
self
.
simulate_multi_node
=
os
.
environ
.
get
(
"VLLM_CPU_SIM_MULTI_NUMA"
,
"0"
)
!=
"0"
masks
=
[]
ld_preload_str
=
os
.
getenv
(
"LD_PRELOAD"
,
""
)
for
spec
in
vllm_mask
.
split
(
"|"
):
self
.
use_iomp
=
"libiomp"
in
ld_preload_str
or
"libomp"
in
ld_preload_str
masks
.
append
(
parse_mask
(
spec
))
self
.
use_gomp
=
"libgomp"
in
ld_preload_str
assert
not
(
self
.
use_iomp
and
self
.
use_gomp
)
# at least reserve 1/local_world_size(for ARM) core for scheduler
# proc as always use MP executor
# TODO: make scheduler proc sleep when idle
self
.
reserve_cpu_num
=
(
self
.
local_world_size
if
current_platform
.
get_cpu_architecture
()
==
CpuArchEnum
.
ARM
else
1
)
# reserve at one more core for nixl_connector under p/d case
if
config
.
kv_transfer_config
:
self
.
reserve_cpu_num
+=
1
if
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
is
not
None
:
if
self
.
reserve_cpu_num
>
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
:
msg
=
(
f
"VLLM_CPU_NUM_OF_RESERVED_CPU is less than "
"the minimum requirement"
f
":
{
self
.
reserve_cpu_num
}
cores"
)
logger
.
warning
(
msg
=
msg
)
self
.
reserve_cpu_num
=
envs
.
VLLM_CPU_NUM_OF_RESERVED_CPU
self
.
_parse_omp_threads_bind_env
()
assert
not
self
.
simulate_multi_node
or
self
.
auto_setup
@
contextmanager
def
configure_omp_envs
(
self
,
rank
:
int
,
local_rank
:
int
):
if
not
current_platform
.
is_cpu
()
or
self
.
skip_setup
:
yield
return
envs_dict
=
{}
cpu_list
=
[
str
(
i
)
for
i
in
self
.
cpu_lists
[
local_rank
]]
envs_dict
[
"OMP_NUM_THREADS"
]
=
str
(
len
(
cpu_list
))
if
self
.
use_iomp
:
# set IOMP envs
cpu_list_str
=
","
.
join
(
cpu_list
)
envs_dict
[
"KMP_AFFINITY"
]
=
(
f
"granularity=fine,explicit,proclist=[
{
cpu_list_str
}
]"
)
# The time(milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
envs_dict
[
"KMP_BLOCKTIME"
]
=
"1"
# Prevents the CPU to run into low performance state
envs_dict
[
"KMP_TPAUSE"
]
=
"0"
# Provides fine granularity parallelism
envs_dict
[
"KMP_FORKJOIN_BARRIER_PATTERN"
]
=
"dist,dist"
envs_dict
[
"KMP_PLAIN_BARRIER_PATTERN"
]
=
"dist,dist"
envs_dict
[
"KMP_REDUCTION_BARRIER_PATTERN"
]
=
"dist,dist"
elif
self
.
use_gomp
:
# set GOMP envs
# likes '0 1 2 ...'
cpu_list_str
=
" "
.
join
(
cpu_list
)
envs_dict
[
"GOMP_CPU_AFFINITY"
]
=
cpu_list_str
else
:
# set OMP envs
# likes '{0,1,2,...}'
cpu_list_str
=
","
.
join
(
cpu_list
)
envs_dict
[
"OMP_PLACES"
]
=
f
"{{
{
cpu_list_str
}
}}"
envs_dict
[
"OMP_PROC_BIND"
]
=
"true"
# backup envs
old_envs_dict
=
{}
for
k
in
envs_dict
:
old_envs_dict
[
k
]
=
os
.
environ
.
get
(
k
)
try
:
# set envs
for
k
,
v
in
envs_dict
.
items
():
os
.
environ
[
k
]
=
v
yield
finally
:
# restore old envs
for
k
,
v
in
old_envs_dict
.
items
():
# type: ignore
if
v
is
None
:
os
.
environ
.
pop
(
k
,
None
)
else
:
os
.
environ
[
k
]
=
v
def
_parse_omp_threads_bind_env
(
self
):
vllm_mask
=
envs
.
VLLM_CPU_OMP_THREADS_BIND
self
.
skip_setup
=
vllm_mask
==
"nobind"
self
.
auto_setup
=
vllm_mask
==
"auto"
self
.
reserved_cpu_list
=
[]
self
.
cpu_lists
=
[]
if
self
.
auto_setup
:
# auto generate CPU lists
cpu_arch
=
current_platform
.
get_cpu_architecture
()
if
cpu_arch
==
CpuArchEnum
.
POWERPC
:
# For POWERPC SMT-8/4/2
cpu_list
,
reserve_list
=
self
.
_get_autobind_cpu_ids
(
lambda
cpus
:
[
cpu
for
cpu
in
cpus
if
cpu
.
id
%
8
<
4
]
)
elif
cpu_arch
in
(
CpuArchEnum
.
X86
,
CpuArchEnum
.
S390X
):
# For x86/S390X SMT-2, use 1 logical CPU per physical core
cpu_list
,
reserve_list
=
self
.
_get_autobind_cpu_ids
(
lambda
cpus
:
cpus
[
-
1
:]
)
elif
cpu_arch
==
CpuArchEnum
.
ARM
:
# For AArch64, no SMT, use all logical CPU
cpu_list
,
reserve_list
=
self
.
_get_autobind_cpu_ids
(
lambda
cpus
:
cpus
)
else
:
else
:
masks
=
[
None
]
cpu_list
,
reserve_list
=
[],
[]
if
mock
is
None
:
raise
RuntimeError
(
f
"
{
cpu_arch
}
doesn't support auto CPU binding."
)
data
=
_get_cpu_topology_json
()
for
item
in
cpu_list
:
self
.
cpu_lists
.
append
([
x
.
id
for
x
in
item
])
self
.
reserved_cpu_list
=
[
x
.
id
for
x
in
reserve_list
]
elif
not
self
.
skip_setup
:
# user defined CPU lists
omp_cpuids_list
=
vllm_mask
.
split
(
"|"
)
if
self
.
local_dp_rank
is
not
None
:
local_dp_rank
=
self
.
local_dp_rank
world_size
=
self
.
local_world_size
# Rank mapping [DP, PP, TP]
omp_cpuids_list
=
omp_cpuids_list
[
local_dp_rank
*
world_size
:
(
local_dp_rank
+
1
)
*
world_size
]
assert
len
(
omp_cpuids_list
)
==
self
.
local_world_size
,
(
"Given "
f
"number of CPU id list
{
omp_cpuids_list
}
doesn't match "
f
"local world size
{
self
.
local_world_size
}
."
)
# parse CPU list strings like "5,2-4" to [5, 2, 3, 4]
self
.
cpu_lists
=
[
cr_utils
.
parse_id_list
(
s
)
for
s
in
omp_cpuids_list
]
else
:
# skip
self
.
cpu_lists
=
[]
msg
=
"OpenMP thread binding info:
\n
"
for
i
in
range
(
self
.
local_world_size
):
msg
+=
f
"
\t
local_rank=
{
i
}
, core ids=
{
self
.
cpu_lists
[
i
]
}
\n
"
msg
+=
f
"
\t
reserved_cpus=
{
self
.
reserved_cpu_list
}
"
logger
.
info
(
msg
)
def
_get_autobind_cpu_ids
(
self
,
cpu_selector
:
Callable
[[
list
[
LogicalCPUInfo
]],
list
[
LogicalCPUInfo
]]
)
->
tuple
[
list
[
list
[
LogicalCPUInfo
]],
list
[
LogicalCPUInfo
]]:
"""
Return CPU ids to bind based on NUMA nodes, and CPU ids reserved for
other processes.
Currently for rank N, only CPU ids on the N-th node in available NUMA
node list will be selected.
Args:
cpu_selector: a callable object to select CPUs from a CPU list
of a physical core. The input is a LogicalCPUInfo list contains
logical CPUs of a physical CPU, sorted by the LogicalCPUInfo.id.
A selected LogicalCPUInfo list should be returned.
"""
# this memory node list has been sliced for DP offset
allowed_numa_nodes
=
cr_utils
.
get_visible_memory_node
()
logical_cpu_list
=
cr_utils
.
get_allowed_cpu_list
()
local_world_size
=
self
.
local_world_size
assert
(
len
(
allowed_numa_nodes
)
>=
local_world_size
or
self
.
simulate_multi_node
),
(
f
"Not enough allowed NUMA nodes to bind threads of "
f
"
{
local_world_size
}
local CPUWorkers. "
f
"Allowed NUMA nodes are
{
allowed_numa_nodes
}
. "
"Please try to bind threads manually or decrease DP/TP/PP."
)
# Generate OMP CPU list for each rank
cpu_lists_of_ranks
=
[]
reserved_cpu_list
=
[]
total_cpu_num
=
0
for
local_rank
in
range
(
self
.
local_world_size
):
if
not
self
.
simulate_multi_node
:
selected_numa_node
=
allowed_numa_nodes
[
local_rank
]
selected_logical_cpu_list
=
[
x
for
x
in
logical_cpu_list
if
x
.
numa_node
==
selected_numa_node
]
else
:
else
:
with
open
(
mock
,
mode
=
"rb"
)
as
jf
:
world_size_across_dp
=
self
.
local_world_size
*
self
.
internal_dp_size
data
=
jf
.
read
()
assert
len
(
logical_cpu_list
)
>=
world_size_across_dp
lscpu
=
json
.
loads
(
data
)
selected_logical_cpu_list
=
sorted
(
for
mask
in
masks
:
logical_cpu_list
,
key
=
lambda
x
:
x
.
numa_node
resources
=
enumerate_resources
(
lscpu
,
mask
,
affinity
)
)
omp_places
.
extend
(
create_omp_places
(
resources
,
strategy
,
smt
))
sim_cpu_num_per_node
=
(
self
.
omp_places
=
sorted
(
len
(
selected_logical_cpu_list
)
//
world_size_across_dp
omp_places
,
)
key
=
lambda
p
:
"{:04d}-{:04d}"
.
format
(
len
(
p
[
"mask"
]),
max
(
p
[
"mask"
])),
assert
self
.
local_dp_rank
is
not
None
reverse
=
True
,
start_idx
=
(
local_rank
+
self
.
local_world_size
*
self
.
local_dp_rank
)
*
sim_cpu_num_per_node
selected_logical_cpu_list
=
selected_logical_cpu_list
[
start_idx
:
(
start_idx
+
sim_cpu_num_per_node
)
]
# Select logical CPUs on same physical cores via cpu_selector
core_to_cpus
:
dict
[
int
,
list
[
LogicalCPUInfo
]]
=
{}
for
cpu_info
in
selected_logical_cpu_list
:
if
cpu_info
.
physical_core
not
in
core_to_cpus
:
core_to_cpus
[
cpu_info
.
physical_core
]
=
[]
core_to_cpus
[
cpu_info
.
physical_core
].
append
(
cpu_info
)
selected_logical_cpu_list
=
[]
for
cpu_list
in
core_to_cpus
.
values
():
cpu_list
=
sorted
(
cpu_list
,
key
=
lambda
x
:
x
.
id
)
selected_logical_cpu_list
.
extend
(
cpu_selector
(
cpu_list
))
# sort selected cores based on core id
selected_logical_cpu_list
=
sorted
(
selected_logical_cpu_list
,
key
=
lambda
x
:
x
.
id
)
)
def
run
(
self
,
what
,
*
args
,
**
kwargs
):
cpu_lists_of_ranks
.
append
(
selected_logical_cpu_list
)
"""Run arg with correct OMP environment"""
total_cpu_num
+=
len
(
selected_logical_cpu_list
)
if
self
.
setup_omp
:
for
place
in
self
.
omp_places
:
# Reserve CPUs for other processes
if
place
[
"available"
]:
if
total_cpu_num
<=
self
.
reserve_cpu_num
:
reserve
=
int
(
os
.
environ
.
get
(
"VLLM_CPU_NUM_OF_RESERVED_CPU"
,
0
))
logger
.
warning
(
place
[
"available"
]
=
False
"Selected CPU core number (%s) "
# pylint: disable=consider-using-f-string
"should be greater than reserved CPU core "
os
.
environ
[
"OMP_PLACES"
]
=
"{}"
.
format
(
place
[
"mask"
])
"number (%s)."
,
os
.
environ
[
"OMP_NUM_THREADS"
]
=
"{}"
.
format
(
total_cpu_num
,
len
(
place
[
"mask"
])
-
reserve
self
.
reserve_cpu_num
,
)
)
os
.
environ
[
"OMP_PROC_BIND"
]
=
"TRUE"
return
cpu_lists_of_ranks
,
[]
return
what
(
*
args
,
**
kwargs
)
raise
IndexError
(
"Out of OMP places"
)
reserve_num_per_rank
=
[
return
what
(
*
args
,
**
kwargs
)
self
.
reserve_cpu_num
//
self
.
local_world_size
]
*
self
.
local_world_size
# last rank first
for
i
in
range
(
self
.
local_world_size
-
1
,
self
.
local_world_size
-
1
-
self
.
reserve_cpu_num
%
self
.
local_world_size
,
-
1
,
):
reserve_num_per_rank
[
i
]
+=
1
for
i
in
range
(
self
.
local_world_size
):
num
=
reserve_num_per_rank
[
i
]
if
num
>
0
:
reserved_cpu_list
.
extend
(
cpu_lists_of_ranks
[
i
][
-
num
:])
cpu_lists_of_ranks
[
i
]
=
cpu_lists_of_ranks
[
i
][:
-
num
]
return
cpu_lists_of_ranks
,
reserved_cpu_list
vllm/v1/executor/multiproc_executor.py
View file @
d02421a7
...
@@ -51,6 +51,7 @@ from vllm.utils.network_utils import (
...
@@ -51,6 +51,7 @@ from vllm.utils.network_utils import (
get_loopback_ip
,
get_loopback_ip
,
get_open_port
,
get_open_port
,
)
)
from
vllm.utils.ompmultiprocessing
import
OMPProcessManager
from
vllm.utils.system_utils
import
(
from
vllm.utils.system_utils
import
(
_maybe_force_spawn
,
_maybe_force_spawn
,
decorate_logs
,
decorate_logs
,
...
@@ -169,24 +170,14 @@ class MultiprocExecutor(Executor):
...
@@ -169,24 +170,14 @@ class MultiprocExecutor(Executor):
[]
if
context
.
get_start_method
()
==
"fork"
else
None
[]
if
context
.
get_start_method
()
==
"fork"
else
None
)
)
# For CPU backend only, to setup OpenMP threads affinity
cpu_omp_manager
=
OMPProcessManager
(
self
.
vllm_config
)
for
local_rank
in
range
(
self
.
local_world_size
):
for
local_rank
in
range
(
self
.
local_world_size
):
global_rank
=
global_start_rank
+
local_rank
global_rank
=
global_start_rank
+
local_rank
is_driver_worker
=
self
.
_is_driver_worker
(
global_rank
)
is_driver_worker
=
self
.
_is_driver_worker
(
global_rank
)
if
current_platform
.
is_cpu
():
with
cpu_omp_manager
.
configure_omp_envs
(
om
=
current_platform
.
get_omp_manager
()
rank
=
global_rank
,
local_rank
=
local_rank
logger
.
info
(
"Configured OMP PLACES %s"
,
str
(
om
.
omp_places
))
):
unready_worker_handle
=
om
.
run
(
WorkerProc
.
make_worker_process
,
vllm_config
=
self
.
vllm_config
,
local_rank
=
local_rank
,
rank
=
global_rank
,
distributed_init_method
=
distributed_init_method
,
input_shm_handle
=
scheduler_output_handle
,
shared_worker_lock
=
shared_worker_lock
,
is_driver_worker
=
is_driver_worker
,
inherited_fds
=
inherited_fds
,
)
else
:
unready_worker_handle
=
WorkerProc
.
make_worker_process
(
unready_worker_handle
=
WorkerProc
.
make_worker_process
(
vllm_config
=
self
.
vllm_config
,
vllm_config
=
self
.
vllm_config
,
local_rank
=
local_rank
,
local_rank
=
local_rank
,
...
...
vllm/v1/worker/cpu_model_runner.py
View file @
d02421a7
...
@@ -116,21 +116,7 @@ class CPUModelRunner(GPUModelRunner):
...
@@ -116,21 +116,7 @@ class CPUModelRunner(GPUModelRunner):
logger
.
info
(
"Warming up model for the compilation..."
)
logger
.
info
(
"Warming up model for the compilation..."
)
# Only generate graph for the generic shape
# Only generate graph for the generic shape
with
_set_global_compilation_settings
(
self
.
vllm_config
):
with
_set_global_compilation_settings
(
self
.
vllm_config
):
self
.
_dummy_run
(
self
.
profile_run
()
min
(
max
(
16
,
self
.
max_num_reqs
),
self
.
scheduler_config
.
max_num_batched_tokens
,
)
)
# Warm up drafter for speculative decoding
if
self
.
speculative_config
and
(
self
.
speculative_config
.
uses_draft_model
()):
from
vllm.v1.spec_decode.draft_model
import
DraftModelProposer
if
isinstance
(
self
.
drafter
,
(
DraftModelProposer
)):
logger
.
info
(
"Warming up drafter model..."
)
self
.
drafter
.
dummy_run
(
max
(
16
,
self
.
max_num_reqs
))
logger
.
info
(
"Warming up done."
)
logger
.
info
(
"Warming up done."
)
def
initialize_kv_cache
(
def
initialize_kv_cache
(
...
...
vllm/v1/worker/cpu_worker.py
View file @
d02421a7
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
import
os
import
os
import
sys
import
sys
from
typing
import
Any
from
typing
import
Any
import
psutil
import
torch
import
torch
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.platforms
import
CpuArchEnum
,
current_platform
from
vllm.platforms
import
CpuArchEnum
,
current_platform
from
vllm.profiler.wrapper
import
TorchProfilerWrapper
from
vllm.profiler.wrapper
import
TorchProfilerWrapper
from
vllm.utils.cpu_resource_utils
import
(
get_allowed_cpu_list
,
get_memory_node_info
,
get_visible_memory_node
,
)
from
vllm.utils.mem_utils
import
format_gib
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.utils.torch_utils
import
set_random_seed
from
vllm.v1.worker.cpu_model_runner
import
CPUModelRunner
from
vllm.v1.worker.cpu_model_runner
import
CPUModelRunner
from
vllm.v1.worker.gpu_worker
import
Worker
,
init_worker_distributed_environment
from
vllm.v1.worker.gpu_worker
import
Worker
,
init_worker_distributed_environment
...
@@ -27,6 +35,46 @@ class CPUWorker(Worker):
...
@@ -27,6 +35,46 @@ class CPUWorker(Worker):
distributed_init_method
:
str
,
distributed_init_method
:
str
,
is_driver_worker
:
bool
=
False
,
is_driver_worker
:
bool
=
False
,
):
):
# TODO: use numactl for process setup
# TODO: optimize for `interleaved` policy
# Bind memory node
allowed_memory_nodes
=
get_visible_memory_node
()
allowed_cpu_list
=
get_allowed_cpu_list
()
cpu_core
=
allowed_cpu_list
[
0
]
# TODO: some CI hosts are not correctly set, change to assertion
# after fix
if
cpu_core
.
numa_node
not
in
allowed_memory_nodes
:
logger
.
warning
(
"Node %s is not in available memory nodes %s."
,
cpu_core
.
numa_node
,
allowed_memory_nodes
,
)
torch
.
ops
.
_C
.
init_cpu_memory_env
([
cpu_core
.
numa_node
])
memory_status
=
get_memory_node_info
(
cpu_core
.
numa_node
)
memory_fraction
=
vllm_config
.
cache_config
.
gpu_memory_utilization
self
.
requested_cpu_memory
=
math
.
ceil
(
memory_status
.
total_memory
*
memory_fraction
)
available_memory
=
memory_status
.
available_memory
if
(
vllm_config
.
cache_config
.
kv_cache_memory_bytes
is
None
and
self
.
requested_cpu_memory
>
available_memory
):
raise
ValueError
(
f
"Available memory on node
{
cpu_core
.
numa_node
}
"
f
"(
{
format_gib
(
available_memory
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
GiB) on startup "
f
"is less than desired CPU memory utilization "
f
"(
{
vllm_config
.
cache_config
.
gpu_memory_utilization
}
, "
f
"
{
format_gib
(
self
.
requested_cpu_memory
)
}
GiB). "
"Decrease --gpu-memory-utilization"
f
" or reduce CPU memory used by other processes."
)
super
().
__init__
(
super
().
__init__
(
vllm_config
,
vllm_config
,
local_rank
,
local_rank
,
...
@@ -103,13 +151,69 @@ class CPUWorker(Worker):
...
@@ -103,13 +151,69 @@ class CPUWorker(Worker):
pass
pass
def
determine_available_memory
(
self
)
->
int
:
def
determine_available_memory
(
self
)
->
int
:
return
self
.
cache_config
.
cpu_kvcache_space_bytes
or
0
self
.
model_runner
.
warming_up_model
()
allowed_cpu_list
=
get_allowed_cpu_list
()
cpu_core
=
allowed_cpu_list
[
0
]
memory_status
=
get_memory_node_info
(
cpu_core
.
numa_node
)
available_memory
=
memory_status
.
available_memory
explicit_kv_cache_size
=
self
.
cache_config
.
kv_cache_memory_bytes
kv_cache_size
=
None
msg
=
None
if
explicit_kv_cache_size
is
not
None
:
if
explicit_kv_cache_size
>
available_memory
:
raise
ValueError
(
f
"Available memory on node
{
cpu_core
.
numa_node
}
"
f
"(
{
format_gib
(
available_memory
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
GiB) on kv cache"
f
" allocation is less than requested memory for kv "
f
"(
{
format_gib
(
explicit_kv_cache_size
)
}
GiB). "
"Decrease --kv-cache-memory-bytes, VLLM_CPU_KVCACHE_SPACE, "
"or reduce CPU memory used by other processes."
)
kv_cache_size
=
explicit_kv_cache_size
msg
=
(
f
"Explicitly set (
{
format_gib
(
kv_cache_size
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
) GiB for KV cache "
f
"on node
{
cpu_core
.
numa_node
}
."
)
else
:
consumed_memory
=
psutil
.
Process
(
os
.
getpid
()).
memory_info
().
rss
requested_memory_for_kv
=
int
(
self
.
requested_cpu_memory
-
consumed_memory
)
if
(
requested_memory_for_kv
<=
0
or
requested_memory_for_kv
>
available_memory
):
raise
ValueError
(
f
"Available memory on node
{
cpu_core
.
numa_node
}
"
f
"(
{
format_gib
(
available_memory
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
GiB) on kv cache"
f
" allocation is less than requested memory for kv "
f
"(
{
format_gib
(
requested_memory_for_kv
)
}
/"
f
"
{
format_gib
(
self
.
requested_cpu_memory
)
}
GiB). "
"Reduce CPU memory used by other processes."
)
kv_cache_size
=
requested_memory_for_kv
msg
=
(
f
"Auto set (
{
format_gib
(
kv_cache_size
)
}
/"
f
"
{
format_gib
(
memory_status
.
total_memory
)
}
) GiB for KV cache "
f
"on node
{
cpu_core
.
numa_node
}
, with "
f
"
{
format_gib
(
self
.
requested_cpu_memory
)
}
GiB requested memory"
f
" for the worker.
{
format_gib
(
consumed_memory
)
}
GiB"
f
" memory was consumed by non-kv usages."
)
logger
.
info
(
msg
)
return
kv_cache_size
def
compile_or_warm_up_model
(
self
)
->
CompilationTimes
:
def
compile_or_warm_up_model
(
self
)
->
CompilationTimes
:
# Reset the seed to ensure that the random state is not affected by
# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
# the model initialization and profiling.
set_random_seed
(
self
.
model_config
.
seed
)
set_random_seed
(
self
.
model_config
.
seed
)
self
.
model_runner
.
warming_up_model
()
# Note: the model has been compiled in determine_available_memory
()
return
CompilationTimes
(
return
CompilationTimes
(
language_model
=
self
.
compilation_config
.
compilation_time
,
language_model
=
self
.
compilation_config
.
compilation_time
,
encoder
=
self
.
compilation_config
.
encoder_compilation_time
,
encoder
=
self
.
compilation_config
.
encoder_compilation_time
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment