Unverified Commit d02421a7 authored by Li, Jiang's avatar Li, Jiang Committed by GitHub
Browse files

[CPU] Refactor CPU affinity and memory management (#39781)


Signed-off-by: default avatarjiang1.li <jiang1.li@intel.com>
parent b1dc87a0
...@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then ...@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then
exit 1 exit 1
fi fi
#echo "--- DP+TP" echo "--- DP+TP"
#vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 & vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
#server_pid=$! server_pid=$!
#timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1 timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
#vllm bench serve \ vllm bench serve \
# --backend vllm \ --backend vllm \
# --dataset-name random \ --dataset-name random \
# --model meta-llama/Llama-3.2-3B-Instruct \ --model meta-llama/Llama-3.2-3B-Instruct \
# --num-prompts 20 \ --num-prompts 20 \
# --result-dir ./test_results \ --result-dir ./test_results \
# --result-filename dp_pp.json \ --result-filename dp_pp.json \
# --save-result \ --save-result \
# --endpoint /v1/completions --endpoint /v1/completions
#kill -s SIGTERM $server_pid; wait $server_pid || true kill -s SIGTERM $server_pid; wait $server_pid || true
#failed_req=$(jq '.failed' ./test_results/dp_pp.json) failed_req=$(jq '.failed' ./test_results/dp_pp.json)
#if [ "$failed_req" -ne 0 ]; then if [ "$failed_req" -ne 0 ]; then
# echo "Some requests were failed!" echo "Some requests were failed!"
# exit 1 exit 1
#fi fi
...@@ -45,6 +45,7 @@ jobs: ...@@ -45,6 +45,7 @@ jobs:
- name: Smoke test vllm serve - name: Smoke test vllm serve
run: | run: |
# Start server in background # Start server in background
VLLM_CPU_KVCACHE_SPACE=1 \
vllm serve Qwen/Qwen3-0.6B \ vllm serve Qwen/Qwen3-0.6B \
--max-model-len=2K \ --max-model-len=2K \
--load-format=dummy \ --load-format=dummy \
......
...@@ -30,6 +30,21 @@ else() ...@@ -30,6 +30,21 @@ else()
list(APPEND CXX_COMPILE_FLAGS list(APPEND CXX_COMPILE_FLAGS
"-fopenmp" "-fopenmp"
"-DVLLM_CPU_EXTENSION") "-DVLLM_CPU_EXTENSION")
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
find_library(OPEN_MP
NAMES gomp
PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if (OPEN_MP)
set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
endif()
endif() endif()
if (NOT MACOSX_FOUND) if (NOT MACOSX_FOUND)
...@@ -175,20 +190,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND ...@@ -175,20 +190,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
if(NOT NPROC) if(NOT NPROC)
set(NPROC 4) set(NPROC 4)
endif() endif()
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
find_library(OPEN_MP
NAMES gomp
PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
NO_DEFAULT_PATH
REQUIRED
)
# Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
if (OPEN_MP)
set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
endif()
# Fetch and populate ACL # Fetch and populate ACL
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}") if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
......
...@@ -141,6 +141,8 @@ void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc, ...@@ -141,6 +141,8 @@ void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc,
torch::Tensor slot_mapping, torch::Tensor slot_mapping,
const int64_t block_size); const int64_t block_size);
void init_cpu_memory_env(std::vector<int64_t> node_ids);
namespace cpu_utils { namespace cpu_utils {
void eagle_prepare_inputs_padded_kernel_impl( void eagle_prepare_inputs_padded_kernel_impl(
const torch::Tensor& cu_num_draft_tokens, const torch::Tensor& cu_num_draft_tokens,
...@@ -431,6 +433,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -431,6 +433,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"block_size) -> ()", "block_size) -> ()",
&compute_slot_mapping_kernel_impl); &compute_slot_mapping_kernel_impl);
ops.def("init_cpu_memory_env(SymInt[] node_ids) -> ()", &init_cpu_memory_env);
// Speculative decoding kernels // Speculative decoding kernels
ops.def( ops.def(
"eagle_prepare_inputs_padded_kernel_impl(Tensor cu_num_draft_tokens, " "eagle_prepare_inputs_padded_kernel_impl(Tensor cu_num_draft_tokens, "
......
...@@ -13,13 +13,80 @@ ...@@ -13,13 +13,80 @@
#include "cpu/utils.hpp" #include "cpu/utils.hpp"
#ifdef VLLM_NUMA_DISABLED #ifdef VLLM_NUMA_DISABLED
std::string init_cpu_threads_env(const std::string& cpu_ids) { void init_cpu_memory_env(std::vector<int64_t> node_ids) {}
return std::string( #else
"Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has " void init_cpu_memory_env(std::vector<int64_t> node_ids) {
"no effect to setup thread affinity."); // Memory node binding
} if (numa_available() != -1) {
// Concatenate all node_ids into a single comma-separated string
if (!node_ids.empty()) {
std::string node_ids_str;
for (const int node_id : node_ids) {
if (!node_ids_str.empty()) {
node_ids_str += ",";
}
node_ids_str += std::to_string(node_id);
}
#endif bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
bitmask* src_mask = numa_get_mems_allowed();
int pid = getpid();
if (mask && src_mask) {
// move all existing pages to the specified numa node.
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
int page_num = numa_migrate_pages(pid, src_mask, mask);
if (page_num == -1) {
TORCH_WARN("numa_migrate_pages failed. errno: " +
std::to_string(errno));
}
// Restrict memory allocation to the selected NUMA node(s).
// Enhances memory locality for the threads bound to those NUMA CPUs.
if (node_ids.size() > 1) {
errno = 0;
numa_set_interleave_mask(mask);
if (errno != 0) {
TORCH_WARN("numa_set_interleave_mask failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using INTERLEAVE policy for memory "
"allocation across multiple NUMA nodes (nodes: " +
node_ids_str +
"). Memory allocations will be "
"interleaved across the specified NUMA nodes.");
}
} else {
errno = 0;
numa_set_membind(mask);
if (errno != 0) {
TORCH_WARN("numa_set_membind failed. errno: " +
std::to_string(errno));
} else {
TORCH_WARN(
"NUMA binding: Using MEMBIND policy for memory "
"allocation on the NUMA nodes (" +
node_ids_str +
"). Memory allocations will be "
"strictly bound to these NUMA nodes.");
}
}
numa_set_strict(1);
numa_free_nodemask(mask);
numa_free_nodemask(src_mask);
} else {
TORCH_WARN(
"numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
std::to_string(errno));
}
}
}
}
#endif // VLLM_NUMA_DISABLED
namespace cpu_utils { namespace cpu_utils {
ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) { ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {
......
...@@ -173,7 +173,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -173,7 +173,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY --from=vllm-test-deps /vllm-workspace/requirements/test/cpu.txt requirements/test/cpu.txt COPY --from=vllm-test-deps /vllm-workspace/requirements/test/cpu.txt requirements/test/cpu.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt && \ uv pip install -r requirements/lint.txt && \
uv pip install -r requirements/test/cpu.txt && \
pre-commit install --hook-type pre-commit --hook-type commit-msg pre-commit install --hook-type pre-commit --hook-type commit-msg
ENTRYPOINT ["bash"] ENTRYPOINT ["bash"]
......
...@@ -46,7 +46,7 @@ AITER_MODEL_LIST = [ ...@@ -46,7 +46,7 @@ AITER_MODEL_LIST = [
), ),
pytest.param( pytest.param(
"openai-community/gpt2", # gpt2 "openai-community/gpt2", # gpt2
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model],
), ),
pytest.param("Milos/slovak-gpt-j-405M"), # gptj pytest.param("Milos/slovak-gpt-j-405M"), # gptj
pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode
...@@ -143,11 +143,6 @@ def test_models( ...@@ -143,11 +143,6 @@ def test_models(
# in parts of the operators # in parts of the operators
pytest.skip(f"Skipping '{model}' model test with AITER kernel.") pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
if current_platform.is_cpu() and model in ("openai-community/gpt2",):
# These models are sensitive to the rounding error
# Fuse ops to reduce rounding
monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
with hf_runner(model) as hf_model: with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit( hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs example_prompts, max_tokens, num_logprobs
......
...@@ -101,8 +101,6 @@ class CacheConfig: ...@@ -101,8 +101,6 @@ class CacheConfig:
kv_cache_dtype_skip_layers: list[str] = field(default_factory=list) kv_cache_dtype_skip_layers: list[str] = field(default_factory=list)
"""Layer patterns to skip KV cache quantization. Accepts layer indices """Layer patterns to skip KV cache quantization. Accepts layer indices
(e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window').""" (e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window')."""
cpu_kvcache_space_bytes: int | None = None
"""(CPU backend only) CPU key-value cache space."""
mamba_page_size_padded: int | None = None mamba_page_size_padded: int | None = None
""" Optional override for mamba page size; used by hybrid mamba/attention """ Optional override for mamba page size; used by hybrid mamba/attention
models to ensure exact alignment with attention page size.""" models to ensure exact alignment with attention page size."""
...@@ -183,7 +181,6 @@ class CacheConfig: ...@@ -183,7 +181,6 @@ class CacheConfig:
"num_gpu_blocks_override", "num_gpu_blocks_override",
"enable_prefix_caching", "enable_prefix_caching",
"prefix_caching_hash_algo", "prefix_caching_hash_algo",
"cpu_kvcache_space_bytes",
"mamba_page_size_padded", "mamba_page_size_padded",
"user_specified_block_size", "user_specified_block_size",
"user_specified_mamba_block_size", "user_specified_mamba_block_size",
......
...@@ -6,15 +6,16 @@ import os ...@@ -6,15 +6,16 @@ import os
import platform import platform
import subprocess import subprocess
import sys import sys
from dataclasses import dataclass
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import psutil
import torch import torch
from vllm import envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.ompmultiprocessing import OMPProcessManager from vllm.utils.cpu_resource_utils import (
DEVICE_CONTROL_ENV_VAR,
get_memory_node_info,
)
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.torch_utils import is_quantized_kv_cache from vllm.utils.torch_utils import is_quantized_kv_cache
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
...@@ -38,49 +39,13 @@ def get_max_threads(pid=0): ...@@ -38,49 +39,13 @@ def get_max_threads(pid=0):
raise NotImplementedError("Unsupported OS") raise NotImplementedError("Unsupported OS")
@dataclass
class LogicalCPUInfo:
id: int = -1
physical_core: int = -1
numa_node: int = -1
@classmethod
def _int(cls, value: str) -> int:
try:
int_value = int(value)
except Exception:
int_value = -1
return int_value
@staticmethod
def json_decoder(obj_dict: dict):
id = obj_dict.get("cpu")
physical_core = obj_dict.get("core")
numa_node = obj_dict.get("node")
if not (id is None or physical_core is None or numa_node is None):
return LogicalCPUInfo(
id=LogicalCPUInfo._int(id),
physical_core=LogicalCPUInfo._int(physical_core),
numa_node=LogicalCPUInfo._int(numa_node),
)
else:
return obj_dict
class CpuPlatform(Platform): class CpuPlatform(Platform):
_enum = PlatformEnum.CPU _enum = PlatformEnum.CPU
device_name: str = "cpu" device_name: str = "cpu"
device_type: str = "cpu" device_type: str = "cpu"
dispatch_key: str = "CPU" dispatch_key: str = "CPU"
dist_backend: str = "gloo" dist_backend: str = "gloo"
device_control_env_var = "CPU_VISIBLE_MEMORY_NODES" device_control_env_var = DEVICE_CONTROL_ENV_VAR
omp_process_manager = None
# Simultaneous Multithreading (SMT) level for OpenMP:
# 4 on PowerPC, 1 on non-PowerPC architectures
smt = 1
global_cpu_mask = None
simulate_numa = int(os.environ.get("_SIM_MULTI_NUMA", 0))
@property @property
def supported_dtypes(self) -> list[torch.dtype]: def supported_dtypes(self) -> list[torch.dtype]:
...@@ -123,29 +88,9 @@ class CpuPlatform(Platform): ...@@ -123,29 +88,9 @@ class CpuPlatform(Platform):
@classmethod @classmethod
def get_device_total_memory(cls, device_id: int = 0) -> int: def get_device_total_memory(cls, device_id: int = 0) -> int:
from vllm.utils.mem_constants import GiB_bytes meminfo = get_memory_node_info(device_id)
from vllm.utils.mem_utils import format_gib
kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
node_dir = "/sys/devices/system/node"
if kv_cache_space is None:
nodes = (
[d for d in os.listdir(node_dir) if d.startswith("node")]
if os.path.exists(node_dir)
else []
)
num_numa_nodes = len(nodes) or 1
free_cpu_memory = psutil.virtual_memory().total // num_numa_nodes
DEFAULT_CPU_MEM_UTILIZATION = 0.5
kv_cache_space = int(free_cpu_memory * DEFAULT_CPU_MEM_UTILIZATION)
logger.warning_once(
"VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache.",
format_gib(kv_cache_space),
)
else:
kv_cache_space *= GiB_bytes
return kv_cache_space return meminfo.total_memory
@classmethod @classmethod
def set_device(cls, device: torch.device) -> None: def set_device(cls, device: torch.device) -> None:
...@@ -180,6 +125,12 @@ class CpuPlatform(Platform): ...@@ -180,6 +125,12 @@ class CpuPlatform(Platform):
"otherwise the performance is not optimized." "otherwise the performance is not optimized."
) )
# Lagecy setting
env_key = "VLLM_CPU_KVCACHE_SPACE"
if env_key in os.environ and os.environ[env_key] != "":
kv_cache_space = int(os.environ[env_key])
cache_config.kv_cache_memory_bytes = kv_cache_space * GiB_bytes
scheduler_config = vllm_config.scheduler_config scheduler_config = vllm_config.scheduler_config
# async scheduling is not required on CPU # async scheduling is not required on CPU
scheduler_config.async_scheduling = False scheduler_config.async_scheduling = False
...@@ -198,8 +149,6 @@ class CpuPlatform(Platform): ...@@ -198,8 +149,6 @@ class CpuPlatform(Platform):
) )
cache_config.cache_dtype = "auto" cache_config.cache_dtype = "auto"
cache_config.cpu_kvcache_space_bytes = CpuPlatform.get_device_total_memory()
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
# OMP requires the MP executor to function correctly, UniProc is not # OMP requires the MP executor to function correctly, UniProc is not
# supported as it is not possible to set the OMP environment correctly # supported as it is not possible to set the OMP environment correctly
...@@ -278,21 +227,45 @@ class CpuPlatform(Platform): ...@@ -278,21 +227,45 @@ class CpuPlatform(Platform):
os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1" os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1"
ld_preload_str = os.getenv("LD_PRELOAD", "") ld_preload_str = os.getenv("LD_PRELOAD", "")
# Intel and CLANG OpenMP setting
if "libiomp5.so" in ld_preload_str or "libomp5" in ld_preload_str:
# The time(milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
os.environ["KMP_BLOCKTIME"] = "1"
# Prevents the CPU to run into low performance state
os.environ["KMP_TPAUSE"] = "0"
# Provides fine granularity parallelism
os.environ["KMP_FORKJOIN_BARRIER_PATTERN"] = "dist,dist"
os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
cpu_architecture = Platform.get_cpu_architecture() cpu_architecture = Platform.get_cpu_architecture()
if (
platform.system() == "Linux"
and cpu_architecture
in (CpuArchEnum.ARM, CpuArchEnum.POWERPC, CpuArchEnum.X86)
and not (
"libomp" in ld_preload_str
or "libgomp" in ld_preload_str
or "libiomp" in ld_preload_str
)
):
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
# one core will be properly utilized when we thread-bind
# See: https://github.com/vllm-project/vllm/issues/27369
# TODO: Remove once:
# https://github.com/pytorch/pytorch/issues/166087 is fixed
# We need to find the location of PyTorch's libgomp
torch_pkg = os.path.dirname(torch.__file__)
site_root = os.path.dirname(torch_pkg)
# Search both torch.libs and torch/lib - See:
# https://github.com/vllm-project/vllm/issues/30470
torch_libs_paths = [
os.path.join(site_root, "torch.libs"),
os.path.join(torch_pkg, "lib"),
]
pytorch_libgomp_so_candidates = []
for torch_libs in torch_libs_paths:
pytorch_libgomp_so_candidates.extend(
glob.glob(os.path.join(torch_libs, "libgomp*.so*"))
)
if pytorch_libgomp_so_candidates:
pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
if ld_preload_str:
ld_preload_str += ":"
ld_preload_str += pytorch_libgomp_so
os.environ["LD_PRELOAD"] = ld_preload_str
# LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce # LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce
# memory allocation overhead # memory allocation overhead
if ( if (
...@@ -331,13 +304,6 @@ class CpuPlatform(Platform): ...@@ -331,13 +304,6 @@ class CpuPlatform(Platform):
vllm_config.model_config.max_model_len, vllm_config.model_config.max_model_len,
vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS, vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
) )
# CI specific "quick" NUMA simulation - split all available CPUs
# into a fake NUMA topology
if os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", None) is not None:
os.environ["_SIM_MULTI_NUMA"] = str(
vllm_config.parallel_config.world_size
* vllm_config.parallel_config._api_process_count
)
@classmethod @classmethod
def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None: def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
...@@ -345,78 +311,6 @@ class CpuPlatform(Platform): ...@@ -345,78 +311,6 @@ class CpuPlatform(Platform):
# Move that logic here so block_size is chosen by the backend. # Move that logic here so block_size is chosen by the backend.
pass pass
@classmethod
def get_omp_manager(cls) -> OMPProcessManager:
# initialise the OMP resource management if need be and return the manager
if cls.omp_process_manager is None:
if cls.get_cpu_architecture() == CpuArchEnum.POWERPC:
cls.smt = 4
cls.omp_process_manager = OMPProcessManager(
affinity=cls.get_global_cpu_mask(), smt=cls.smt
)
# we need to fix up the topology returned by the OMP Manager for
# simulated NUMA environments in CI
if cls.simulate_numa > 0:
logger.info(
"Adjusting numa topology to resemble at least %d nodes",
int(cls.simulate_numa),
)
om = cls.omp_process_manager
while len(om.omp_places) < cls.simulate_numa:
new_omp_places = []
touched = False
for omp_place in om.omp_places:
if len(omp_place["mask"]) > 1:
touched = True
cpu_list = sorted(list(omp_place["mask"]))
new_omp_places.append(
{
"mask": set(cpu_list[0 : int(len(cpu_list) / 2)]),
"available": True,
}
)
new_omp_places.append(
{
"mask": set(cpu_list[int(len(cpu_list) / 2) :]),
"available": True,
}
)
if touched:
om.omp_places = new_omp_places
else:
raise ValueError(
"Cannot split the existing NUMA topology to match "
"simulation requirements"
)
return cls.omp_process_manager
@classmethod
def get_global_cpu_mask(cls) -> set[int]:
# get global cpu mask
if cls.global_cpu_mask is None:
if hasattr(os, "sched_getaffinity"):
cls.global_cpu_mask = os.sched_getaffinity(0)
else:
# macOS does not support sched_getaffinity
cpu_count = os.cpu_count() or 1
cls.global_cpu_mask = set(range(cpu_count))
return cls.global_cpu_mask
@classmethod
def reserve_cpus(cls, reserve: set[int]) -> bool:
# remove CPUs from global mask, for now there is no "release" mechanism
if cls.omp_process_manager is not None:
for place in cls.omp_process_manager.omp_places:
if not place["available"]:
return False
cls.global_cpu_mask = cls.get_global_cpu_mask() - reserve
# reinitialize OMP resource management
cls.omp_process_manager = OMPProcessManager(
affinity=cls.global_cpu_mask, smt=cls.smt
)
return True
@classmethod @classmethod
def discover_numa_topology(cls) -> list[list[int]]: def discover_numa_topology(cls) -> list[list[int]]:
""" """
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os
import platform
import subprocess
from dataclasses import dataclass
from functools import cache
import psutil
import regex as re
DEVICE_CONTROL_ENV_VAR = "CPU_VISIBLE_MEMORY_NODES"
@dataclass
class LogicalCPUInfo:
id: int = -1
physical_core: int = -1
numa_node: int = -1
@classmethod
def _int(cls, value: str) -> int:
try:
int_value = int(value)
except Exception:
int_value = -1
return int_value
@staticmethod
def json_decoder(obj_dict: dict):
id = obj_dict.get("cpu")
physical_core = obj_dict.get("core")
numa_node = obj_dict.get("node")
if not (id is None or physical_core is None or numa_node is None):
return LogicalCPUInfo(
id=LogicalCPUInfo._int(id),
physical_core=LogicalCPUInfo._int(physical_core),
numa_node=LogicalCPUInfo._int(numa_node),
)
else:
return obj_dict
@dataclass
class MemoryNodeInfo:
total_memory: int = -1
available_memory: int = -1
def get_memory_affinity(pid: int = 0) -> list[int]:
pid = os.getpid() if pid == 0 else pid
path = f"/proc/{pid}/status"
with open(path) as f:
for line in f:
if line.startswith("Mems_allowed_list:"):
# Extract the string part (e.g., "0-1,3")
raw_list = line.split(":")[1].strip()
return parse_id_list(raw_list)
return []
def parse_id_list(raw_str: str) -> list[int]:
"""Parses strings like '0-2,4,7-8' into [0, 1, 2, 4, 7, 8]"""
result: list[int] = []
if not raw_str:
return result
for part in raw_str.split(","):
if "-" in part:
start, end = map(int, part.split("-"))
result.extend(range(start, end + 1))
else:
result.append(int(part))
return sorted(list(set(result)))
def get_memory_node_info(node_id: int = 0) -> MemoryNodeInfo:
if platform.system() == "Darwin":
# MacOS has no memory node
return MemoryNodeInfo(
total_memory=psutil.virtual_memory().total,
available_memory=psutil.virtual_memory().available,
)
meminfo_path = f"/sys/devices/system/node/node{node_id}/meminfo"
if not os.path.exists(meminfo_path):
raise RuntimeError(f"{meminfo_path} doesn't exit.")
meminfo = {}
with open(meminfo_path) as f:
for line in f:
# Each line looks like: "Node 0 MemTotal: 97421888 kB"
parts = line.split()
key = parts[2].rstrip(":")
# convert to Bytes
value = int(parts[3]) * 1024
meminfo[key] = value
total_memory = meminfo["MemTotal"]
free_memory = meminfo["MemFree"]
active_file_memory = meminfo["Active(file)"]
inactive_file_memory = meminfo["Inactive(file)"]
reclaimable_memory = meminfo["SReclaimable"]
available_memory = (
free_memory + active_file_memory + inactive_file_memory + reclaimable_memory
)
return MemoryNodeInfo(
total_memory=total_memory,
available_memory=available_memory,
)
def get_allowed_cpu_list() -> list[LogicalCPUInfo]:
cpu_list = _get_cpu_list()
if platform.system() == "Darwin":
return cpu_list
global_allowed_cpu_id_list = os.sched_getaffinity(0)
logical_cpu_list = [x for x in cpu_list if x.id in global_allowed_cpu_id_list]
return logical_cpu_list
def get_visible_memory_node() -> list[int]:
if platform.system() == "Darwin":
return [0]
allowed_memory_node_list = get_memory_affinity()
env_key = DEVICE_CONTROL_ENV_VAR
if (
("VLLM_CPU_SIM_MULTI_NUMA" not in os.environ)
and env_key in os.environ
and os.environ[env_key] != ""
):
visible_nodes = [int(s) for s in os.environ[env_key].split(",")]
visible_nodes = [
node for node in visible_nodes if node in allowed_memory_node_list
]
return visible_nodes
return allowed_memory_node_list
@cache
def _get_cpu_list() -> list[LogicalCPUInfo]:
if platform.system() == "Darwin":
# For MacOS, no user-level CPU affinity and SMT, return all CPUs
cpu_count = os.cpu_count()
assert cpu_count
return [LogicalCPUInfo(i, i, 0) for i in range(cpu_count)]
lscpu_output = subprocess.check_output(
"lscpu -J -e=CPU,CORE,NODE", shell=True, text=True
)
# For platform without NUMA, replace '-' to '0'
lscpu_output = re.sub(r'"node":\s*-\s*(,|\n)', r'"node": 0\1', lscpu_output)
logical_cpu_list: list[LogicalCPUInfo] = json.loads(
lscpu_output, object_hook=LogicalCPUInfo.json_decoder
)["cpus"]
# Filter CPUs with invalid attributes
logical_cpu_list = [
x for x in logical_cpu_list if -1 not in (x.id, x.physical_core, x.numa_node)
]
return logical_cpu_list
...@@ -5,196 +5,280 @@ Copyright (c) 2026 Red Hat Inc ...@@ -5,196 +5,280 @@ Copyright (c) 2026 Red Hat Inc
Copyright (c) 2026 Cambridge Greys Ltd Copyright (c) 2026 Cambridge Greys Ltd
""" """
import json
import os import os
import platform from collections.abc import Callable
import subprocess from contextlib import contextmanager
from typing import TYPE_CHECKING
def _int(arg): import vllm.utils.cpu_resource_utils as cr_utils
"""Relaxed parsing of ints which handles a - instead of a number. from vllm import envs
The lscpu json may contain that for nodes in some cases. If that from vllm.logger import init_logger
is the case we parse it to zero from vllm.platforms import CpuArchEnum, current_platform
""" from vllm.utils.cpu_resource_utils import LogicalCPUInfo
try:
if int(arg) >= 0: if TYPE_CHECKING:
return int(arg) from vllm.config import VllmConfig
except ValueError:
pass logger = init_logger(__name__)
return 0
def parse_mask(mask):
"""Expand a X-Y,Z list"""
result = []
for token in mask.split(","):
try:
start, finish = token.split("-")
if int(start) > int(finish):
raise IndexError("Invalid Indexes for cpu ranges")
for cpu in range(int(start), int(finish) + 1):
result.append(cpu)
except ValueError:
result.append(int(token))
return set(result)
def _get_default_affinity() -> set[int]:
"""Get the set of CPUs the process is allowed to run on."""
if hasattr(os, "sched_getaffinity"):
return os.sched_getaffinity(0)
# macOS does not support sched_getaffinity; fall back to cpu_count
cpu_count = os.cpu_count() or 1
return set(range(cpu_count))
def _get_cpu_topology_json() -> bytes:
"""Get CPU topology as JSON.
On Linux this uses ``lscpu -Je``. On other platforms (e.g. macOS) we
synthesize a simple topology where every logical CPU is its own core
on NUMA node 0, which is sufficient for the OMP place-list builder.
"""
if platform.system() == "Linux":
return subprocess.run(["lscpu", "-Je"], check=True, capture_output=True).stdout
# Fallback for non-Linux (macOS, etc.)
cpu_count = os.cpu_count() or 1
cpus = []
for i in range(cpu_count):
cpus.append({"cpu": str(i), "core": str(i), "node": "0"})
return json.dumps({"cpus": cpus}).encode()
def enumerate_resources(resource_map, mask=None, allowed=None):
"""Enumerate system resources"""
if allowed is None:
allowed = _get_default_affinity()
if mask is not None:
allowed = allowed & mask
try:
allowed_nodes = parse_mask(os.environ["CPU_VISIBLE_MEMORY_NODES"])
except KeyError:
allowed_nodes = None
lscpu: dict[str, dict] = {"cpus": {}, "cores": {}, "nodes": {}}
for cpu in resource_map["cpus"]:
cpunum = int(cpu["cpu"])
if (
cpunum in allowed
and cpunum >= 0
and (allowed_nodes is None or _int(cpu["node"]) in allowed_nodes)
):
lscpu["cpus"][cpunum] = [cpu]
core = _int(cpu["core"])
if lscpu["cores"].get(core, None) is None:
lscpu["cores"][core] = [cpu]
else:
lscpu["cores"][core].append(cpu)
node = _int(cpu["node"])
if lscpu["nodes"].get(node, None) is None:
lscpu["nodes"][node] = [cpu]
else:
lscpu["nodes"][node].append(cpu)
return lscpu
def produce_cpu_list(cpus, smt=1):
"""Produce a CPU list with/without SMT pairs - main cpu list case"""
mask: list[int] = []
for key, value in cpus.items():
exists = 0
for cpu in mask:
if cpu == value[0]["core"]:
exists += 1
break
if exists < smt:
mask.append(int(key))
return {"mask": set(mask), "available": True}
def produce_cpu_sublist(scpus, smt=1):
"""Produce a CPU list with/without SMT pairs - resource leaf case"""
cpu_list: list[dict] = []
for value in scpus:
exists = 0
for cpu in cpu_list:
if int(cpu["core"]) == int(value["core"]):
exists += 1
break
if exists < smt:
cpu_list.append(value)
mask = []
for cpu in cpu_list:
mask.append(int(cpu["cpu"]))
return {"mask": set(mask), "available": True}
def create_omp_places(resources, strategy, smt=True):
"""Parse CPU topology and generate possible CPU masks"""
omp_places = []
if strategy == "all":
omp_places.append(produce_cpu_list(resources["cpus"], smt))
elif strategy == "cores":
for value in resources["cores"].values():
omp_places.append(produce_cpu_sublist(value, smt))
elif strategy == "nodes":
for value in resources["nodes"].values():
omp_places.append(produce_cpu_sublist(value, smt))
else:
raise NotImplementedError("Unknown strategy")
return omp_places
# pylint: disable=too-few-public-methods
class OMPProcessManager: class OMPProcessManager:
"""OMP aware wrapper to run mp Process()""" def __init__(self, config: "VllmConfig"):
if not current_platform.is_cpu():
def __init__(self, strategy="nodes", smt=1, mock=None, affinity=None): return
self.strategy = strategy
self.smt = smt self.local_world_size = config.parallel_config.local_world_size
self.omp_places = [] self.local_dp_rank = config.parallel_config.data_parallel_rank_local
vllm_mask = os.environ.get("VLLM_CPU_OMP_THREADS_BIND", None) # This is a bit tricky because the internal DP size
self.setup_omp = vllm_mask != "nobind" # is always 1 for non-MoE models
if self.setup_omp: self.internal_dp_size = config.parallel_config._api_process_count
omp_places = []
if vllm_mask is not None: self.simulate_multi_node = os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", "0") != "0"
masks = [] ld_preload_str = os.getenv("LD_PRELOAD", "")
for spec in vllm_mask.split("|"): self.use_iomp = "libiomp" in ld_preload_str or "libomp" in ld_preload_str
masks.append(parse_mask(spec)) self.use_gomp = "libgomp" in ld_preload_str
assert not (self.use_iomp and self.use_gomp)
# at least reserve 1/local_world_size(for ARM) core for scheduler
# proc as always use MP executor
# TODO: make scheduler proc sleep when idle
self.reserve_cpu_num = (
self.local_world_size
if current_platform.get_cpu_architecture() == CpuArchEnum.ARM
else 1
)
# reserve at one more core for nixl_connector under p/d case
if config.kv_transfer_config:
self.reserve_cpu_num += 1
if envs.VLLM_CPU_NUM_OF_RESERVED_CPU is not None:
if self.reserve_cpu_num > envs.VLLM_CPU_NUM_OF_RESERVED_CPU:
msg = (
f"VLLM_CPU_NUM_OF_RESERVED_CPU is less than "
"the minimum requirement"
f": {self.reserve_cpu_num} cores"
)
logger.warning(msg=msg)
self.reserve_cpu_num = envs.VLLM_CPU_NUM_OF_RESERVED_CPU
self._parse_omp_threads_bind_env()
assert not self.simulate_multi_node or self.auto_setup
@contextmanager
def configure_omp_envs(self, rank: int, local_rank: int):
if not current_platform.is_cpu() or self.skip_setup:
yield
return
envs_dict = {}
cpu_list = [str(i) for i in self.cpu_lists[local_rank]]
envs_dict["OMP_NUM_THREADS"] = str(len(cpu_list))
if self.use_iomp:
# set IOMP envs
cpu_list_str = ",".join(cpu_list)
envs_dict["KMP_AFFINITY"] = (
f"granularity=fine,explicit,proclist=[{cpu_list_str}]"
)
# The time(milliseconds) that a thread should wait after
# completing the execution of a parallel region, before sleeping.
envs_dict["KMP_BLOCKTIME"] = "1"
# Prevents the CPU to run into low performance state
envs_dict["KMP_TPAUSE"] = "0"
# Provides fine granularity parallelism
envs_dict["KMP_FORKJOIN_BARRIER_PATTERN"] = "dist,dist"
envs_dict["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
envs_dict["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
elif self.use_gomp:
# set GOMP envs
# likes '0 1 2 ...'
cpu_list_str = " ".join(cpu_list)
envs_dict["GOMP_CPU_AFFINITY"] = cpu_list_str
else:
# set OMP envs
# likes '{0,1,2,...}'
cpu_list_str = ",".join(cpu_list)
envs_dict["OMP_PLACES"] = f"{{{cpu_list_str}}}"
envs_dict["OMP_PROC_BIND"] = "true"
# backup envs
old_envs_dict = {}
for k in envs_dict:
old_envs_dict[k] = os.environ.get(k)
try:
# set envs
for k, v in envs_dict.items():
os.environ[k] = v
yield
finally:
# restore old envs
for k, v in old_envs_dict.items(): # type: ignore
if v is None:
os.environ.pop(k, None)
else:
os.environ[k] = v
def _parse_omp_threads_bind_env(self):
vllm_mask = envs.VLLM_CPU_OMP_THREADS_BIND
self.skip_setup = vllm_mask == "nobind"
self.auto_setup = vllm_mask == "auto"
self.reserved_cpu_list = []
self.cpu_lists = []
if self.auto_setup:
# auto generate CPU lists
cpu_arch = current_platform.get_cpu_architecture()
if cpu_arch == CpuArchEnum.POWERPC:
# For POWERPC SMT-8/4/2
cpu_list, reserve_list = self._get_autobind_cpu_ids(
lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4]
)
elif cpu_arch in (CpuArchEnum.X86, CpuArchEnum.S390X):
# For x86/S390X SMT-2, use 1 logical CPU per physical core
cpu_list, reserve_list = self._get_autobind_cpu_ids(
lambda cpus: cpus[-1:]
)
elif cpu_arch == CpuArchEnum.ARM:
# For AArch64, no SMT, use all logical CPU
cpu_list, reserve_list = self._get_autobind_cpu_ids(lambda cpus: cpus)
else: else:
masks = [None] cpu_list, reserve_list = [], []
if mock is None: raise RuntimeError(f"{cpu_arch} doesn't support auto CPU binding.")
data = _get_cpu_topology_json()
for item in cpu_list:
self.cpu_lists.append([x.id for x in item])
self.reserved_cpu_list = [x.id for x in reserve_list]
elif not self.skip_setup:
# user defined CPU lists
omp_cpuids_list = vllm_mask.split("|")
if self.local_dp_rank is not None:
local_dp_rank = self.local_dp_rank
world_size = self.local_world_size
# Rank mapping [DP, PP, TP]
omp_cpuids_list = omp_cpuids_list[
local_dp_rank * world_size : (local_dp_rank + 1) * world_size
]
assert len(omp_cpuids_list) == self.local_world_size, (
"Given "
f"number of CPU id list {omp_cpuids_list} doesn't match "
f"local world size {self.local_world_size}."
)
# parse CPU list strings like "5,2-4" to [5, 2, 3, 4]
self.cpu_lists = [cr_utils.parse_id_list(s) for s in omp_cpuids_list]
else:
# skip
self.cpu_lists = []
msg = "OpenMP thread binding info: \n"
for i in range(self.local_world_size):
msg += f"\tlocal_rank={i}, core ids={self.cpu_lists[i]}\n"
msg += f"\treserved_cpus={self.reserved_cpu_list}"
logger.info(msg)
def _get_autobind_cpu_ids(
self, cpu_selector: Callable[[list[LogicalCPUInfo]], list[LogicalCPUInfo]]
) -> tuple[list[list[LogicalCPUInfo]], list[LogicalCPUInfo]]:
"""
Return CPU ids to bind based on NUMA nodes, and CPU ids reserved for
other processes.
Currently for rank N, only CPU ids on the N-th node in available NUMA
node list will be selected.
Args:
cpu_selector: a callable object to select CPUs from a CPU list
of a physical core. The input is a LogicalCPUInfo list contains
logical CPUs of a physical CPU, sorted by the LogicalCPUInfo.id.
A selected LogicalCPUInfo list should be returned.
"""
# this memory node list has been sliced for DP offset
allowed_numa_nodes = cr_utils.get_visible_memory_node()
logical_cpu_list = cr_utils.get_allowed_cpu_list()
local_world_size = self.local_world_size
assert (
len(allowed_numa_nodes) >= local_world_size or self.simulate_multi_node
), (
f"Not enough allowed NUMA nodes to bind threads of "
f"{local_world_size} local CPUWorkers. "
f"Allowed NUMA nodes are {allowed_numa_nodes}. "
"Please try to bind threads manually or decrease DP/TP/PP."
)
# Generate OMP CPU list for each rank
cpu_lists_of_ranks = []
reserved_cpu_list = []
total_cpu_num = 0
for local_rank in range(self.local_world_size):
if not self.simulate_multi_node:
selected_numa_node = allowed_numa_nodes[local_rank]
selected_logical_cpu_list = [
x for x in logical_cpu_list if x.numa_node == selected_numa_node
]
else: else:
with open(mock, mode="rb") as jf: world_size_across_dp = self.local_world_size * self.internal_dp_size
data = jf.read() assert len(logical_cpu_list) >= world_size_across_dp
lscpu = json.loads(data) selected_logical_cpu_list = sorted(
for mask in masks: logical_cpu_list, key=lambda x: x.numa_node
resources = enumerate_resources(lscpu, mask, affinity) )
omp_places.extend(create_omp_places(resources, strategy, smt)) sim_cpu_num_per_node = (
self.omp_places = sorted( len(selected_logical_cpu_list) // world_size_across_dp
omp_places, )
key=lambda p: "{:04d}-{:04d}".format(len(p["mask"]), max(p["mask"])), assert self.local_dp_rank is not None
reverse=True, start_idx = (
local_rank + self.local_world_size * self.local_dp_rank
) * sim_cpu_num_per_node
selected_logical_cpu_list = selected_logical_cpu_list[
start_idx : (start_idx + sim_cpu_num_per_node)
]
# Select logical CPUs on same physical cores via cpu_selector
core_to_cpus: dict[int, list[LogicalCPUInfo]] = {}
for cpu_info in selected_logical_cpu_list:
if cpu_info.physical_core not in core_to_cpus:
core_to_cpus[cpu_info.physical_core] = []
core_to_cpus[cpu_info.physical_core].append(cpu_info)
selected_logical_cpu_list = []
for cpu_list in core_to_cpus.values():
cpu_list = sorted(cpu_list, key=lambda x: x.id)
selected_logical_cpu_list.extend(cpu_selector(cpu_list))
# sort selected cores based on core id
selected_logical_cpu_list = sorted(
selected_logical_cpu_list, key=lambda x: x.id
) )
def run(self, what, *args, **kwargs): cpu_lists_of_ranks.append(selected_logical_cpu_list)
"""Run arg with correct OMP environment""" total_cpu_num += len(selected_logical_cpu_list)
if self.setup_omp:
for place in self.omp_places: # Reserve CPUs for other processes
if place["available"]: if total_cpu_num <= self.reserve_cpu_num:
reserve = int(os.environ.get("VLLM_CPU_NUM_OF_RESERVED_CPU", 0)) logger.warning(
place["available"] = False "Selected CPU core number (%s) "
# pylint: disable=consider-using-f-string "should be greater than reserved CPU core "
os.environ["OMP_PLACES"] = "{}".format(place["mask"]) "number (%s).",
os.environ["OMP_NUM_THREADS"] = "{}".format( total_cpu_num,
len(place["mask"]) - reserve self.reserve_cpu_num,
) )
os.environ["OMP_PROC_BIND"] = "TRUE" return cpu_lists_of_ranks, []
return what(*args, **kwargs)
raise IndexError("Out of OMP places") reserve_num_per_rank = [
return what(*args, **kwargs) self.reserve_cpu_num // self.local_world_size
] * self.local_world_size
# last rank first
for i in range(
self.local_world_size - 1,
self.local_world_size - 1 - self.reserve_cpu_num % self.local_world_size,
-1,
):
reserve_num_per_rank[i] += 1
for i in range(self.local_world_size):
num = reserve_num_per_rank[i]
if num > 0:
reserved_cpu_list.extend(cpu_lists_of_ranks[i][-num:])
cpu_lists_of_ranks[i] = cpu_lists_of_ranks[i][:-num]
return cpu_lists_of_ranks, reserved_cpu_list
...@@ -51,6 +51,7 @@ from vllm.utils.network_utils import ( ...@@ -51,6 +51,7 @@ from vllm.utils.network_utils import (
get_loopback_ip, get_loopback_ip,
get_open_port, get_open_port,
) )
from vllm.utils.ompmultiprocessing import OMPProcessManager
from vllm.utils.system_utils import ( from vllm.utils.system_utils import (
_maybe_force_spawn, _maybe_force_spawn,
decorate_logs, decorate_logs,
...@@ -169,24 +170,14 @@ class MultiprocExecutor(Executor): ...@@ -169,24 +170,14 @@ class MultiprocExecutor(Executor):
[] if context.get_start_method() == "fork" else None [] if context.get_start_method() == "fork" else None
) )
# For CPU backend only, to setup OpenMP threads affinity
cpu_omp_manager = OMPProcessManager(self.vllm_config)
for local_rank in range(self.local_world_size): for local_rank in range(self.local_world_size):
global_rank = global_start_rank + local_rank global_rank = global_start_rank + local_rank
is_driver_worker = self._is_driver_worker(global_rank) is_driver_worker = self._is_driver_worker(global_rank)
if current_platform.is_cpu(): with cpu_omp_manager.configure_omp_envs(
om = current_platform.get_omp_manager() rank=global_rank, local_rank=local_rank
logger.info("Configured OMP PLACES %s", str(om.omp_places)) ):
unready_worker_handle = om.run(
WorkerProc.make_worker_process,
vllm_config=self.vllm_config,
local_rank=local_rank,
rank=global_rank,
distributed_init_method=distributed_init_method,
input_shm_handle=scheduler_output_handle,
shared_worker_lock=shared_worker_lock,
is_driver_worker=is_driver_worker,
inherited_fds=inherited_fds,
)
else:
unready_worker_handle = WorkerProc.make_worker_process( unready_worker_handle = WorkerProc.make_worker_process(
vllm_config=self.vllm_config, vllm_config=self.vllm_config,
local_rank=local_rank, local_rank=local_rank,
......
...@@ -116,21 +116,7 @@ class CPUModelRunner(GPUModelRunner): ...@@ -116,21 +116,7 @@ class CPUModelRunner(GPUModelRunner):
logger.info("Warming up model for the compilation...") logger.info("Warming up model for the compilation...")
# Only generate graph for the generic shape # Only generate graph for the generic shape
with _set_global_compilation_settings(self.vllm_config): with _set_global_compilation_settings(self.vllm_config):
self._dummy_run( self.profile_run()
min(
max(16, self.max_num_reqs),
self.scheduler_config.max_num_batched_tokens,
)
)
# Warm up drafter for speculative decoding
if self.speculative_config and (self.speculative_config.uses_draft_model()):
from vllm.v1.spec_decode.draft_model import DraftModelProposer
if isinstance(self.drafter, (DraftModelProposer)):
logger.info("Warming up drafter model...")
self.drafter.dummy_run(max(16, self.max_num_reqs))
logger.info("Warming up done.") logger.info("Warming up done.")
def initialize_kv_cache( def initialize_kv_cache(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import os import os
import sys import sys
from typing import Any from typing import Any
import psutil
import torch import torch
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import CpuArchEnum, current_platform from vllm.platforms import CpuArchEnum, current_platform
from vllm.profiler.wrapper import TorchProfilerWrapper from vllm.profiler.wrapper import TorchProfilerWrapper
from vllm.utils.cpu_resource_utils import (
get_allowed_cpu_list,
get_memory_node_info,
get_visible_memory_node,
)
from vllm.utils.mem_utils import format_gib
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.cpu_model_runner import CPUModelRunner
from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
...@@ -27,6 +35,46 @@ class CPUWorker(Worker): ...@@ -27,6 +35,46 @@ class CPUWorker(Worker):
distributed_init_method: str, distributed_init_method: str,
is_driver_worker: bool = False, is_driver_worker: bool = False,
): ):
# TODO: use numactl for process setup
# TODO: optimize for `interleaved` policy
# Bind memory node
allowed_memory_nodes = get_visible_memory_node()
allowed_cpu_list = get_allowed_cpu_list()
cpu_core = allowed_cpu_list[0]
# TODO: some CI hosts are not correctly set, change to assertion
# after fix
if cpu_core.numa_node not in allowed_memory_nodes:
logger.warning(
"Node %s is not in available memory nodes %s.",
cpu_core.numa_node,
allowed_memory_nodes,
)
torch.ops._C.init_cpu_memory_env([cpu_core.numa_node])
memory_status = get_memory_node_info(cpu_core.numa_node)
memory_fraction = vllm_config.cache_config.gpu_memory_utilization
self.requested_cpu_memory = math.ceil(
memory_status.total_memory * memory_fraction
)
available_memory = memory_status.available_memory
if (
vllm_config.cache_config.kv_cache_memory_bytes is None
and self.requested_cpu_memory > available_memory
):
raise ValueError(
f"Available memory on node {cpu_core.numa_node} "
f"({format_gib(available_memory)}/"
f"{format_gib(memory_status.total_memory)} GiB) on startup "
f"is less than desired CPU memory utilization "
f"({vllm_config.cache_config.gpu_memory_utilization}, "
f"{format_gib(self.requested_cpu_memory)} GiB). "
"Decrease --gpu-memory-utilization"
f" or reduce CPU memory used by other processes."
)
super().__init__( super().__init__(
vllm_config, vllm_config,
local_rank, local_rank,
...@@ -103,13 +151,69 @@ class CPUWorker(Worker): ...@@ -103,13 +151,69 @@ class CPUWorker(Worker):
pass pass
def determine_available_memory(self) -> int: def determine_available_memory(self) -> int:
return self.cache_config.cpu_kvcache_space_bytes or 0 self.model_runner.warming_up_model()
allowed_cpu_list = get_allowed_cpu_list()
cpu_core = allowed_cpu_list[0]
memory_status = get_memory_node_info(cpu_core.numa_node)
available_memory = memory_status.available_memory
explicit_kv_cache_size = self.cache_config.kv_cache_memory_bytes
kv_cache_size = None
msg = None
if explicit_kv_cache_size is not None:
if explicit_kv_cache_size > available_memory:
raise ValueError(
f"Available memory on node {cpu_core.numa_node} "
f"({format_gib(available_memory)}/"
f"{format_gib(memory_status.total_memory)} GiB) on kv cache"
f" allocation is less than requested memory for kv "
f"({format_gib(explicit_kv_cache_size)} GiB). "
"Decrease --kv-cache-memory-bytes, VLLM_CPU_KVCACHE_SPACE, "
"or reduce CPU memory used by other processes."
)
kv_cache_size = explicit_kv_cache_size
msg = (
f"Explicitly set ({format_gib(kv_cache_size)}/"
f"{format_gib(memory_status.total_memory)}) GiB for KV cache "
f"on node {cpu_core.numa_node}."
)
else:
consumed_memory = psutil.Process(os.getpid()).memory_info().rss
requested_memory_for_kv = int(self.requested_cpu_memory - consumed_memory)
if (
requested_memory_for_kv <= 0
or requested_memory_for_kv > available_memory
):
raise ValueError(
f"Available memory on node {cpu_core.numa_node} "
f"({format_gib(available_memory)}/"
f"{format_gib(memory_status.total_memory)} GiB) on kv cache"
f" allocation is less than requested memory for kv "
f"({format_gib(requested_memory_for_kv)}/"
f"{format_gib(self.requested_cpu_memory)} GiB). "
"Reduce CPU memory used by other processes."
)
kv_cache_size = requested_memory_for_kv
msg = (
f"Auto set ({format_gib(kv_cache_size)}/"
f"{format_gib(memory_status.total_memory)}) GiB for KV cache "
f"on node {cpu_core.numa_node}, with "
f"{format_gib(self.requested_cpu_memory)} GiB requested memory"
f" for the worker. {format_gib(consumed_memory)} GiB"
f" memory was consumed by non-kv usages."
)
logger.info(msg)
return kv_cache_size
def compile_or_warm_up_model(self) -> CompilationTimes: def compile_or_warm_up_model(self) -> CompilationTimes:
# Reset the seed to ensure that the random state is not affected by # Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling. # the model initialization and profiling.
set_random_seed(self.model_config.seed) set_random_seed(self.model_config.seed)
self.model_runner.warming_up_model() # Note: the model has been compiled in determine_available_memory()
return CompilationTimes( return CompilationTimes(
language_model=self.compilation_config.compilation_time, language_model=self.compilation_config.compilation_time,
encoder=self.compilation_config.encoder_compilation_time, encoder=self.compilation_config.encoder_compilation_time,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment