[CPU] Refactor CPU affinity and memory management (#39781)

Signed-off-by: jiang1.li <jiang1.li@intel.com>

[CPU] Refactor CPU affinity and memory management (#39781)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
d02421a7 · Li, Jiang · GitHub · b1dc87a0 · d02421a7 · d02421a7
Unverified Commit d02421a7 authored Apr 17, 2026 by Li, Jiang Committed by GitHub Apr 17, 2026
14 changed files
--- a/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
@@ -23,22 +23,22 @@ if [ "$failed_req" -ne 0 ]; then
  exit 1
 fi
-#echo "--- DP+TP"
+echo "--- DP+TP"
-#vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
+vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 --max-model-len=4096 &
-#server_pid=$!
+server_pid=$!
-#timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
+timeout 600 bash -c "until curl localhost:8000/v1/models > /dev/null 2>&1; do sleep 1; done" || exit 1
-#vllm bench serve \
+vllm bench serve \
-#    --backend vllm \
+   --backend vllm \
-#    --dataset-name random \
+   --dataset-name random \
-#    --model meta-llama/Llama-3.2-3B-Instruct \
+   --model meta-llama/Llama-3.2-3B-Instruct \
-#    --num-prompts 20 \
+   --num-prompts 20 \
-#    --result-dir ./test_results \
+   --result-dir ./test_results \
-#    --result-filename dp_pp.json \
+   --result-filename dp_pp.json \
-#    --save-result \
+   --save-result \
-#    --endpoint /v1/completions
+   --endpoint /v1/completions
-#kill -s SIGTERM $server_pid; wait $server_pid || true
+kill -s SIGTERM $server_pid; wait $server_pid || true
-#failed_req=$(jq '.failed' ./test_results/dp_pp.json)
+failed_req=$(jq '.failed' ./test_results/dp_pp.json)
-#if [ "$failed_req" -ne 0 ]; then
+if [ "$failed_req" -ne 0 ]; then
-#  echo "Some requests were failed!"
+ echo "Some requests were failed!"
-#  exit 1
+ exit 1
-#fi
+fi
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
@@ -45,6 +45,7 @@ jobs:
      - name: Smoke test vllm serve
        run: |
          # Start server in background
+          VLLM_CPU_KVCACHE_SPACE=1 \
          vllm serve Qwen/Qwen3-0.6B \
            --max-model-len=2K \
            --load-format=dummy \

--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -30,6 +30,21 @@ else()
    list(APPEND CXX_COMPILE_FLAGS
        "-fopenmp"
        "-DVLLM_CPU_EXTENSION")
+    # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+    # and create a local shim dir with it
+    vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+    find_library(OPEN_MP
+        NAMES gomp
+        PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
+        NO_DEFAULT_PATH
+        REQUIRED
+    )
+    # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
+    if (OPEN_MP)
+        set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
+    endif()
 endif()
 if (NOT MACOSX_FOUND)
@@ -175,20 +190,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND
        if(NOT NPROC)
            set(NPROC 4)
        endif()
-        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
-        # and create a local shim dir with it
-        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
-        find_library(OPEN_MP
-            NAMES gomp
-            PATHS ${VLLM_TORCH_GOMP_SHIM_DIR}
-            NO_DEFAULT_PATH
-            REQUIRED
-        )
-        # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch
-        if (OPEN_MP)
-            set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}")
-        endif()
        # Fetch and populate ACL
        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")

--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -141,6 +141,8 @@ void compute_slot_mapping_kernel_impl(const torch::Tensor query_start_loc,
                                      torch::Tensor slot_mapping,
                                      const int64_t block_size);
+void init_cpu_memory_env(std::vector<int64_t> node_ids);
 namespace cpu_utils {
 void eagle_prepare_inputs_padded_kernel_impl(
    const torch::Tensor& cu_num_draft_tokens,
@@ -431,6 +433,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "block_size) -> ()",
      &compute_slot_mapping_kernel_impl);
+  ops.def("init_cpu_memory_env(SymInt[] node_ids) -> ()", &init_cpu_memory_env);
  // Speculative decoding kernels
  ops.def(
      "eagle_prepare_inputs_padded_kernel_impl(Tensor cu_num_draft_tokens, "

--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -13,13 +13,80 @@
 #include "cpu/utils.hpp"
 #ifdef VLLM_NUMA_DISABLED
-std::string init_cpu_threads_env(const std::string& cpu_ids) {
+void init_cpu_memory_env(std::vector<int64_t> node_ids) {}
-  return std::string(
+#else
-      "Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
+void init_cpu_memory_env(std::vector<int64_t> node_ids) {
-      "no effect to setup thread affinity.");
+  // Memory node binding
-}
+  if (numa_available() != -1) {
+    // Concatenate all node_ids into a single comma-separated string
+    if (!node_ids.empty()) {
+      std::string node_ids_str;
+      for (const int node_id : node_ids) {
+        if (!node_ids_str.empty()) {
+          node_ids_str += ",";
+        }
+        node_ids_str += std::to_string(node_id);
+      }
-#endif
+      bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
+      bitmask* src_mask = numa_get_mems_allowed();
+      int pid = getpid();
+      if (mask && src_mask) {
+        // move all existing pages to the specified numa node.
+        *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
+        int page_num = numa_migrate_pages(pid, src_mask, mask);
+        if (page_num == -1) {
+          TORCH_WARN("numa_migrate_pages failed. errno: " +
+                     std::to_string(errno));
+        }
+        // Restrict memory allocation to the selected NUMA node(s).
+        // Enhances memory locality for the threads bound to those NUMA CPUs.
+        if (node_ids.size() > 1) {
+          errno = 0;
+          numa_set_interleave_mask(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_interleave_mask failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using INTERLEAVE policy for memory "
+                "allocation across multiple NUMA nodes (nodes: " +
+                node_ids_str +
+                "). Memory allocations will be "
+                "interleaved across the specified NUMA nodes.");
+          }
+        } else {
+          errno = 0;
+          numa_set_membind(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_membind failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using MEMBIND policy for memory "
+                "allocation on the NUMA nodes (" +
+                node_ids_str +
+                "). Memory allocations will be "
+                "strictly bound to these NUMA nodes.");
+          }
+        }
+        numa_set_strict(1);
+        numa_free_nodemask(mask);
+        numa_free_nodemask(src_mask);
+      } else {
+        TORCH_WARN(
+            "numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
+            std::to_string(errno));
+      }
+    }
+  }
+}
+#endif  // VLLM_NUMA_DISABLED
 namespace cpu_utils {
 ScratchPadManager::ScratchPadManager() : size_(0), ptr_(nullptr) {

--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -173,7 +173,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 COPY --from=vllm-test-deps /vllm-workspace/requirements/test/cpu.txt requirements/test/cpu.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/dev.txt && \
+    uv pip install -r requirements/lint.txt && \
+    uv pip install -r requirements/test/cpu.txt && \
    pre-commit install --hook-type pre-commit --hook-type commit-msg
 ENTRYPOINT ["bash"]

--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -46,7 +46,7 @@ AITER_MODEL_LIST = [
        ),
        pytest.param(
            "openai-community/gpt2",  # gpt2
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
        ),
        pytest.param("Milos/slovak-gpt-j-405M"),  # gptj
        pytest.param("bigcode/tiny_starcoder_py"),  # gpt_bigcode
@@ -143,11 +143,6 @@ def test_models(
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")
-    if current_platform.is_cpu() and model in ("openai-community/gpt2",):
-        # These models are sensitive to the rounding error
-        # Fuse ops to reduce rounding
-        monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs

--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -101,8 +101,6 @@ class CacheConfig:
    kv_cache_dtype_skip_layers: list[str] = field(default_factory=list)
    """Layer patterns to skip KV cache quantization. Accepts layer indices
    (e.g., '0', '2', '4') or attention type names (e.g., 'sliding_window')."""
-    cpu_kvcache_space_bytes: int | None = None
-    """(CPU backend only) CPU key-value cache space."""
    mamba_page_size_padded: int | None = None
    """ Optional override for mamba page size; used by hybrid mamba/attention
    models to ensure exact alignment with attention page size."""
@@ -183,7 +181,6 @@ class CacheConfig:
            "num_gpu_blocks_override",
            "enable_prefix_caching",
            "prefix_caching_hash_algo",
-            "cpu_kvcache_space_bytes",
            "mamba_page_size_padded",
            "user_specified_block_size",
            "user_specified_mamba_block_size",

--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -6,15 +6,16 @@ import os
 import platform
 import subprocess
 import sys
-from dataclasses import dataclass
 from typing import TYPE_CHECKING
-import psutil
 import torch
-from vllm import envs
 from vllm.logger import init_logger
-from vllm.utils.ompmultiprocessing import OMPProcessManager
+from vllm.utils.cpu_resource_utils import (
+    DEVICE_CONTROL_ENV_VAR,
+    get_memory_node_info,
+)
+from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.torch_utils import is_quantized_kv_cache
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
@@ -38,49 +39,13 @@ def get_max_threads(pid=0):
        raise NotImplementedError("Unsupported OS")
-@dataclass
-class LogicalCPUInfo:
-    id: int = -1
-    physical_core: int = -1
-    numa_node: int = -1
-    @classmethod
-    def _int(cls, value: str) -> int:
-        try:
-            int_value = int(value)
-        except Exception:
-            int_value = -1
-        return int_value
-    @staticmethod
-    def json_decoder(obj_dict: dict):
-        id = obj_dict.get("cpu")
-        physical_core = obj_dict.get("core")
-        numa_node = obj_dict.get("node")
-        if not (id is None or physical_core is None or numa_node is None):
-            return LogicalCPUInfo(
-                id=LogicalCPUInfo._int(id),
-                physical_core=LogicalCPUInfo._int(physical_core),
-                numa_node=LogicalCPUInfo._int(numa_node),
-            )
-        else:
-            return obj_dict
 class CpuPlatform(Platform):
    _enum = PlatformEnum.CPU
    device_name: str = "cpu"
    device_type: str = "cpu"
    dispatch_key: str = "CPU"
    dist_backend: str = "gloo"
-    device_control_env_var = "CPU_VISIBLE_MEMORY_NODES"
+    device_control_env_var = DEVICE_CONTROL_ENV_VAR
-    omp_process_manager = None
-    # Simultaneous Multithreading (SMT) level for OpenMP:
-    # 4 on PowerPC, 1 on non-PowerPC architectures
-    smt = 1
-    global_cpu_mask = None
-    simulate_numa = int(os.environ.get("_SIM_MULTI_NUMA", 0))
    @property
    def supported_dtypes(self) -> list[torch.dtype]:
@@ -123,29 +88,9 @@ class CpuPlatform(Platform):
    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
-        from vllm.utils.mem_constants import GiB_bytes
+        meminfo = get_memory_node_info(device_id)
-        from vllm.utils.mem_utils import format_gib
-        kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
-        node_dir = "/sys/devices/system/node"
-        if kv_cache_space is None:
-            nodes = (
-                [d for d in os.listdir(node_dir) if d.startswith("node")]
-                if os.path.exists(node_dir)
-                else []
-            )
-            num_numa_nodes = len(nodes) or 1
-            free_cpu_memory = psutil.virtual_memory().total // num_numa_nodes
-            DEFAULT_CPU_MEM_UTILIZATION = 0.5
-            kv_cache_space = int(free_cpu_memory * DEFAULT_CPU_MEM_UTILIZATION)
-            logger.warning_once(
-                "VLLM_CPU_KVCACHE_SPACE not set. Using %s GiB for KV cache.",
-                format_gib(kv_cache_space),
-            )
-        else:
-            kv_cache_space *= GiB_bytes
-        return kv_cache_space
+        return meminfo.total_memory
    @classmethod
    def set_device(cls, device: torch.device) -> None:
@@ -180,6 +125,12 @@ class CpuPlatform(Platform):
                "otherwise the performance is not optimized."
            )
+        # Lagecy setting
+        env_key = "VLLM_CPU_KVCACHE_SPACE"
+        if env_key in os.environ and os.environ[env_key] != "":
+            kv_cache_space = int(os.environ[env_key])
+            cache_config.kv_cache_memory_bytes = kv_cache_space * GiB_bytes
        scheduler_config = vllm_config.scheduler_config
        # async scheduling is not required on CPU
        scheduler_config.async_scheduling = False
@@ -198,8 +149,6 @@ class CpuPlatform(Platform):
            )
            cache_config.cache_dtype = "auto"
-        cache_config.cpu_kvcache_space_bytes = CpuPlatform.get_device_total_memory()
        parallel_config = vllm_config.parallel_config
        # OMP requires the MP executor to function correctly, UniProc is not
        # supported as it is not possible to set the OMP environment correctly
@@ -278,21 +227,45 @@ class CpuPlatform(Platform):
        os.environ["TORCHINDUCTOR_CPP_DYNAMIC_THREADS"] = "1"
        ld_preload_str = os.getenv("LD_PRELOAD", "")
-        # Intel and CLANG OpenMP setting
-        if "libiomp5.so" in ld_preload_str or "libomp5" in ld_preload_str:
-            # The time(milliseconds) that a thread should wait after
-            # completing the execution of a parallel region, before sleeping.
-            os.environ["KMP_BLOCKTIME"] = "1"
-            # Prevents the CPU to run into low performance state
-            os.environ["KMP_TPAUSE"] = "0"
-            # Provides fine granularity parallelism
-            os.environ["KMP_FORKJOIN_BARRIER_PATTERN"] = "dist,dist"
-            os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
-            os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
        cpu_architecture = Platform.get_cpu_architecture()
+        if (
+            platform.system() == "Linux"
+            and cpu_architecture
+            in (CpuArchEnum.ARM, CpuArchEnum.POWERPC, CpuArchEnum.X86)
+            and not (
+                "libomp" in ld_preload_str
+                or "libgomp" in ld_preload_str
+                or "libiomp" in ld_preload_str
+            )
+        ):
+            # We need to LD_PRELOAD PyTorch's libgomp, otherwise only
+            # one core will be properly utilized when we thread-bind
+            # See: https://github.com/vllm-project/vllm/issues/27369
+            # TODO: Remove once:
+            # https://github.com/pytorch/pytorch/issues/166087 is fixed
+            # We need to find the location of PyTorch's libgomp
+            torch_pkg = os.path.dirname(torch.__file__)
+            site_root = os.path.dirname(torch_pkg)
+            # Search both torch.libs and torch/lib - See:
+            # https://github.com/vllm-project/vllm/issues/30470
+            torch_libs_paths = [
+                os.path.join(site_root, "torch.libs"),
+                os.path.join(torch_pkg, "lib"),
+            ]
+            pytorch_libgomp_so_candidates = []
+            for torch_libs in torch_libs_paths:
+                pytorch_libgomp_so_candidates.extend(
+                    glob.glob(os.path.join(torch_libs, "libgomp*.so*"))
+                )
+            if pytorch_libgomp_so_candidates:
+                pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
+                if ld_preload_str:
+                    ld_preload_str += ":"
+                ld_preload_str += pytorch_libgomp_so
+                os.environ["LD_PRELOAD"] = ld_preload_str
        # LD_PRELOAD libtcmalloc, bundled under vllm/libs to reduce
        # memory allocation overhead
        if (
@@ -331,13 +304,6 @@ class CpuPlatform(Platform):
                vllm_config.model_config.max_model_len,
                vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )
-        # CI specific "quick" NUMA simulation - split all available CPUs
-        # into a fake NUMA topology
-        if os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", None) is not None:
-            os.environ["_SIM_MULTI_NUMA"] = str(
-                vllm_config.parallel_config.world_size
-                * vllm_config.parallel_config._api_process_count
-            )
    @classmethod
    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
@@ -345,78 +311,6 @@ class CpuPlatform(Platform):
        # Move that logic here so block_size is chosen by the backend.
        pass
-    @classmethod
-    def get_omp_manager(cls) -> OMPProcessManager:
-        # initialise the OMP resource management if need be and return the manager
-        if cls.omp_process_manager is None:
-            if cls.get_cpu_architecture() == CpuArchEnum.POWERPC:
-                cls.smt = 4
-            cls.omp_process_manager = OMPProcessManager(
-                affinity=cls.get_global_cpu_mask(), smt=cls.smt
-            )
-            # we need to fix up the topology returned by the OMP Manager for
-            # simulated NUMA environments in CI
-            if cls.simulate_numa > 0:
-                logger.info(
-                    "Adjusting numa topology to resemble at least %d nodes",
-                    int(cls.simulate_numa),
-                )
-                om = cls.omp_process_manager
-                while len(om.omp_places) < cls.simulate_numa:
-                    new_omp_places = []
-                    touched = False
-                    for omp_place in om.omp_places:
-                        if len(omp_place["mask"]) > 1:
-                            touched = True
-                            cpu_list = sorted(list(omp_place["mask"]))
-                            new_omp_places.append(
-                                {
-                                    "mask": set(cpu_list[0 : int(len(cpu_list) / 2)]),
-                                    "available": True,
-                                }
-                            )
-                            new_omp_places.append(
-                                {
-                                    "mask": set(cpu_list[int(len(cpu_list) / 2) :]),
-                                    "available": True,
-                                }
-                            )
-                    if touched:
-                        om.omp_places = new_omp_places
-                    else:
-                        raise ValueError(
-                            "Cannot split the existing NUMA topology to match "
-                            "simulation requirements"
-                        )
-        return cls.omp_process_manager
-    @classmethod
-    def get_global_cpu_mask(cls) -> set[int]:
-        # get global cpu mask
-        if cls.global_cpu_mask is None:
-            if hasattr(os, "sched_getaffinity"):
-                cls.global_cpu_mask = os.sched_getaffinity(0)
-            else:
-                # macOS does not support sched_getaffinity
-                cpu_count = os.cpu_count() or 1
-                cls.global_cpu_mask = set(range(cpu_count))
-        return cls.global_cpu_mask
-    @classmethod
-    def reserve_cpus(cls, reserve: set[int]) -> bool:
-        # remove CPUs from global mask, for now there is no "release" mechanism
-        if cls.omp_process_manager is not None:
-            for place in cls.omp_process_manager.omp_places:
-                if not place["available"]:
-                    return False
-        cls.global_cpu_mask = cls.get_global_cpu_mask() - reserve
-        # reinitialize OMP resource management
-        cls.omp_process_manager = OMPProcessManager(
-            affinity=cls.global_cpu_mask, smt=cls.smt
-        )
-        return True
    @classmethod
    def discover_numa_topology(cls) -> list[list[int]]:
        """

--- a/vllm/utils/cpu_resource_utils.py
+++ b/vllm/utils/cpu_resource_utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import os
+import platform
+import subprocess
+from dataclasses import dataclass
+from functools import cache
+import psutil
+import regex as re
+DEVICE_CONTROL_ENV_VAR = "CPU_VISIBLE_MEMORY_NODES"
+@dataclass
+class LogicalCPUInfo:
+    id: int = -1
+    physical_core: int = -1
+    numa_node: int = -1
+    @classmethod
+    def _int(cls, value: str) -> int:
+        try:
+            int_value = int(value)
+        except Exception:
+            int_value = -1
+        return int_value
+    @staticmethod
+    def json_decoder(obj_dict: dict):
+        id = obj_dict.get("cpu")
+        physical_core = obj_dict.get("core")
+        numa_node = obj_dict.get("node")
+        if not (id is None or physical_core is None or numa_node is None):
+            return LogicalCPUInfo(
+                id=LogicalCPUInfo._int(id),
+                physical_core=LogicalCPUInfo._int(physical_core),
+                numa_node=LogicalCPUInfo._int(numa_node),
+            )
+        else:
+            return obj_dict
+@dataclass
+class MemoryNodeInfo:
+    total_memory: int = -1
+    available_memory: int = -1
+def get_memory_affinity(pid: int = 0) -> list[int]:
+    pid = os.getpid() if pid == 0 else pid
+    path = f"/proc/{pid}/status"
+    with open(path) as f:
+        for line in f:
+            if line.startswith("Mems_allowed_list:"):
+                # Extract the string part (e.g., "0-1,3")
+                raw_list = line.split(":")[1].strip()
+                return parse_id_list(raw_list)
+    return []
+def parse_id_list(raw_str: str) -> list[int]:
+    """Parses strings like '0-2,4,7-8' into [0, 1, 2, 4, 7, 8]"""
+    result: list[int] = []
+    if not raw_str:
+        return result
+    for part in raw_str.split(","):
+        if "-" in part:
+            start, end = map(int, part.split("-"))
+            result.extend(range(start, end + 1))
+        else:
+            result.append(int(part))
+    return sorted(list(set(result)))
+def get_memory_node_info(node_id: int = 0) -> MemoryNodeInfo:
+    if platform.system() == "Darwin":
+        # MacOS has no memory node
+        return MemoryNodeInfo(
+            total_memory=psutil.virtual_memory().total,
+            available_memory=psutil.virtual_memory().available,
+        )
+    meminfo_path = f"/sys/devices/system/node/node{node_id}/meminfo"
+    if not os.path.exists(meminfo_path):
+        raise RuntimeError(f"{meminfo_path} doesn't exit.")
+    meminfo = {}
+    with open(meminfo_path) as f:
+        for line in f:
+            # Each line looks like: "Node 0 MemTotal: 97421888 kB"
+            parts = line.split()
+            key = parts[2].rstrip(":")
+            # convert to Bytes
+            value = int(parts[3]) * 1024
+            meminfo[key] = value
+    total_memory = meminfo["MemTotal"]
+    free_memory = meminfo["MemFree"]
+    active_file_memory = meminfo["Active(file)"]
+    inactive_file_memory = meminfo["Inactive(file)"]
+    reclaimable_memory = meminfo["SReclaimable"]
+    available_memory = (
+        free_memory + active_file_memory + inactive_file_memory + reclaimable_memory
+    )
+    return MemoryNodeInfo(
+        total_memory=total_memory,
+        available_memory=available_memory,
+    )
+def get_allowed_cpu_list() -> list[LogicalCPUInfo]:
+    cpu_list = _get_cpu_list()
+    if platform.system() == "Darwin":
+        return cpu_list
+    global_allowed_cpu_id_list = os.sched_getaffinity(0)
+    logical_cpu_list = [x for x in cpu_list if x.id in global_allowed_cpu_id_list]
+    return logical_cpu_list
+def get_visible_memory_node() -> list[int]:
+    if platform.system() == "Darwin":
+        return [0]
+    allowed_memory_node_list = get_memory_affinity()
+    env_key = DEVICE_CONTROL_ENV_VAR
+    if (
+        ("VLLM_CPU_SIM_MULTI_NUMA" not in os.environ)
+        and env_key in os.environ
+        and os.environ[env_key] != ""
+    ):
+        visible_nodes = [int(s) for s in os.environ[env_key].split(",")]
+        visible_nodes = [
+            node for node in visible_nodes if node in allowed_memory_node_list
+        ]
+        return visible_nodes
+    return allowed_memory_node_list
+@cache
+def _get_cpu_list() -> list[LogicalCPUInfo]:
+    if platform.system() == "Darwin":
+        # For MacOS, no user-level CPU affinity and SMT, return all CPUs
+        cpu_count = os.cpu_count()
+        assert cpu_count
+        return [LogicalCPUInfo(i, i, 0) for i in range(cpu_count)]
+    lscpu_output = subprocess.check_output(
+        "lscpu -J -e=CPU,CORE,NODE", shell=True, text=True
+    )
+    # For platform without NUMA, replace '-' to '0'
+    lscpu_output = re.sub(r'"node":\s*-\s*(,|\n)', r'"node": 0\1', lscpu_output)
+    logical_cpu_list: list[LogicalCPUInfo] = json.loads(
+        lscpu_output, object_hook=LogicalCPUInfo.json_decoder
+    )["cpus"]
+    # Filter CPUs with invalid attributes
+    logical_cpu_list = [
+        x for x in logical_cpu_list if -1 not in (x.id, x.physical_core, x.numa_node)
+    ]
+    return logical_cpu_list
--- a/vllm/utils/ompmultiprocessing.py
+++ b/vllm/utils/ompmultiprocessing.py
@@ -5,196 +5,280 @@ Copyright (c) 2026 Red Hat Inc
 Copyright (c) 2026 Cambridge Greys Ltd
 """
-import json
 import os
-import platform
+from collections.abc import Callable
-import subprocess
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
-def _int(arg):
+import vllm.utils.cpu_resource_utils as cr_utils
-    """Relaxed parsing of ints which handles a - instead of a number.
+from vllm import envs
-    The lscpu json may contain that for nodes in some cases. If that
+from vllm.logger import init_logger
-    is the case we parse it to zero
+from vllm.platforms import CpuArchEnum, current_platform
-    """
+from vllm.utils.cpu_resource_utils import LogicalCPUInfo
-    try:
-        if int(arg) >= 0:
+if TYPE_CHECKING:
-            return int(arg)
+    from vllm.config import VllmConfig
-    except ValueError:
-        pass
+logger = init_logger(__name__)
-    return 0
-def parse_mask(mask):
-    """Expand a X-Y,Z list"""
-    result = []
-    for token in mask.split(","):
-        try:
-            start, finish = token.split("-")
-            if int(start) > int(finish):
-                raise IndexError("Invalid Indexes for cpu ranges")
-            for cpu in range(int(start), int(finish) + 1):
-                result.append(cpu)
-        except ValueError:
-            result.append(int(token))
-    return set(result)
-def _get_default_affinity() -> set[int]:
-    """Get the set of CPUs the process is allowed to run on."""
-    if hasattr(os, "sched_getaffinity"):
-        return os.sched_getaffinity(0)
-    # macOS does not support sched_getaffinity; fall back to cpu_count
-    cpu_count = os.cpu_count() or 1
-    return set(range(cpu_count))
-def _get_cpu_topology_json() -> bytes:
-    """Get CPU topology as JSON.
-    On Linux this uses ``lscpu -Je``.  On other platforms (e.g. macOS) we
-    synthesize a simple topology where every logical CPU is its own core
-    on NUMA node 0, which is sufficient for the OMP place-list builder.
-    """
-    if platform.system() == "Linux":
-        return subprocess.run(["lscpu", "-Je"], check=True, capture_output=True).stdout
-    # Fallback for non-Linux (macOS, etc.)
-    cpu_count = os.cpu_count() or 1
-    cpus = []
-    for i in range(cpu_count):
-        cpus.append({"cpu": str(i), "core": str(i), "node": "0"})
-    return json.dumps({"cpus": cpus}).encode()
-def enumerate_resources(resource_map, mask=None, allowed=None):
-    """Enumerate system resources"""
-    if allowed is None:
-        allowed = _get_default_affinity()
-    if mask is not None:
-        allowed = allowed & mask
-    try:
-        allowed_nodes = parse_mask(os.environ["CPU_VISIBLE_MEMORY_NODES"])
-    except KeyError:
-        allowed_nodes = None
-    lscpu: dict[str, dict] = {"cpus": {}, "cores": {}, "nodes": {}}
-    for cpu in resource_map["cpus"]:
-        cpunum = int(cpu["cpu"])
-        if (
-            cpunum in allowed
-            and cpunum >= 0
-            and (allowed_nodes is None or _int(cpu["node"]) in allowed_nodes)
-        ):
-            lscpu["cpus"][cpunum] = [cpu]
-            core = _int(cpu["core"])
-            if lscpu["cores"].get(core, None) is None:
-                lscpu["cores"][core] = [cpu]
-            else:
-                lscpu["cores"][core].append(cpu)
-            node = _int(cpu["node"])
-            if lscpu["nodes"].get(node, None) is None:
-                lscpu["nodes"][node] = [cpu]
-            else:
-                lscpu["nodes"][node].append(cpu)
-    return lscpu
-def produce_cpu_list(cpus, smt=1):
-    """Produce a CPU list with/without SMT pairs - main cpu list case"""
-    mask: list[int] = []
-    for key, value in cpus.items():
-        exists = 0
-        for cpu in mask:
-            if cpu == value[0]["core"]:
-                exists += 1
-                break
-        if exists < smt:
-            mask.append(int(key))
-    return {"mask": set(mask), "available": True}
-def produce_cpu_sublist(scpus, smt=1):
-    """Produce a CPU list with/without SMT pairs - resource leaf case"""
-    cpu_list: list[dict] = []
-    for value in scpus:
-        exists = 0
-        for cpu in cpu_list:
-            if int(cpu["core"]) == int(value["core"]):
-                exists += 1
-                break
-        if exists < smt:
-            cpu_list.append(value)
-    mask = []
-    for cpu in cpu_list:
-        mask.append(int(cpu["cpu"]))
-    return {"mask": set(mask), "available": True}
-def create_omp_places(resources, strategy, smt=True):
-    """Parse CPU topology and generate possible CPU masks"""
-    omp_places = []
-    if strategy == "all":
-        omp_places.append(produce_cpu_list(resources["cpus"], smt))
-    elif strategy == "cores":
-        for value in resources["cores"].values():
-            omp_places.append(produce_cpu_sublist(value, smt))
-    elif strategy == "nodes":
-        for value in resources["nodes"].values():
-            omp_places.append(produce_cpu_sublist(value, smt))
-    else:
-        raise NotImplementedError("Unknown strategy")
-    return omp_places
-# pylint: disable=too-few-public-methods
 class OMPProcessManager:
-    """OMP aware wrapper to run mp Process()"""
+    def __init__(self, config: "VllmConfig"):
+        if not current_platform.is_cpu():
-    def __init__(self, strategy="nodes", smt=1, mock=None, affinity=None):
+            return
-        self.strategy = strategy
-        self.smt = smt
+        self.local_world_size = config.parallel_config.local_world_size
-        self.omp_places = []
+        self.local_dp_rank = config.parallel_config.data_parallel_rank_local
-        vllm_mask = os.environ.get("VLLM_CPU_OMP_THREADS_BIND", None)
+        # This is a bit tricky because the internal DP size
-        self.setup_omp = vllm_mask != "nobind"
+        # is always 1 for non-MoE models
-        if self.setup_omp:
+        self.internal_dp_size = config.parallel_config._api_process_count
-            omp_places = []
-            if vllm_mask is not None:
+        self.simulate_multi_node = os.environ.get("VLLM_CPU_SIM_MULTI_NUMA", "0") != "0"
-                masks = []
+        ld_preload_str = os.getenv("LD_PRELOAD", "")
-                for spec in vllm_mask.split("|"):
+        self.use_iomp = "libiomp" in ld_preload_str or "libomp" in ld_preload_str
-                    masks.append(parse_mask(spec))
+        self.use_gomp = "libgomp" in ld_preload_str
+        assert not (self.use_iomp and self.use_gomp)
+        # at least reserve 1/local_world_size(for ARM) core for scheduler
+        # proc as always use MP executor
+        # TODO: make scheduler proc sleep when idle
+        self.reserve_cpu_num = (
+            self.local_world_size
+            if current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+            else 1
+        )
+        # reserve at one more core for nixl_connector under p/d case
+        if config.kv_transfer_config:
+            self.reserve_cpu_num += 1
+        if envs.VLLM_CPU_NUM_OF_RESERVED_CPU is not None:
+            if self.reserve_cpu_num > envs.VLLM_CPU_NUM_OF_RESERVED_CPU:
+                msg = (
+                    f"VLLM_CPU_NUM_OF_RESERVED_CPU is less than "
+                    "the minimum requirement"
+                    f": {self.reserve_cpu_num} cores"
+                )
+                logger.warning(msg=msg)
+            self.reserve_cpu_num = envs.VLLM_CPU_NUM_OF_RESERVED_CPU
+        self._parse_omp_threads_bind_env()
+        assert not self.simulate_multi_node or self.auto_setup
+    @contextmanager
+    def configure_omp_envs(self, rank: int, local_rank: int):
+        if not current_platform.is_cpu() or self.skip_setup:
+            yield
+            return
+        envs_dict = {}
+        cpu_list = [str(i) for i in self.cpu_lists[local_rank]]
+        envs_dict["OMP_NUM_THREADS"] = str(len(cpu_list))
+        if self.use_iomp:
+            # set IOMP envs
+            cpu_list_str = ",".join(cpu_list)
+            envs_dict["KMP_AFFINITY"] = (
+                f"granularity=fine,explicit,proclist=[{cpu_list_str}]"
+            )
+            # The time(milliseconds) that a thread should wait after
+            # completing the execution of a parallel region, before sleeping.
+            envs_dict["KMP_BLOCKTIME"] = "1"
+            # Prevents the CPU to run into low performance state
+            envs_dict["KMP_TPAUSE"] = "0"
+            # Provides fine granularity parallelism
+            envs_dict["KMP_FORKJOIN_BARRIER_PATTERN"] = "dist,dist"
+            envs_dict["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
+            envs_dict["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
+        elif self.use_gomp:
+            # set GOMP envs
+            # likes '0 1 2 ...'
+            cpu_list_str = " ".join(cpu_list)
+            envs_dict["GOMP_CPU_AFFINITY"] = cpu_list_str
+        else:
+            # set OMP envs
+            # likes '{0,1,2,...}'
+            cpu_list_str = ",".join(cpu_list)
+            envs_dict["OMP_PLACES"] = f"{{{cpu_list_str}}}"
+            envs_dict["OMP_PROC_BIND"] = "true"
+        # backup envs
+        old_envs_dict = {}
+        for k in envs_dict:
+            old_envs_dict[k] = os.environ.get(k)
+        try:
+            # set envs
+            for k, v in envs_dict.items():
+                os.environ[k] = v
+            yield
+        finally:
+            # restore old envs
+            for k, v in old_envs_dict.items():  # type: ignore
+                if v is None:
+                    os.environ.pop(k, None)
+                else:
+                    os.environ[k] = v
+    def _parse_omp_threads_bind_env(self):
+        vllm_mask = envs.VLLM_CPU_OMP_THREADS_BIND
+        self.skip_setup = vllm_mask == "nobind"
+        self.auto_setup = vllm_mask == "auto"
+        self.reserved_cpu_list = []
+        self.cpu_lists = []
+        if self.auto_setup:
+            # auto generate CPU lists
+            cpu_arch = current_platform.get_cpu_architecture()
+            if cpu_arch == CpuArchEnum.POWERPC:
+                # For POWERPC SMT-8/4/2
+                cpu_list, reserve_list = self._get_autobind_cpu_ids(
+                    lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4]
+                )
+            elif cpu_arch in (CpuArchEnum.X86, CpuArchEnum.S390X):
+                # For x86/S390X SMT-2, use 1 logical CPU per physical core
+                cpu_list, reserve_list = self._get_autobind_cpu_ids(
+                    lambda cpus: cpus[-1:]
+                )
+            elif cpu_arch == CpuArchEnum.ARM:
+                # For AArch64, no SMT, use all logical CPU
+                cpu_list, reserve_list = self._get_autobind_cpu_ids(lambda cpus: cpus)
            else:
-                masks = [None]
+                cpu_list, reserve_list = [], []
-            if mock is None:
+                raise RuntimeError(f"{cpu_arch} doesn't support auto CPU binding.")
-                data = _get_cpu_topology_json()
+            for item in cpu_list:
+                self.cpu_lists.append([x.id for x in item])
+            self.reserved_cpu_list = [x.id for x in reserve_list]
+        elif not self.skip_setup:
+            # user defined CPU lists
+            omp_cpuids_list = vllm_mask.split("|")
+            if self.local_dp_rank is not None:
+                local_dp_rank = self.local_dp_rank
+                world_size = self.local_world_size
+                # Rank mapping [DP, PP, TP]
+                omp_cpuids_list = omp_cpuids_list[
+                    local_dp_rank * world_size : (local_dp_rank + 1) * world_size
+                ]
+            assert len(omp_cpuids_list) == self.local_world_size, (
+                "Given "
+                f"number of CPU id list {omp_cpuids_list} doesn't match "
+                f"local world size {self.local_world_size}."
+            )
+            # parse CPU list strings like "5,2-4" to [5, 2, 3, 4]
+            self.cpu_lists = [cr_utils.parse_id_list(s) for s in omp_cpuids_list]
+        else:
+            # skip
+            self.cpu_lists = []
+        msg = "OpenMP thread binding info: \n"
+        for i in range(self.local_world_size):
+            msg += f"\tlocal_rank={i}, core ids={self.cpu_lists[i]}\n"
+        msg += f"\treserved_cpus={self.reserved_cpu_list}"
+        logger.info(msg)
+    def _get_autobind_cpu_ids(
+        self, cpu_selector: Callable[[list[LogicalCPUInfo]], list[LogicalCPUInfo]]
+    ) -> tuple[list[list[LogicalCPUInfo]], list[LogicalCPUInfo]]:
+        """
+        Return CPU ids to bind based on NUMA nodes, and CPU ids reserved for
+        other processes.
+        Currently for rank N, only CPU ids on the N-th node in available NUMA
+        node list will be selected.
+        Args:
+            cpu_selector: a callable object to select CPUs from a CPU list
+            of a physical core. The input is a LogicalCPUInfo list contains
+            logical CPUs of a physical CPU, sorted by the LogicalCPUInfo.id.
+            A selected LogicalCPUInfo list should be returned.
+        """
+        # this memory node list has been sliced for DP offset
+        allowed_numa_nodes = cr_utils.get_visible_memory_node()
+        logical_cpu_list = cr_utils.get_allowed_cpu_list()
+        local_world_size = self.local_world_size
+        assert (
+            len(allowed_numa_nodes) >= local_world_size or self.simulate_multi_node
+        ), (
+            f"Not enough allowed NUMA nodes to bind threads of "
+            f"{local_world_size} local CPUWorkers. "
+            f"Allowed NUMA nodes are {allowed_numa_nodes}. "
+            "Please try to bind threads manually or decrease DP/TP/PP."
+        )
+        # Generate OMP CPU list for each rank
+        cpu_lists_of_ranks = []
+        reserved_cpu_list = []
+        total_cpu_num = 0
+        for local_rank in range(self.local_world_size):
+            if not self.simulate_multi_node:
+                selected_numa_node = allowed_numa_nodes[local_rank]
+                selected_logical_cpu_list = [
+                    x for x in logical_cpu_list if x.numa_node == selected_numa_node
+                ]
            else:
-                with open(mock, mode="rb") as jf:
+                world_size_across_dp = self.local_world_size * self.internal_dp_size
-                    data = jf.read()
+                assert len(logical_cpu_list) >= world_size_across_dp
-            lscpu = json.loads(data)
+                selected_logical_cpu_list = sorted(
-            for mask in masks:
+                    logical_cpu_list, key=lambda x: x.numa_node
-                resources = enumerate_resources(lscpu, mask, affinity)
+                )
-                omp_places.extend(create_omp_places(resources, strategy, smt))
+                sim_cpu_num_per_node = (
-            self.omp_places = sorted(
+                    len(selected_logical_cpu_list) // world_size_across_dp
-                omp_places,
+                )
-                key=lambda p: "{:04d}-{:04d}".format(len(p["mask"]), max(p["mask"])),
+                assert self.local_dp_rank is not None
-                reverse=True,
+                start_idx = (
+                    local_rank + self.local_world_size * self.local_dp_rank
+                ) * sim_cpu_num_per_node
+                selected_logical_cpu_list = selected_logical_cpu_list[
+                    start_idx : (start_idx + sim_cpu_num_per_node)
+                ]
+            # Select logical CPUs on same physical cores via cpu_selector
+            core_to_cpus: dict[int, list[LogicalCPUInfo]] = {}
+            for cpu_info in selected_logical_cpu_list:
+                if cpu_info.physical_core not in core_to_cpus:
+                    core_to_cpus[cpu_info.physical_core] = []
+                core_to_cpus[cpu_info.physical_core].append(cpu_info)
+            selected_logical_cpu_list = []
+            for cpu_list in core_to_cpus.values():
+                cpu_list = sorted(cpu_list, key=lambda x: x.id)
+                selected_logical_cpu_list.extend(cpu_selector(cpu_list))
+            # sort selected cores based on core id
+            selected_logical_cpu_list = sorted(
+                selected_logical_cpu_list, key=lambda x: x.id
            )
-    def run(self, what, *args, **kwargs):
+            cpu_lists_of_ranks.append(selected_logical_cpu_list)
-        """Run arg with correct OMP environment"""
+            total_cpu_num += len(selected_logical_cpu_list)
-        if self.setup_omp:
-            for place in self.omp_places:
+        # Reserve CPUs for other processes
-                if place["available"]:
+        if total_cpu_num <= self.reserve_cpu_num:
-                    reserve = int(os.environ.get("VLLM_CPU_NUM_OF_RESERVED_CPU", 0))
+            logger.warning(
-                    place["available"] = False
+                "Selected CPU core number (%s) "
-                    # pylint: disable=consider-using-f-string
+                "should be greater than reserved CPU core "
-                    os.environ["OMP_PLACES"] = "{}".format(place["mask"])
+                "number (%s).",
-                    os.environ["OMP_NUM_THREADS"] = "{}".format(
+                total_cpu_num,
-                        len(place["mask"]) - reserve
+                self.reserve_cpu_num,
-                    )
+            )
-                    os.environ["OMP_PROC_BIND"] = "TRUE"
+            return cpu_lists_of_ranks, []
-                    return what(*args, **kwargs)
-            raise IndexError("Out of OMP places")
+        reserve_num_per_rank = [
-        return what(*args, **kwargs)
+            self.reserve_cpu_num // self.local_world_size
+        ] * self.local_world_size
+        # last rank first
+        for i in range(
+            self.local_world_size - 1,
+            self.local_world_size - 1 - self.reserve_cpu_num % self.local_world_size,
+            -1,
+        ):
+            reserve_num_per_rank[i] += 1
+        for i in range(self.local_world_size):
+            num = reserve_num_per_rank[i]
+            if num > 0:
+                reserved_cpu_list.extend(cpu_lists_of_ranks[i][-num:])
+                cpu_lists_of_ranks[i] = cpu_lists_of_ranks[i][:-num]
+        return cpu_lists_of_ranks, reserved_cpu_list
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -51,6 +51,7 @@ from vllm.utils.network_utils import (
    get_loopback_ip,
    get_open_port,
 )
+from vllm.utils.ompmultiprocessing import OMPProcessManager
 from vllm.utils.system_utils import (
    _maybe_force_spawn,
    decorate_logs,
@@ -169,24 +170,14 @@ class MultiprocExecutor(Executor):
                [] if context.get_start_method() == "fork" else None
            )
+            # For CPU backend only, to setup OpenMP threads affinity
+            cpu_omp_manager = OMPProcessManager(self.vllm_config)
            for local_rank in range(self.local_world_size):
                global_rank = global_start_rank + local_rank
                is_driver_worker = self._is_driver_worker(global_rank)
-                if current_platform.is_cpu():
+                with cpu_omp_manager.configure_omp_envs(
-                    om = current_platform.get_omp_manager()
+                    rank=global_rank, local_rank=local_rank
-                    logger.info("Configured OMP PLACES %s", str(om.omp_places))
+                ):
-                    unready_worker_handle = om.run(
-                        WorkerProc.make_worker_process,
-                        vllm_config=self.vllm_config,
-                        local_rank=local_rank,
-                        rank=global_rank,
-                        distributed_init_method=distributed_init_method,
-                        input_shm_handle=scheduler_output_handle,
-                        shared_worker_lock=shared_worker_lock,
-                        is_driver_worker=is_driver_worker,
-                        inherited_fds=inherited_fds,
-                    )
-                else:
                    unready_worker_handle = WorkerProc.make_worker_process(
                        vllm_config=self.vllm_config,
                        local_rank=local_rank,

--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -116,21 +116,7 @@ class CPUModelRunner(GPUModelRunner):
        logger.info("Warming up model for the compilation...")
        # Only generate graph for the generic shape
        with _set_global_compilation_settings(self.vllm_config):
-            self._dummy_run(
+            self.profile_run()
-                min(
-                    max(16, self.max_num_reqs),
-                    self.scheduler_config.max_num_batched_tokens,
-                )
-            )
-        # Warm up drafter for speculative decoding
-        if self.speculative_config and (self.speculative_config.uses_draft_model()):
-            from vllm.v1.spec_decode.draft_model import DraftModelProposer
-            if isinstance(self.drafter, (DraftModelProposer)):
-                logger.info("Warming up drafter model...")
-                self.drafter.dummy_run(max(16, self.max_num_reqs))
        logger.info("Warming up done.")
    def initialize_kv_cache(

--- a/vllm/v1/worker/cpu_worker.py
+++ b/vllm/v1/worker/cpu_worker.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
 import os
 import sys
 from typing import Any
+import psutil
 import torch
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.profiler.wrapper import TorchProfilerWrapper
+from vllm.utils.cpu_resource_utils import (
+    get_allowed_cpu_list,
+    get_memory_node_info,
+    get_visible_memory_node,
+)
+from vllm.utils.mem_utils import format_gib
 from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.cpu_model_runner import CPUModelRunner
 from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment
@@ -27,6 +35,46 @@ class CPUWorker(Worker):
        distributed_init_method: str,
        is_driver_worker: bool = False,
    ):
+        # TODO: use numactl for process setup
+        # TODO: optimize for `interleaved` policy
+        # Bind memory node
+        allowed_memory_nodes = get_visible_memory_node()
+        allowed_cpu_list = get_allowed_cpu_list()
+        cpu_core = allowed_cpu_list[0]
+        # TODO: some CI hosts are not correctly set, change to assertion
+        # after fix
+        if cpu_core.numa_node not in allowed_memory_nodes:
+            logger.warning(
+                "Node %s is not in available memory nodes %s.",
+                cpu_core.numa_node,
+                allowed_memory_nodes,
+            )
+        torch.ops._C.init_cpu_memory_env([cpu_core.numa_node])
+        memory_status = get_memory_node_info(cpu_core.numa_node)
+        memory_fraction = vllm_config.cache_config.gpu_memory_utilization
+        self.requested_cpu_memory = math.ceil(
+            memory_status.total_memory * memory_fraction
+        )
+        available_memory = memory_status.available_memory
+        if (
+            vllm_config.cache_config.kv_cache_memory_bytes is None
+            and self.requested_cpu_memory > available_memory
+        ):
+            raise ValueError(
+                f"Available memory on node {cpu_core.numa_node} "
+                f"({format_gib(available_memory)}/"
+                f"{format_gib(memory_status.total_memory)} GiB) on startup "
+                f"is less than desired CPU memory utilization "
+                f"({vllm_config.cache_config.gpu_memory_utilization}, "
+                f"{format_gib(self.requested_cpu_memory)} GiB). "
+                "Decrease --gpu-memory-utilization"
+                f" or reduce CPU memory used by other processes."
+            )
        super().__init__(
            vllm_config,
            local_rank,
@@ -103,13 +151,69 @@ class CPUWorker(Worker):
        pass
    def determine_available_memory(self) -> int:
-        return self.cache_config.cpu_kvcache_space_bytes or 0
+        self.model_runner.warming_up_model()
+        allowed_cpu_list = get_allowed_cpu_list()
+        cpu_core = allowed_cpu_list[0]
+        memory_status = get_memory_node_info(cpu_core.numa_node)
+        available_memory = memory_status.available_memory
+        explicit_kv_cache_size = self.cache_config.kv_cache_memory_bytes
+        kv_cache_size = None
+        msg = None
+        if explicit_kv_cache_size is not None:
+            if explicit_kv_cache_size > available_memory:
+                raise ValueError(
+                    f"Available memory on node {cpu_core.numa_node} "
+                    f"({format_gib(available_memory)}/"
+                    f"{format_gib(memory_status.total_memory)} GiB) on kv cache"
+                    f" allocation is less than requested memory for kv "
+                    f"({format_gib(explicit_kv_cache_size)} GiB). "
+                    "Decrease --kv-cache-memory-bytes, VLLM_CPU_KVCACHE_SPACE, "
+                    "or reduce CPU memory used by other processes."
+                )
+            kv_cache_size = explicit_kv_cache_size
+            msg = (
+                f"Explicitly set ({format_gib(kv_cache_size)}/"
+                f"{format_gib(memory_status.total_memory)}) GiB for KV cache "
+                f"on node {cpu_core.numa_node}."
+            )
+        else:
+            consumed_memory = psutil.Process(os.getpid()).memory_info().rss
+            requested_memory_for_kv = int(self.requested_cpu_memory - consumed_memory)
+            if (
+                requested_memory_for_kv <= 0
+                or requested_memory_for_kv > available_memory
+            ):
+                raise ValueError(
+                    f"Available memory on node {cpu_core.numa_node} "
+                    f"({format_gib(available_memory)}/"
+                    f"{format_gib(memory_status.total_memory)} GiB) on kv cache"
+                    f" allocation is less than requested memory for kv "
+                    f"({format_gib(requested_memory_for_kv)}/"
+                    f"{format_gib(self.requested_cpu_memory)} GiB). "
+                    "Reduce CPU memory used by other processes."
+                )
+            kv_cache_size = requested_memory_for_kv
+            msg = (
+                f"Auto set ({format_gib(kv_cache_size)}/"
+                f"{format_gib(memory_status.total_memory)}) GiB for KV cache "
+                f"on node {cpu_core.numa_node}, with "
+                f"{format_gib(self.requested_cpu_memory)} GiB requested memory"
+                f" for the worker. {format_gib(consumed_memory)} GiB"
+                f" memory was consumed by non-kv usages."
+            )
+        logger.info(msg)
+        return kv_cache_size
    def compile_or_warm_up_model(self) -> CompilationTimes:
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)
-        self.model_runner.warming_up_model()
+        # Note: the model has been compiled in determine_available_memory()
        return CompilationTimes(
            language_model=self.compilation_config.compilation_time,
            encoder=self.compilation_config.encoder_compilation_time,