Unverified Commit 092ace9e authored by Li, Jiang's avatar Li, Jiang Committed by GitHub
Browse files

[UX] Improve UX of CPU backend (#36968)


Signed-off-by: default avatarjiang1.li <jiang1.li@intel.com>
Signed-off-by: default avatarLi, Jiang <bigpyj64@gmail.com>
Co-authored-by: default avatargemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent f680dc1b
...@@ -21,6 +21,20 @@ steps: ...@@ -21,6 +21,20 @@ steps:
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py" pytest -x -v -s tests/kernels/test_onednn.py"
- label: CPU-Compatibility Tests
depends_on: []
soft_fail: true
device: intel_cpu
no_plugin: true
source_file_dependencies:
- cmake/cpu_extension.cmake
- setup.py
- vllm/platforms/cpu.py
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
bash .buildkite/scripts/hardware_ci/run-cpu-compatibility-test.sh"
- label: CPU-Language Generation and Pooling Model Tests - label: CPU-Language Generation and Pooling Model Tests
depends_on: [] depends_on: []
soft_fail: true soft_fail: true
......
...@@ -25,9 +25,7 @@ fi ...@@ -25,9 +25,7 @@ fi
docker build --file docker/Dockerfile.cpu \ docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \ --build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \ --build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg VLLM_CPU_AVX512BF16=true \ --build-arg VLLM_CPU_X86=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--build-arg VLLM_CPU_AMXBF16=true \
--tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \ --tag "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-cpu \
--target vllm-test \ --target vllm-test \
--progress plain . --progress plain .
......
...@@ -83,7 +83,7 @@ steps: ...@@ -83,7 +83,7 @@ steps:
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35" - "bash .buildkite/scripts/upload-nightly-wheels.sh manylinux_2_35"
...@@ -152,7 +152,7 @@ steps: ...@@ -152,7 +152,7 @@ steps:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env: env:
......
#!/bin/bash
# CPU compatibility test setup: configures the environment for the test run,
# then downloads, verifies and unpacks the pinned Intel SDE archive into ./sde
# so that ./sde/sde64 is available to the rest of the script.
set -euox pipefail

# Environment for the vLLM runs launched by this script.
export VLLM_CPU_KVCACHE_SPACE=1
export VLLM_CPU_CI_ENV=1
# Reduce sub-processes for acceleration
export TORCH_COMPILE_DISABLE=1
export VLLM_ENABLE_V1_MULTIPROCESSING=0

# Pinned SDE release and the SHA-256 digest it must match.
SDE_ARCHIVE="sde-external-10.7.0-2026-02-18-lin.tar.xz"
SDE_CHECKSUM="CA3D4086DE4ACB3FAEDF9F57B541C6936B7D5E19AE2BF763B6EA933573A0A217"

# -O makes the download overwrite any archive left behind by a previous run.
# Without it wget stores the new copy as "<name>.1" and the checksum and
# extraction steps below would operate on the stale (possibly partial) file.
wget -O "${SDE_ARCHIVE}" "https://downloadmirror.intel.com/913594/${SDE_ARCHIVE}"

# Abort (via `set -e`) if the archive does not match the pinned digest.
echo "${SDE_CHECKSUM} ${SDE_ARCHIVE}" | sha256sum --check

# Unpack into ./sde, dropping the archive's top-level directory.
mkdir -p sde
tar -xvf "./${SDE_ARCHIVE}" --strip-components=1 -C ./sde/
# Wait for a background process and report how it exited.
#
# Arguments:
#   $1 - PID of a background child of the current shell
#   $2 - path of the log file that captured the process output
# Returns:
#   0 if the process exited with status 0.
#   1 if an argument is missing, or if the process exited non-zero (in which
#     case the last 50 lines of the log file are printed first).
wait_for_pid_and_check_log() {
  # Default to empty strings so that a missing argument reaches the usage
  # message below instead of aborting with "unbound variable" under `set -u`.
  local pid="${1:-}"
  local log_file="${2:-}"
  local exit_status=0

  if [ -z "$pid" ] || [ -z "$log_file" ]; then
    echo "Usage: wait_for_pid_and_check_log <PID> <LOG_FILE>" >&2
    return 1
  fi

  echo "Waiting for process $pid to finish..."
  # `wait` returns the exit status of the waited-for process. Capturing it on
  # the right-hand side of `||` keeps a failing child from tripping `set -e`.
  wait "$pid" || exit_status=$?

  if [ "$exit_status" -ne 0 ]; then
    echo "Process $pid finished with exit status $exit_status (Failure)."
    echo "Process exited with a non-zero status."
    echo "--- Last few lines of log file: $log_file ---"
    # Guard the tail so a missing log cannot abort the report half-way.
    if [ -r "$log_file" ]; then
      tail -n 50 "$log_file"
    else
      echo "(log file $log_file is missing or unreadable)"
    fi
    echo "---------------------------------------------"
    return 1 # Indicate failure based on exit status
  fi

  echo "Process $pid finished with exit status $exit_status (Success)."
  # Only the exit status is checked; the log content itself is not inspected.
  echo "Process exited successfully."
  return 0
}
# Launch the same offline-inference example under three emulated CPU profiles.
# The runs execute in parallel in the background, each writing to its own log.

# Test Sky Lake (AVX512F)
# NOTE(review): in Intel SDE `-skl` may select the Skylake *client* profile
# (no AVX-512) while `-skx` is the Skylake server profile - confirm which one
# is intended here.
./sde/sde64 -skl -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_0.log 2>&1 &
PID_TEST_0=$!

# Test Cascade Lake (AVX512F + VNNI)
./sde/sde64 -clx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_1.log 2>&1 &
PID_TEST_1=$!

# Test Cooper Lake (AVX512F + VNNI + BF16)
./sde/sde64 -cpx -- python3 examples/basic/offline_inference/generate.py --model facebook/opt-125m --dtype bfloat16 > test_2.log 2>&1 &
PID_TEST_2=$!

# Wait for every run before deciding the outcome. Under `set -e` a bare failing
# call would abort the script immediately, leaving the remaining runs unreaped
# and their logs unreported; `|| FAILED=1` records the failure and carries on.
FAILED=0
wait_for_pid_and_check_log "$PID_TEST_0" test_0.log || FAILED=1
wait_for_pid_and_check_log "$PID_TEST_1" test_1.log || FAILED=1
wait_for_pid_and_check_log "$PID_TEST_2" test_2.log || FAILED=1

if [ "$FAILED" -ne 0 ]; then
  echo "At least one CPU compatibility test failed."
  exit 1
fi
...@@ -102,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA) ...@@ -102,11 +102,13 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64" OR ENABLE_X86_ISA)
"-mavx512f" "-mavx512f"
"-mavx512vl" "-mavx512vl"
"-mavx512bw" "-mavx512bw"
"-mavx512dq" "-mavx512dq")
"-mavx512bf16" list(APPEND CXX_COMPILE_FLAGS_AVX512_AMX
"-mavx512vnni" ${CXX_COMPILE_FLAGS_AVX512}
"-mamx-bf16" "-mamx-bf16"
"-mamx-tile") "-mamx-tile"
"-mavx512bf16"
"-mavx512vnni")
list(APPEND CXX_COMPILE_FLAGS_AVX2 list(APPEND CXX_COMPILE_FLAGS_AVX2
"-mavx2") "-mavx2")
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
...@@ -314,7 +316,8 @@ endif() ...@@ -314,7 +316,8 @@ endif()
# TODO: Refactor this # TODO: Refactor this
if (ENABLE_X86_ISA) if (ENABLE_X86_ISA)
message(STATUS "CPU extension (AVX512) compile flags: ${CXX_COMPILE_FLAGS_AVX512}") message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) compile flags: ${CXX_COMPILE_FLAGS_AVX512_AMX}")
message(STATUS "CPU extension (AVX512F) compile flags: ${CXX_COMPILE_FLAGS_AVX512}")
message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}") message(STATUS "CPU extension (AVX2) compile flags: ${CXX_COMPILE_FLAGS_AVX2}")
else() else()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
...@@ -366,13 +369,15 @@ if(USE_ONEDNN) ...@@ -366,13 +369,15 @@ if(USE_ONEDNN)
endif() endif()
if (ENABLE_X86_ISA) if (ENABLE_X86_ISA)
set(VLLM_EXT_SRC_AVX512 set(VLLM_EXT_SRC_SGL
"csrc/cpu/sgl-kernels/gemm.cpp" "csrc/cpu/sgl-kernels/gemm.cpp"
"csrc/cpu/sgl-kernels/gemm_int8.cpp" "csrc/cpu/sgl-kernels/gemm_int8.cpp"
"csrc/cpu/sgl-kernels/gemm_fp8.cpp" "csrc/cpu/sgl-kernels/gemm_fp8.cpp"
"csrc/cpu/sgl-kernels/moe.cpp" "csrc/cpu/sgl-kernels/moe.cpp"
"csrc/cpu/sgl-kernels/moe_int8.cpp" "csrc/cpu/sgl-kernels/moe_int8.cpp"
"csrc/cpu/sgl-kernels/moe_fp8.cpp" "csrc/cpu/sgl-kernels/moe_fp8.cpp")
set(VLLM_EXT_SRC_AVX512
"csrc/cpu/shm.cpp" "csrc/cpu/shm.cpp"
"csrc/cpu/cpu_wna16.cpp" "csrc/cpu/cpu_wna16.cpp"
"csrc/cpu/cpu_fused_moe.cpp" "csrc/cpu/cpu_fused_moe.cpp"
...@@ -398,31 +403,48 @@ if (ENABLE_X86_ISA) ...@@ -398,31 +403,48 @@ if (ENABLE_X86_ISA)
"csrc/cpu/pos_encoding.cpp" "csrc/cpu/pos_encoding.cpp"
"csrc/moe/dynamic_4bit_int_moe_cpu.cpp") "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
message(STATUS "CPU extension (AVX512) source files: ${VLLM_EXT_SRC_AVX512}") message(STATUS "CPU extension (AVX512F + BF16 + VNNI + AMX) source files: ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}")
message(STATUS "CPU extension (AVX512F) source files: ${VLLM_EXT_SRC_AVX512}")
message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}") message(STATUS "CPU extension (AVX2) source files: ${VLLM_EXT_SRC_AVX2}")
set(_C_LIBS numa dnnl_ext)
set(_C_AVX512_LIBS numa dnnl_ext)
set(_C_AVX2_LIBS numa)
# AMX + AVX512F + AVX512BF16 + AVX512VNNI
define_extension_target( define_extension_target(
_C _C
DESTINATION vllm DESTINATION vllm
LANGUAGE CXX LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512} SOURCES ${VLLM_EXT_SRC_AVX512} ${VLLM_EXT_SRC_SGL}
LIBRARIES ${LIBS} LIBRARIES ${_C_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512} COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512_AMX}
USE_SABI 3 USE_SABI 3
WITH_SOABI WITH_SOABI
) )
# For SGL kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AVX512")
# For AMX kernels # For AMX kernels
target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16") target_compile_definitions(_C PRIVATE "-DCPU_CAPABILITY_AMXBF16")
# AVX512F
define_extension_target(
_C_AVX512
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX512}
LIBRARIES ${_C_AVX512_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}
USE_SABI 3
WITH_SOABI
)
# AVX2
define_extension_target( define_extension_target(
_C_AVX2 _C_AVX2
DESTINATION vllm DESTINATION vllm
LANGUAGE CXX LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC_AVX2} SOURCES ${VLLM_EXT_SRC_AVX2}
LIBRARIES ${LIBS} LIBRARIES ${_C_AVX2_LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2} COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX2}
USE_SABI 3 USE_SABI 3
WITH_SOABI WITH_SOABI
......
...@@ -14,12 +14,7 @@ ...@@ -14,12 +14,7 @@
# #
# Build arguments: # Build arguments:
# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10 # PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
# VLLM_CPU_DISABLE_AVX512=false (default)|true # VLLM_CPU_X86=false (default)|true (for cross-compilation)
# VLLM_CPU_AVX2=false (default)|true (for cross-compilation)
# VLLM_CPU_AVX512=false (default)|true (for cross-compilation)
# VLLM_CPU_AVX512BF16=false (default)|true (for cross-compilation)
# VLLM_CPU_AVX512VNNI=false (default)|true (for cross-compilation)
# VLLM_CPU_AMXBF16=false (default)|true (for cross-compilation)
# VLLM_CPU_ARM_BF16=false (default)|true (for cross-compilation) # VLLM_CPU_ARM_BF16=false (default)|true (for cross-compilation)
# #
...@@ -36,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ...@@ -36,7 +31,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \ apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \ && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \ gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof xz-utils \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh && curl -LsSf https://astral.sh/uv/install.sh | sh
...@@ -91,24 +86,9 @@ ARG max_jobs=32 ...@@ -91,24 +86,9 @@ ARG max_jobs=32
ENV MAX_JOBS=${max_jobs} ENV MAX_JOBS=${max_jobs}
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... # Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ...
ARG VLLM_CPU_DISABLE_AVX512=0 ARG VLLM_CPU_X86=0
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} ENV VLLM_CPU_X86=${VLLM_CPU_X86}
# Support for cross-compilation with AVX2 ISA: docker build --build-arg VLLM_CPU_AVX2="1" ...
ARG VLLM_CPU_AVX2=0
ENV VLLM_CPU_AVX2=${VLLM_CPU_AVX2}
# Support for cross-compilation with AVX512 ISA: docker build --build-arg VLLM_CPU_AVX512="1" ...
ARG VLLM_CPU_AVX512=0
ENV VLLM_CPU_AVX512=${VLLM_CPU_AVX512}
# Support for building with AVX512BF16 ISA: docker build --build-arg VLLM_CPU_AVX512BF16="true" ...
ARG VLLM_CPU_AVX512BF16=0
ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
ARG VLLM_CPU_AMXBF16=1
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
# Support for cross-compilation with ARM BF16 ISA: docker build --build-arg VLLM_CPU_ARM_BF16="true" ... # Support for cross-compilation with ARM BF16 ISA: docker build --build-arg VLLM_CPU_ARM_BF16="true" ...
ARG VLLM_CPU_ARM_BF16=0 ARG VLLM_CPU_ARM_BF16=0
ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16} ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
...@@ -116,7 +96,7 @@ ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16} ...@@ -116,7 +96,7 @@ ENV VLLM_CPU_ARM_BF16=${VLLM_CPU_ARM_BF16}
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
# Validate build arguments - prevent mixing incompatible ISA flags # Validate build arguments - prevent mixing incompatible ISA flags
RUN if [ "$TARGETARCH" = "arm64" ] && { [ "$VLLM_CPU_AVX2" != "0" ] || [ "$VLLM_CPU_AVX512" != "0" ] || [ "$VLLM_CPU_AVX512BF16" != "0" ] || [ "$VLLM_CPU_AVX512VNNI" != "0" ]; }; then \ RUN if [ "$TARGETARCH" = "arm64" ] && [ "$VLLM_CPU_X86" != "0" ]; then \
echo "ERROR: Cannot use x86-specific ISA flags (AVX2, AVX512, etc.) when building for ARM64 (--platform=linux/arm64)"; \ echo "ERROR: Cannot use x86-specific ISA flags (AVX2, AVX512, etc.) when building for ARM64 (--platform=linux/arm64)"; \
exit 1; \ exit 1; \
fi && \ fi && \
...@@ -174,7 +154,7 @@ WORKDIR /vllm-workspace ...@@ -174,7 +154,7 @@ WORKDIR /vllm-workspace
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl xz-utils make clangd-14 apt-get install -y --no-install-recommends vim numactl make clangd-14
RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd RUN ln -s /usr/bin/clangd-14 /usr/bin/clangd
...@@ -232,22 +212,12 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm" ...@@ -232,22 +212,12 @@ LABEL org.opencontainers.image.source="https://github.com/vllm-project/vllm"
# Build configuration labels # Build configuration labels
ARG TARGETARCH ARG TARGETARCH
ARG VLLM_CPU_DISABLE_AVX512 ARG VLLM_CPU_X86
ARG VLLM_CPU_AVX2
ARG VLLM_CPU_AVX512
ARG VLLM_CPU_AVX512BF16
ARG VLLM_CPU_AVX512VNNI
ARG VLLM_CPU_AMXBF16
ARG VLLM_CPU_ARM_BF16 ARG VLLM_CPU_ARM_BF16
ARG PYTHON_VERSION ARG PYTHON_VERSION
LABEL ai.vllm.build.target-arch="${TARGETARCH}" LABEL ai.vllm.build.target-arch="${TARGETARCH}"
LABEL ai.vllm.build.cpu-disable-avx512="${VLLM_CPU_DISABLE_AVX512:-false}" LABEL ai.vllm.build.cpu-x86="${VLLM_CPU_X86:-false}"
LABEL ai.vllm.build.cpu-avx2="${VLLM_CPU_AVX2:-false}"
LABEL ai.vllm.build.cpu-avx512="${VLLM_CPU_AVX512:-false}"
LABEL ai.vllm.build.cpu-avx512bf16="${VLLM_CPU_AVX512BF16:-false}"
LABEL ai.vllm.build.cpu-avx512vnni="${VLLM_CPU_AVX512VNNI:-false}"
LABEL ai.vllm.build.cpu-amxbf16="${VLLM_CPU_AMXBF16:-false}"
LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}" LABEL ai.vllm.build.cpu-arm-bf16="${VLLM_CPU_ARM_BF16:-false}"
LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}" LABEL ai.vllm.build.python-version="${PYTHON_VERSION:-3.12}"
......
...@@ -7,7 +7,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data ...@@ -7,7 +7,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
--8<-- [start:requirements] --8<-- [start:requirements]
- OS: Linux - OS: Linux
- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional) - CPU flags: `avx512f` (Recommended), `avx2` (Limited features)
!!! tip !!! tip
Use `lscpu` to check the CPU flags. Use `lscpu` to check the CPU flags.
...@@ -18,7 +18,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data ...@@ -18,7 +18,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
--8<-- [end:set-up-using-python] --8<-- [end:set-up-using-python]
--8<-- [start:pre-built-wheels] --8<-- [start:pre-built-wheels]
Pre-built vLLM wheels for x86 with AVX512 are available since version 0.13.0. To install release wheels: Pre-built vLLM wheels for x86 with AVX512/AVX2 are available since version 0.17.0. To install release wheels:
```bash ```bash
export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//') export VLLM_VERSION=$(curl -s https://api.github.com/repos/vllm-project/vllm/releases/latest | jq -r .tag_name | sed 's/^v//')
...@@ -108,13 +108,13 @@ VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation ...@@ -108,13 +108,13 @@ VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
If you want to develop vLLM, install it in editable mode instead. If you want to develop vLLM, install it in editable mode instead.
```bash ```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation VLLM_TARGET_DEVICE=cpu python3 setup.py develop
``` ```
Optionally, build a portable wheel which you can then install elsewhere: Optionally, build a portable wheel which you can then install elsewhere:
```bash ```bash
VLLM_TARGET_DEVICE=cpu uv build --wheel VLLM_TARGET_DEVICE=cpu uv build --wheel --no-build-isolation
``` ```
```bash ```bash
...@@ -185,12 +185,9 @@ docker run \ ...@@ -185,12 +185,9 @@ docker run \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
-p 8000:8000 \ -p 8000:8000 \
--env "HF_TOKEN=<secret>" \ --env "HF_TOKEN=<secret>" \
vllm/vllm-openai-cpu:latest-x86_64 <args...> vllm/vllm-openai-cpu:latest-x86_64 <args...>
``` ```
!!! warning
If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. See the build-image-from-source section below for build arguments to match your target CPU capabilities.
--8<-- [end:pre-built-images] --8<-- [end:pre-built-images]
--8<-- [start:build-image-from-source] --8<-- [start:build-image-from-source]
...@@ -198,50 +195,11 @@ vllm/vllm-openai-cpu:latest-x86_64 <args...> ...@@ -198,50 +195,11 @@ vllm/vllm-openai-cpu:latest-x86_64 <args...>
```bash ```bash
docker build -f docker/Dockerfile.cpu \ docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_DISABLE_AVX512=<false (default)|true> \ --build-arg VLLM_CPU_X86=<false (default)|true> \ # For cross-compilation
--build-arg VLLM_CPU_AVX2=<false (default)|true> \
--build-arg VLLM_CPU_AVX512=<false (default)|true> \
--build-arg VLLM_CPU_AVX512BF16=<false (default)|true> \
--build-arg VLLM_CPU_AVX512VNNI=<false (default)|true> \
--build-arg VLLM_CPU_AMXBF16=<false|true (default)> \
--tag vllm-cpu-env \ --tag vllm-cpu-env \
--target vllm-openai . --target vllm-openai .
``` ```
!!! note "Auto-detection by default"
By default, CPU instruction sets (AVX512, AVX2, etc.) are automatically detected from the build system's CPU flags. Build arguments like `VLLM_CPU_AVX2`, `VLLM_CPU_AVX512`, `VLLM_CPU_AVX512BF16`, `VLLM_CPU_AVX512VNNI`, and `VLLM_CPU_AMXBF16` are used for cross-compilation:
- `VLLM_CPU_{ISA}=true` - Force-enable the instruction set (build with ISA regardless of build system capabilities)
- `VLLM_CPU_{ISA}=false` - Rely on auto-detection (default)
##### Examples
###### Auto-detection build (default)
```bash
docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
```
###### Cross-compile for AVX512
```bash
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_AVX512=true \
--build-arg VLLM_CPU_AVX512BF16=true \
--build-arg VLLM_CPU_AVX512VNNI=true \
--tag vllm-cpu-avx512 \
--target vllm-openai .
```
###### Cross-compile for AVX2
```bash
docker build -f docker/Dockerfile.cpu \
--build-arg VLLM_CPU_AVX2=true \
--tag vllm-cpu-avx2 \
--target vllm-openai .
```
#### Launching the OpenAI server #### Launching the OpenAI server
```bash ```bash
......
...@@ -920,6 +920,7 @@ if _is_cpu(): ...@@ -920,6 +920,7 @@ if _is_cpu():
if platform.machine() in ("x86_64", "AMD64"): if platform.machine() in ("x86_64", "AMD64"):
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
ext_modules.append(CMakeExtension(name="vllm._C_AVX512"))
ext_modules.append(CMakeExtension(name="vllm._C_AVX2")) ext_modules.append(CMakeExtension(name="vllm._C_AVX2"))
else: else:
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
......
...@@ -252,6 +252,8 @@ class CpuPlatform(Platform): ...@@ -252,6 +252,8 @@ class CpuPlatform(Platform):
if vllm_config.lora_config is not None: if vllm_config.lora_config is not None:
compilation_config.mode = CompilationMode.NONE compilation_config.mode = CompilationMode.NONE
vllm_config.profiler_config.torch_profiler_dump_cuda_time_total = False
assert vllm_config.device_config.device_type == "cpu" assert vllm_config.device_config.device_type == "cpu"
# #
...@@ -470,20 +472,31 @@ class CpuPlatform(Platform): ...@@ -470,20 +472,31 @@ class CpuPlatform(Platform):
@classmethod @classmethod
def import_kernels(cls) -> None: def import_kernels(cls) -> None:
if Platform.get_cpu_architecture() in (CpuArchEnum.X86,): if Platform.get_cpu_architecture() in (CpuArchEnum.X86,):
if torch._C._cpu._is_avx512_supported(): # Note: The lib name is _C_AVX2/AVX512, but the module name is _C.
# This will cause an exception "dynamic module does not define
# module export function". But the library is imported
# successfully. So ignore the exception for now, until we find
# a solution.
ignored_msg = "dynamic module does not define module export function"
if torch.cpu._is_avx512_supported():
if torch.cpu._is_avx512_bf16_supported():
try: try:
import vllm._C # noqa: F401 import vllm._C # noqa: F401
except ImportError as e: except ImportError as e:
logger.warning("Failed to import from vllm._C: %r", e) logger.warning("Failed to import from vllm._C: %r", e)
else: else:
# Note: The lib name is _C_AVX2, but the module name is _C. try:
# This will cause a exception "dynamic module does define import vllm._C_AVX512 # noqa: F401
# module export function". But the library is imported except ImportError as e:
# successfully. So ignore the exception for now, until we find if ignored_msg not in e.msg:
# a solution. logger.warning(
"Failed to import from vllm._C_AVX512: %r", e
)
else:
try: try:
import vllm._C_AVX2 # noqa: F401 import vllm._C_AVX2 # noqa: F401
except ImportError as e: except ImportError as e:
if ignored_msg not in e.msg:
logger.warning("Failed to import from vllm._C_AVX2: %r", e) logger.warning("Failed to import from vllm._C_AVX2: %r", e)
else: else:
try: try:
......
...@@ -52,6 +52,21 @@ class CPUWorker(Worker): ...@@ -52,6 +52,21 @@ class CPUWorker(Worker):
) )
def init_device(self): def init_device(self):
# Check whether critical libraries are loaded
# Raise RuntimeError unless `name` occurs as a substring of the LD_PRELOAD
# environment variable; an unset LD_PRELOAD is treated as an empty string.
def check_preloaded_libs(name: str):
ld_preload_list = os.environ.get("LD_PRELOAD", "")
# Substring match on the raw variable, so any entry containing `name` passes.
if name not in ld_preload_list:
raise RuntimeError(
f"{name} is not found in LD_PRELOAD. "
"Please follow the section `set LD_PRELOAD` in "
"https://docs.vllm.ai/en/latest/getting_started/installation/cpu/ "
"to setup required pre-loaded libraries."
)
check_preloaded_libs("libtcmalloc")
if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
check_preloaded_libs("libiomp")
# Setup OpenMP threads affinity. # Setup OpenMP threads affinity.
omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND
# Under numa binding some cores reserved for kv transfer in nixl_connector.py # Under numa binding some cores reserved for kv transfer in nixl_connector.py
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment