Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
07daee13
Unverified
Commit
07daee13
authored
Feb 05, 2026
by
Li, Jiang
Committed by
GitHub
Feb 05, 2026
Browse files
[CI/Build] Parallelize CPU CI tests (#33778)
Signed-off-by:
jiang1.li
<
jiang1.li@intel.com
>
parent
9595afda
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
157 additions
and
130 deletions
+157
-130
.buildkite/hardware_tests/arm.yaml
.buildkite/hardware_tests/arm.yaml
+0
-8
.buildkite/hardware_tests/cpu.yaml
.buildkite/hardware_tests/cpu.yaml
+100
-0
.buildkite/hardware_tests/intel.yaml
.buildkite/hardware_tests/intel.yaml
+0
-7
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
...ite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
+26
-0
.buildkite/scripts/hardware_ci/run-cpu-test.sh
.buildkite/scripts/hardware_ci/run-cpu-test.sh
+9
-109
vllm/v1/worker/cpu_worker.py
vllm/v1/worker/cpu_worker.py
+22
-6
No files found.
.buildkite/hardware_tests/arm.yaml
deleted
100644 → 0
View file @
9595afda
group
:
Hardware
steps
:
-
label
:
"
Arm
CPU
Test"
soft_fail
:
true
device
:
arm_cpu
no_plugin
:
true
commands
:
-
bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
.buildkite/hardware_tests/cpu.yaml
0 → 100644
View file @
07daee13
group
:
CPU
depends_on
:
[]
steps
:
-
label
:
CPU-Kernel Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
-
csrc/cpu/
-
cmake/cpu_extension.cmake
-
CMakeLists.txt
-
vllm/_custom_ops.py
-
tests/kernels/attention/test_cpu_attn.py
-
tests/kernels/moe/test_cpu_fused_moe.py
-
tests/kernels/test_onednn.py
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
-
label
:
CPU-Language Generation and Pooling Model Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
-
csrc/cpu/
-
vllm/
-
tests/models/language/generation/
-
tests/models/language/pooling/
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model"
-
label
:
CPU-Quantization Model Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
-
csrc/cpu/
-
vllm/model_executor/layers/quantization/cpu_wna16.py
-
vllm/model_executor/layers/quantization/gptq_marlin.py
-
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
-
vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py
-
vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py
-
tests/quantization/test_compressed_tensors.py
-
tests/quantization/test_cpu_wna16.py
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m "
pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs
pytest -x -v -s tests/quantization/test_cpu_wna16.py"
-
label
:
CPU-TP/DP/PP Tests
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
-
csrc/cpu/shm.cpp
-
vllm/v1/worker/cpu_worker.py
-
vllm/v1/worker/gpu_worker.py
-
vllm/v1/worker/cpu_model_runner.py
-
vllm/v1/worker/gpu_model_runner.py
-
vllm/platforms/cpu.py
-
vllm/distributed/parallel_state.py
-
vllm/distributed/device_communicators/cpu_communicator.py
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 10m "
bash .buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh"
-
label
:
CPU-Multi-Modal Model Tests %N
depends_on
:
[]
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
source_file_dependencies
:
# - vllm/
-
vllm/model_executor/layers/rotary_embedding
-
tests/models/multimodal/generation/
commands
:
-
|
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 45m "
pytest -x -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_pixtral.py -m cpu_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB"
parallelism
:
2
-
label
:
"
Arm
CPU
Test"
depends_on
:
[]
soft_fail
:
true
device
:
arm_cpu
no_plugin
:
true
commands
:
-
bash .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
.buildkite/hardware_tests/intel.yaml
View file @
07daee13
group
:
Hardware
depends_on
:
~
steps
:
-
label
:
"
Intel
CPU
Test"
soft_fail
:
true
device
:
intel_cpu
no_plugin
:
true
commands
:
-
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh
-
label
:
"
Intel
HPU
Test"
soft_fail
:
true
device
:
intel_hpu
...
...
.buildkite/scripts/hardware_ci/run-cpu-distributed-smoke-test.sh
0 → 100644
View file @
07daee13
#!/bin/bash
set
-euox
pipefail
echo
"--- PP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-pp
=
2 &
server_pid
=
$!
timeout
600 bash
-c
"until curl localhost:8000/v1/models; do sleep 1; done"
||
exit
1
vllm bench serve
\
--backend
vllm
\
--dataset-name
random
\
--model
meta-llama/Llama-3.2-3B-Instruct
\
--num-prompts
20
\
--endpoint
/v1/completions
kill
-s
SIGTERM
$server_pid
&
echo
"--- DP+TP"
vllm serve meta-llama/Llama-3.2-3B-Instruct
-tp
=
2
-dp
=
2 &
server_pid
=
$!
timeout
600 bash
-c
"until curl localhost:8000/v1/models; do sleep 1; done"
||
exit
1
vllm bench serve
\
--backend
vllm
\
--dataset-name
random
\
--model
meta-llama/Llama-3.2-3B-Instruct
\
--num-prompts
20
\
--endpoint
/v1/completions
kill
-s
SIGTERM
$server_pid
&
.buildkite/scripts/hardware_ci/run-cpu-test.sh
View file @
07daee13
...
...
@@ -2,119 +2,19 @@
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set
-e
x
set
-e
uox
pipefail
# allow binding to different cores
CORE_RANGE
=
${
CORE_RANGE
:-
48
-95
}
# used for TP/PP E2E test
OMP_CORE_RANGE
=
${
OMP_CORE_RANGE
:-
48
-95
}
NUMA_NODE
=
${
NUMA_NODE
:-
1
}
IMAGE_NAME
=
"cpu-test-
$NUMA_NODE
"
TIMEOUT_VAL
=
$1
TEST_COMMAND
=
$2
export
CMAKE_BUILD_PARALLEL_LEVEL
=
32
# Setup cleanup
remove_docker_container
()
{
set
-e
;
docker
rm
-f
cpu-test-
"
$NUMA_NODE
"
cpu-test-
"
$NUMA_NODE
"
-avx2
||
true
;
}
trap
remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl
-C
"
$CORE_RANGE
"
-N
"
$NUMA_NODE
"
docker build
--progress
plain
--tag
cpu-test-
"
$NUMA_NODE
"
--target
vllm-test
-f
docker/Dockerfile.cpu
.
numactl
-C
"
$CORE_RANGE
"
-N
"
$NUMA_NODE
"
docker build
--progress
plain
--build-arg
VLLM_CPU_DISABLE_AVX512
=
"true"
--tag
cpu-test-
"
$NUMA_NODE
"
-avx2
--target
vllm-test
-f
docker/Dockerfile.cpu
.
# building the docker image
echo
"--- :docker: Building Docker image"
docker build
--progress
plain
--tag
"
$IMAGE_NAME
"
--target
vllm-test
-f
docker/Dockerfile.cpu
.
# Run the image, setting --shm-size=4g for tensor parallel.
docker run
-itd
--cpuset-cpus
=
"
$CORE_RANGE
"
--cpuset-mems
=
"
$NUMA_NODE
"
--entrypoint
/bin/bash
-v
~/.cache/huggingface:/root/.cache/huggingface
--privileged
=
true
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
16
--env
VLLM_CPU_CI_ENV
=
1
-e
E2E_OMP_THREADS
=
"
$OMP_CORE_RANGE
"
--shm-size
=
4g
--name
cpu-test-
"
$NUMA_NODE
"
cpu-test-
"
$NUMA_NODE
"
docker run
-itd
--cpuset-cpus
=
"
$CORE_RANGE
"
--cpuset-mems
=
"
$NUMA_NODE
"
--entrypoint
/bin/bash
-v
~/.cache/huggingface:/root/.cache/huggingface
--privileged
=
true
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
16
--env
VLLM_CPU_CI_ENV
=
1
-e
E2E_OMP_THREADS
=
"
$OMP_CORE_RANGE
"
--shm-size
=
4g
--name
cpu-test-
"
$NUMA_NODE
"
-avx2
cpu-test-
"
$NUMA_NODE
"
-avx2
function
cpu_tests
()
{
set
-e
export
NUMA_NODE
=
$2
# list packages
docker
exec
cpu-test-
"
$NUMA_NODE
"
-avx2
bash
-c
"
set -e
pip list"
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
"
set -e
pip list"
# offline inference
docker
exec
cpu-test-
"
$NUMA_NODE
"
-avx2
bash
-c
"
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run kernel tests
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
"
set -e
pytest -x -v -s tests/kernels/attention/test_cpu_attn.py
pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py
pytest -x -v -s tests/kernels/test_onednn.py"
# Run basic model test
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
"
set -e
# Note: disable until supports V1
# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -x -v -s tests/models/language/generation -m cpu_model
VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model
pytest -x -v -s tests/models/multimodal/generation
\
--ignore=tests/models/multimodal/generation/test_pixtral.py
\
-m cpu_model"
# Run compressed-tensor test
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
"
set -e
pytest -x -s -v
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Run AWQ/GPTQ test
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
"
set -e
pytest -x -s -v
\
tests/quantization/test_cpu_wna16.py"
# Run multi-lora tests
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
"
set -e
pytest -x -s -v
\
tests/lora/test_qwenvl.py"
# online serving: tp+pp
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
'
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
# online serving: tp+dp
docker
exec
cpu-test-
"
$NUMA_NODE
"
bash
-c
'
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions
kill -s SIGTERM $server_pid &'
}
# All CPU tests are expected to finish in less than 40 minutes.
export
-f
cpu_tests
timeout
2.5h bash
-c
"cpu_tests
$CORE_RANGE
$NUMA_NODE
"
docker run
--rm
--cpuset-cpus
=
$CORE_RANGE
--cpuset-mems
=
$NUMA_NODE
-v
~/.cache/huggingface:/root/.cache/huggingface
--privileged
=
true
-e
HF_TOKEN
-e
VLLM_CPU_KVCACHE_SPACE
=
16
-e
VLLM_CPU_CI_ENV
=
1
-e
VLLM_CPU_SIM_MULTI_NUMA
=
1
--shm-size
=
4g
$IMAGE_NAME
\
timeout
$TIMEOUT_VAL
bash
-c
"set -euox pipefail; echo
\"
--- Print packages
\"
; pip list; echo
\"
--- Running tests
\"
;
${
TEST_COMMAND
}
"
vllm/v1/worker/cpu_worker.py
View file @
07daee13
...
...
@@ -136,22 +136,38 @@ class CPUWorker(Worker):
the LogicalCPUInfo.id. A selected LogicalCPUInfo list should be
returned.
"""
# simulate multiple numa nodes, for testing
sim_multi_numa_nodes
=
os
.
environ
.
get
(
"VLLM_CPU_SIM_MULTI_NUMA"
,
"0"
)
!=
"0"
allowed_numa_nodes
,
logical_cpu_list
=
(
CpuPlatform
.
get_allowed_cpu_core_node_list
()
)
assert
len
(
allowed_numa_nodes
)
>=
self
.
parallel_config
.
world_size
,
(
assert
(
len
(
allowed_numa_nodes
)
>=
self
.
parallel_config
.
world_size
or
sim_multi_numa_nodes
),
(
f
"Not enough allowed NUMA nodes to bind threads of "
f
"
{
self
.
parallel_config
.
world_size
}
CPUWorkers. "
f
"Allowed NUMA nodes are
{
allowed_numa_nodes
}
. "
"Please try to bind threads manually."
)
if
not
sim_multi_numa_nodes
:
# Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`
selected_numa_node
=
allowed_numa_nodes
[
self
.
local_rank
]
# type: ignore
logical_cpu_list
=
[
x
for
x
in
logical_cpu_list
if
x
.
numa_node
==
selected_numa_node
]
else
:
assert
len
(
logical_cpu_list
)
>=
self
.
parallel_config
.
world_size
logical_cpu_list
=
sorted
(
logical_cpu_list
,
key
=
lambda
x
:
x
.
numa_node
)
sim_cpu_num_per_node
=
(
len
(
logical_cpu_list
)
//
self
.
parallel_config
.
world_size
)
start_idx
=
self
.
local_rank
*
sim_cpu_num_per_node
logical_cpu_list
=
logical_cpu_list
[
start_idx
:
(
start_idx
+
sim_cpu_num_per_node
)
]
# Select CPUs from each physical core via cpu_selector
core_to_cpus
:
dict
[
int
,
list
[
LogicalCPUInfo
]]
=
{}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment