Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fcfc474d
Commit
fcfc474d
authored
Apr 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-dev
parents
bb94d2e5
296c6572
Changes
503
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
181 additions
and
143 deletions
+181
-143
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
.../nightly-benchmarks/scripts/run-performance-benchmarks.sh
+21
-6
.buildkite/nightly-benchmarks/tests/serving-tests.json
.buildkite/nightly-benchmarks/tests/serving-tests.json
+6
-4
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+9
-9
.buildkite/scripts/hardware_ci/run-amd-test.sh
.buildkite/scripts/hardware_ci/run-amd-test.sh
+23
-7
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+1
-1
.buildkite/scripts/hardware_ci/run-cpu-test.sh
.buildkite/scripts/hardware_ci/run-cpu-test.sh
+11
-7
.buildkite/scripts/hardware_ci/run-gh200-test.sh
.buildkite/scripts/hardware_ci/run-gh200-test.sh
+1
-0
.buildkite/scripts/hardware_ci/run-hpu-test.sh
.buildkite/scripts/hardware_ci/run-hpu-test.sh
+1
-1
.buildkite/scripts/hardware_ci/run-neuron-test.sh
.buildkite/scripts/hardware_ci/run-neuron-test.sh
+1
-1
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+14
-5
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+1
-1
.buildkite/scripts/run-benchmarks.sh
.buildkite/scripts/run-benchmarks.sh
+0
-0
.buildkite/scripts/run-multi-node-test.sh
.buildkite/scripts/run-multi-node-test.sh
+1
-1
.buildkite/scripts/upload-wheels.sh
.buildkite/scripts/upload-wheels.sh
+0
-0
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+20
-16
.github/mergify.yml
.github/mergify.yml
+31
-1
.github/workflows/lint-and-deploy.yaml
.github/workflows/lint-and-deploy.yaml
+1
-1
.pre-commit-config.yaml
.pre-commit-config.yaml
+3
-0
CMakeLists.txt
CMakeLists.txt
+36
-13
Dockerfile.cpu
Dockerfile.cpu
+0
-69
No files found.
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
View file @
fcfc474d
...
...
@@ -10,15 +10,24 @@ set -x
set
-o
pipefail
#######################################
# Detect the GPUs on this host and publish the result as global variables.
# NVIDIA hosts are probed with nvidia-smi, AMD hosts with amd-smi; when both
# tools are installed nvidia-smi wins.
# Globals:
#   gpu_count (written) - number of GPUs reported by the vendor tool
#   gpu_type  (written) - second whitespace-separated field of the GPU name
#                         line(s) printed by the vendor tool
# Arguments:
#   None
# Outputs:
#   Status messages on stdout.
# Returns:
#   Exits the whole script with status 1 when no GPU is detected.
#######################################
check_gpus() {
  # Start from known values so a host with neither vendor tool installed is
  # reported as "no GPU" instead of relying on an unset variable.
  declare -g gpu_count=0
  declare -g gpu_type=""

  # `command -v` prints the resolved path; silence it so only the status
  # messages below reach stdout.
  if command -v nvidia-smi >/dev/null 2>&1; then
    # Assign separately from `declare` so the pipeline status is visible:
    # with `pipefail` enabled a failing nvidia-smi counts as zero GPUs rather
    # than having its error output counted as a GPU line.
    gpu_count=$(nvidia-smi --list-gpus | wc -l) || gpu_count=0
  elif command -v amd-smi >/dev/null 2>&1; then
    # grep -c exits non-zero when nothing matches; treat that as zero GPUs.
    gpu_count=$(amd-smi list | grep -c 'GPU') || gpu_count=0
  fi

  if [[ "${gpu_count}" -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  # The GPU type is informational; a failed query leaves whatever was printed
  # (possibly nothing) and must not abort the script.
  if command -v nvidia-smi >/dev/null 2>&1; then
    gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader \
      | awk '{print $2}') || true
  elif command -v amd-smi >/dev/null 2>&1; then
    gpu_type=$(amd-smi static -g 0 -a \
      | awk '/MARKET_NAME/ {print $2}') || true
  fi
  echo "GPU type is $gpu_type"
}
...
...
@@ -90,9 +99,15 @@ kill_gpu_processes() {
# wait until GPU memory usage smaller than 1GB
if
command
-v
nvidia-smi
;
then
while
[
"
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits |
head
-n
1
)
"
-ge
1000
]
;
do
sleep
1
done
elif
command
-v
amd-smi
;
then
while
[
"
$(
amd-smi metric
-g
0 |
grep
'USED_VRAM'
|
awk
'{print $2}'
)
"
-ge
1000
]
;
do
sleep
1
done
fi
# remove vllm config file
rm
-rf
~/.config/vllm
...
...
.buildkite/nightly-benchmarks/tests/serving-tests.json
View file @
fcfc474d
...
...
@@ -64,9 +64,11 @@
"disable_log_requests"
:
""
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
"speculative_model"
:
"turboderp/Qwama-0.5B-Instruct"
,
"speculative_config"
:
{
"model"
:
"turboderp/Qwama-0.5B-Instruct"
,
"num_speculative_tokens"
:
4
,
"speculative_draft_tensor_parallel_size"
:
1
"draft_tensor_parallel_size"
:
1
}
},
"client_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3.1-70B-Instruct"
,
...
...
.buildkite/release-pipeline.yaml
View file @
fcfc474d
...
...
@@ -3,10 +3,10 @@ steps:
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.4.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.4.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/upload-wheels.sh"
-
"
bash
.buildkite/
scripts/
upload-wheels.sh"
env
:
DOCKER_BUILDKIT
:
"
1"
...
...
@@ -14,10 +14,10 @@ steps:
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.1.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.1.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/upload-wheels.sh"
-
"
bash
.buildkite/
scripts/
upload-wheels.sh"
env
:
DOCKER_BUILDKIT
:
"
1"
...
...
@@ -31,10 +31,10 @@ steps:
agents
:
queue
:
cpu_queue_postmerge
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=11.8.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=11.8.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/upload-wheels.sh"
-
"
bash
.buildkite/
scripts/
upload-wheels.sh"
env
:
DOCKER_BUILDKIT
:
"
1"
...
...
@@ -48,7 +48,7 @@ steps:
queue
:
cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.4.0
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
--target
vllm-openai
--progress
plain
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.4.0
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
label
:
"
Build
and
publish
TPU
release
image"
...
...
@@ -57,7 +57,7 @@ steps:
agents
:
queue
:
tpu_queue_postmerge
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--tag
vllm/vllm-tpu:nightly
--tag
vllm/vllm-tpu:$BUILDKITE_COMMIT
--progress
plain
-f
Dockerfile.tpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--tag
vllm/vllm-tpu:nightly
--tag
vllm/vllm-tpu:$BUILDKITE_COMMIT
--progress
plain
-f
docker/
Dockerfile.tpu
."
-
"
docker
push
vllm/vllm-tpu:nightly"
-
"
docker
push
vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins
:
...
...
@@ -82,7 +82,7 @@ steps:
queue
:
cpu_queue_postmerge
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
--progress
plain
-
f
Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
--progress
plain
-
-target
vllm-openai
-f
docker/
Dockerfile.cpu
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)"
env
:
DOCKER_BUILDKIT
:
"
1"
.buildkite/run-amd-test.sh
→
.buildkite/
scripts/hardware_ci/
run-amd-test.sh
View file @
fcfc474d
...
...
@@ -105,19 +105,33 @@ fi
if
[[
$commands
==
*
" entrypoints/openai "
*
]]
;
then
commands
=
${
commands
//
" entrypoints/openai "
/
" entrypoints/openai
\
--ignore=entrypoints/openai/test_audio.py
\
--ignore=entrypoints/openai/test_chat.py
\
--ignore=entrypoints/openai/test_shutdown.py
\
--ignore=entrypoints/openai/test_completion.py
\
--ignore=entrypoints/openai/test_sleep.py
\
--ignore=entrypoints/openai/test_models.py
\
--ignore=entrypoints/openai/test_lora_adapters.py
\
--ignore=entrypoints/openai/test_return_tokens_as_ids.py
\
--ignore=entrypoints/openai/test_root_path.py
\
--ignore=entrypoints/openai/test_tokenization.py
\
--ignore=entrypoints/openai/test_prompt_validation.py "
}
fi
#ignore certain Entrypoints/llm tests
if
[[
$commands
==
*
" && pytest -v -s entrypoints/llm/test_guided_generate.py"
*
]]
;
then
commands
=
${
commands
//
" && pytest -v -s entrypoints/llm/test_guided_generate.py"
/
" "
}
if
[[
$commands
==
*
" entrypoints/llm "
*
]]
;
then
commands
=
${
commands
//
" entrypoints/llm "
/
" entrypoints/llm
\
--ignore=entrypoints/llm/test_chat.py
\
--ignore=entrypoints/llm/test_accuracy.py
\
--ignore=entrypoints/llm/test_init.py
\
--ignore=entrypoints/llm/test_generate_multiple_loras.py
\
--ignore=entrypoints/llm/test_prompt_validation.py "
}
fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
...
...
@@ -134,9 +148,10 @@ if [[ $commands == *"--shard-id="* ]]; then
# assign shard-id for each shard
commands_gpu
=
${
commands
//
"--shard-id= "
/
"--shard-id=
${
GPU
}
"
}
echo
"Shard
${
GPU
}
commands:
$commands_gpu
"
echo
"Render devices:
$BUILDKITE_AGENT_META_DATA_RENDER_DEVICES
"
docker run
\
--device
/dev/kfd
--device
/dev/dri
\
--network
host
\
--device
/dev/kfd
$BUILDKITE_AGENT_META_DATA_RENDER_DEVICES
\
--network
=
host
\
--shm-size
=
16gb
\
--rm
\
-e
HIP_VISIBLE_DEVICES
=
"
${
GPU
}
"
\
...
...
@@ -163,9 +178,10 @@ if [[ $commands == *"--shard-id="* ]]; then
fi
done
else
echo
"Render devices:
$BUILDKITE_AGENT_META_DATA_RENDER_DEVICES
"
docker run
\
--device
/dev/kfd
--device
/dev/dri
\
--network
host
\
--device
/dev/kfd
$BUILDKITE_AGENT_META_DATA_RENDER_DEVICES
\
--network
=
host
\
--shm-size
=
16gb
\
--rm
\
-e
HIP_VISIBLE_DEVICES
=
0
\
...
...
.buildkite/run-cpu-test-ppc64le.sh
→
.buildkite/
scripts/hardware_ci/
run-cpu-test-ppc64le.sh
View file @
fcfc474d
...
...
@@ -10,5 +10,5 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
docker build
-t
cpu-test
-f
Dockerfile.ppc64le
.
docker build
-t
cpu-test
-f
docker/
Dockerfile.ppc64le
.
.buildkite/run-cpu-test.sh
→
.buildkite/
scripts/hardware_ci/
run-cpu-test.sh
View file @
fcfc474d
...
...
@@ -8,15 +8,19 @@ set -ex
CORE_RANGE
=
${
CORE_RANGE
:-
48
-95
}
NUMA_NODE
=
${
NUMA_NODE
:-
1
}
# Try building the docker image
numactl
-C
"
$CORE_RANGE
"
-N
"
$NUMA_NODE
"
docker build
-t
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-f
Dockerfile.cpu
.
numactl
-C
"
$CORE_RANGE
"
-N
"
$NUMA_NODE
"
docker build
--build-arg
VLLM_CPU_DISABLE_AVX512
=
"true"
-t
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-avx2
-f
Dockerfile.cpu
.
# Setup cleanup
remove_docker_container
()
{
set
-e
;
docker
rm
-f
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-
"
$NUMA_NODE
"
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-avx2-
"
$NUMA_NODE
"
||
true
;
}
remove_docker_container
()
{
set
-e
;
docker
rm
-f
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-
"
$NUMA_NODE
"
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-avx2-
"
$NUMA_NODE
"
||
true
;
docker image
rm
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-avx2
||
true
;
}
trap
remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl
-C
"
$CORE_RANGE
"
-N
"
$NUMA_NODE
"
docker build
--tag
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
--target
vllm-test
-f
docker/Dockerfile.cpu
.
numactl
-C
"
$CORE_RANGE
"
-N
"
$NUMA_NODE
"
docker build
--build-arg
VLLM_CPU_DISABLE_AVX512
=
"true"
--tag
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-avx2
--target
vllm-test
-f
docker/Dockerfile.cpu
.
# Run the image, setting --shm-size=4g for tensor parallel.
docker run
-itd
--entrypoint
/bin/bash
-v
~/.cache/huggingface:/root/.cache/huggingface
--cpuset-cpus
=
"
$CORE_RANGE
"
\
--cpuset-mems
=
"
$NUMA_NODE
"
--privileged
=
true
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
4
--shm-size
=
4g
--name
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-
"
$NUMA_NODE
"
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
...
...
@@ -36,8 +40,8 @@ function cpu_tests() {
# Run basic model test
docker
exec
cpu-test-
"
$BUILDKITE_BUILD_NUMBER
"
-
"
$NUMA_NODE
"
bash
-c
"
set -e
p
ip install -r vllm/requirements/test.txt
p
ip install -r vllm/requirements/cpu.txt
p
ytest -v -s tests/kernels/test_cache.py -m cpu_model
p
ytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
...
...
.buildkite/run-gh200-test.sh
→
.buildkite/
scripts/hardware_ci/
run-gh200-test.sh
View file @
fcfc474d
...
...
@@ -9,6 +9,7 @@ python3 use_existing_torch.py
# Try building the docker image
DOCKER_BUILDKIT
=
1 docker build
.
\
--file
docker/Dockerfile
\
--target
vllm-openai
\
--platform
"linux/arm64"
\
-t
gh200-test
\
...
...
.buildkite/run-hpu-test.sh
→
.buildkite/
scripts/hardware_ci/
run-hpu-test.sh
View file @
fcfc474d
...
...
@@ -5,7 +5,7 @@
set
-ex
# Try building the docker image
docker build
-t
hpu-test-env
-f
Dockerfile.hpu
.
docker build
-t
hpu-test-env
-f
docker/
Dockerfile.hpu
.
# Setup cleanup
# certain versions of HPU software stack have a bug that can
...
...
.buildkite/run-neuron-test.sh
→
.buildkite/
scripts/hardware_ci/
run-neuron-test.sh
View file @
fcfc474d
...
...
@@ -35,7 +35,7 @@ else
date
"+%s"
>
/tmp/neuron-docker-build-timestamp
fi
docker build
-t
"
${
image_name
}
"
-f
Dockerfile.neuron
.
docker build
-t
"
${
image_name
}
"
-f
docker/
Dockerfile.neuron
.
# Setup cleanup
remove_docker_container
()
{
...
...
.buildkite/run-tpu-v1-test.sh
→
.buildkite/
scripts/hardware_ci/
run-tpu-v1-test.sh
View file @
fcfc474d
#!/bin/bash
set
-e
set
-
xu
e
# Build the docker image.
docker build
-f
Dockerfile.tpu
-t
vllm-tpu
.
docker build
-f
docker/
Dockerfile.tpu
-t
vllm-tpu
.
# Set up cleanup.
remove_docker_container
()
{
docker
rm
-f
tpu-test
||
true
;
}
...
...
@@ -21,8 +21,10 @@ docker run --privileged --net host --shm-size=16G -it \
&& python3 -m pip install lm_eval[api]==0.4.4
\
&& export VLLM_USE_V1=1
\
&& export VLLM_XLA_CHECK_RECOMPILATION=1
\
&& echo TEST_0
\
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py
\
&& echo TEST_1
\
&& pytest /workspace/vllm/tests/tpu/test_compilation.py
\
&& pytest
-v -s
/workspace/vllm/tests/tpu/test_compilation.py
\
&& echo TEST_2
\
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py
\
&& echo TEST_3
\
...
...
@@ -30,9 +32,16 @@ docker run --privileged --net host --shm-size=16G -it \
&& echo TEST_4
\
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py
\
&& echo TEST_5
\
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"
\
&& python3 /workspace/vllm/examples/offline_inference/tpu.py
\
&& echo TEST_6
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py
\
&& echo TEST_7
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py
\
&& echo TEST_8
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py
\
&& echo TEST_9
\
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
\
# TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
.buildkite/run-xpu-test.sh
→
.buildkite/
scripts/hardware_ci/
run-xpu-test.sh
View file @
fcfc474d
...
...
@@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name
=
"xpu_
${
BUILDKITE_COMMIT
}
_
$(
tr
-dc
A-Za-z0-9 < /dev/urandom |
head
-c
10
;
echo
)
"
# Try building the docker image
docker build
-t
${
image_name
}
-f
Dockerfile.xpu
.
docker build
-t
${
image_name
}
-f
docker/
Dockerfile.xpu
.
# Setup cleanup
remove_docker_container
()
{
...
...
.buildkite/run-benchmarks.sh
→
.buildkite/
scripts/
run-benchmarks.sh
View file @
fcfc474d
File moved
.buildkite/run-multi-node-test.sh
→
.buildkite/
scripts/
run-multi-node-test.sh
View file @
fcfc474d
...
...
@@ -3,7 +3,7 @@
set
-euox
pipefail
if
[[
$#
-lt
4
]]
;
then
echo
"Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
echo
"Usage: .buildkite/
scripts/
run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit
1
fi
...
...
.buildkite/upload-wheels.sh
→
.buildkite/
scripts/
upload-wheels.sh
View file @
fcfc474d
File moved
.buildkite/test-pipeline.yaml
View file @
fcfc474d
...
...
@@ -104,7 +104,7 @@ steps:
-
label
:
Entrypoints Test
# 40min
working_dir
:
"
/vllm-workspace/tests"
fast_check
:
true
mirror_hardwares
:
[
amd
]
#
mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/
-
tests/entrypoints/llm
...
...
@@ -135,12 +135,14 @@ steps:
-
examples/offline_inference/rlhf.py
-
examples/offline_inference/rlhf_colocate.py
-
tests/examples/offline_inference/data_parallel.py
-
tests/v1/test_async_llm_dp.py
commands
:
# test with tp=2 and external_dp=2
-
VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-
torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with internal dp
-
python3 ../examples/offline_inference/data_parallel.py
-
TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-
pytest -v -s distributed/test_utils.py
-
pytest -v -s compile/test_basic_correctness.py
-
pytest -v -s distributed/test_pynccl.py
...
...
@@ -148,11 +150,12 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
-
pushd ../examples/offline_inference
-
VLLM_ENABLE_V1_MULTIPROCESSING=0
python3 rlhf.py
-
VLLM_ENABLE_V1_MULTIPROCESSING=0
RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-
python3 rlhf.py
-
RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-
popd
-
label
:
Metrics, Tracing Test
# 10min
mirror_hardwares
:
[
amd
]
num_gpus
:
2
source_file_dependencies
:
-
vllm/
...
...
@@ -171,7 +174,7 @@ steps:
##### 1 GPU test #####
-
label
:
Regression Test
# 5min
mirror_hardwares
:
[
amd
]
#
mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/
-
tests/test_regression
...
...
@@ -202,7 +205,6 @@ steps:
commands
:
# split the test to avoid interference
-
pytest -v -s v1/core
-
pytest -v -s v1/entrypoints
-
pytest -v -s v1/engine
-
pytest -v -s v1/entrypoints
-
pytest -v -s v1/sample
...
...
@@ -283,11 +285,11 @@ steps:
-
pytest -v -s spec_decode/e2e/test_eagle_correctness.py
-
label
:
LoRA Test %N
# 15min each
mirror_hardwares
:
[
amd
]
#
mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/lora
-
tests/lora
command
:
pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
--ignore=lora/test_long_context.py
--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
--ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
command
:
pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism
:
4
-
label
:
PyTorch Fullgraph Smoke Test
# 9min
...
...
@@ -309,7 +311,7 @@ steps:
-
pytest -v -s compile/test_full_graph.py
-
label
:
Kernels Test %N
# 1h each
mirror_hardwares
:
[
amd
]
#
mirror_hardwares: [amd]
source_file_dependencies
:
-
csrc/
-
vllm/attention
...
...
@@ -319,7 +321,7 @@ steps:
parallelism
:
4
-
label
:
Tensorizer Test
# 11min
mirror_hardwares
:
[
amd
]
#
mirror_hardwares: [amd]
soft_fail
:
true
source_file_dependencies
:
-
vllm/model_executor/model_loader
...
...
@@ -335,7 +337,7 @@ steps:
source_file_dependencies
:
-
benchmarks/
commands
:
-
bash run-benchmarks.sh
-
bash
scripts/
run-benchmarks.sh
-
label
:
Quantization Test
# 33min
source_file_dependencies
:
...
...
@@ -370,7 +372,7 @@ steps:
-
label
:
OpenAI-Compatible Tool Use
# 20 min
fast_check
:
false
mirror_hardwares
:
[
amd
]
#
mirror_hardwares: [ amd ]
source_file_dependencies
:
-
vllm/
-
tests/tool_use
...
...
@@ -429,6 +431,7 @@ steps:
-
pytest -v -s models/encoder_decoder/audio_language -m core_model
-
pytest -v -s models/encoder_decoder/language -m core_model
-
pytest -v -s models/encoder_decoder/vision_language -m core_model
-
pytest -v -s models/decoder_only/vision_language/test_interleaved.py
-
label
:
Multi-Modal Models Test (Extended)
1
# 48m
optional
:
true
...
...
@@ -461,6 +464,7 @@ steps:
# This test is used only in PR development phase to test individual models and should never run on main
-
label
:
Custom Models Test
mirror_hardwares
:
[
amd
]
optional
:
true
commands
:
-
echo 'Testing custom models...'
...
...
@@ -472,6 +476,7 @@ steps:
##### multi gpus test #####
-
label
:
Distributed Comm Ops Test
# 7min
mirror_hardwares
:
[
amd
]
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
...
...
@@ -514,8 +519,11 @@ steps:
-
vllm/worker/worker.py
-
vllm/worker/model_runner.py
-
entrypoints/llm/test_collective_rpc.py
-
tests/v1/test_async_llm_dp.py
-
vllm/v1/engine/
commands
:
-
VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
-
TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-
pytest -v -s entrypoints/llm/test_collective_rpc.py
-
pytest -v -s ./compile/test_basic_correctness.py
-
pytest -v -s ./compile/test_wrapper.py
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
...
...
@@ -592,14 +600,10 @@ steps:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
-
pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
-
pytest -v -s -x lora/test_chatglm3_tp.py
-
pytest -v -s -x lora/test_llama_tp.py
-
pytest -v -s -x lora/test_minicpmv_tp.py
-
pytest -v -s -x lora/test_transfomers_model.py
-
label
:
Weight Loading Multiple GPU Test
# 33min
...
...
.github/mergify.yml
View file @
fcfc474d
...
...
@@ -19,7 +19,7 @@ pull_request_rules:
-
files~=\.buildkite/
-
files~=^cmake/
-
files=CMakeLists.txt
-
files~=^Dockerfile
-
files~=^
docker/
Dockerfile
-
files~=^requirements.*\.txt
-
files=setup.py
actions
:
...
...
@@ -88,6 +88,36 @@ pull_request_rules:
add
:
-
v1
-
name
:
label-tpu
description
:
Automatically apply tpu label
# Keep this list in sync with `label-tpu-remove` conditions
conditions
:
-
or
:
-
files~=tpu.py
-
files~=_tpu
-
files~=tpu_
-
files~=/tpu/
-
files~=pallas
actions
:
label
:
add
:
-
tpu
-
name
:
label-tpu-remove
description
:
Automatically remove tpu label
# Keep this list in sync with `label-tpu` conditions
conditions
:
-
and
:
-
-files~=tpu.py
-
-files~=_tpu
-
-files~=tpu_
-
-files~=/tpu/
-
-files~=pallas
actions
:
label
:
remove
:
-
tpu
-
name
:
ping author on conflicts and add 'needs-rebase' label
conditions
:
-
conflict
...
...
.github/workflows/lint-and-deploy.yaml
View file @
fcfc474d
...
...
@@ -50,7 +50,7 @@ jobs:
uses
:
helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3
# v1.12.0
-
name
:
Build the Docker image vllm cpu
run
:
docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
run
:
docker buildx build -f
docker/
Dockerfile.cpu -t vllm-cpu-env .
-
name
:
Configuration of docker images, network and namespace for the kind cluster
run
:
|
...
...
.pre-commit-config.yaml
View file @
fcfc474d
default_install_hook_types
:
-
pre-commit
-
commit-msg
default_stages
:
-
pre-commit
# Run locally
-
manual
# Run in CI
...
...
CMakeLists.txt
View file @
fcfc474d
...
...
@@ -37,8 +37,8 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set
(
CUDA_SUPPORTED_ARCHS
"7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0"
)
# Supported
hcu
architectures.
set
(
HIP_SUPPORTED_ARCHS
"gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx
906;gfx926
;gfx928;gfx936"
)
# Supported
AMD GPU
architectures.
set
(
HIP_SUPPORTED_ARCHS
"gfx906;gfx908;gfx90a;gfx942;
gfx950;
gfx1030;gfx1100;gfx1101;gfx
1200;gfx1201
;gfx928;gfx936"
)
#
# Supported/expected torch versions for CUDA/ROCm.
...
...
@@ -48,7 +48,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx
#
# Note: the CUDA torch version is derived from pyproject.toml and various
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
# versions are derived from
docker/
Dockerfile.rocm
#
set
(
TORCH_SUPPORTED_VERSION_CUDA
"2.6.0"
)
set
(
TORCH_SUPPORTED_VERSION_ROCM
"2.6.0"
)
...
...
@@ -242,8 +242,12 @@ set(VLLM_EXT_SRC
"csrc/opt/activation_kernels_opt.cu"
"csrc/attention/attention_kernels_opt.cu"
"csrc/attention/attention_kernels_opt_tc.cu"
"csrc/attention/attention_with_mask_kernels.cu"
"csrc/attention/attention_with_mask_kernels_opt.cu"
"csrc/attention/attention_with_mask_kernels_opt_tc.cu"
"csrc/opt/layernorm_kernels_opt.cu"
# "csrc/layernorm_quant_kernels.cu"
"csrc/cuda_view.cu"
# "csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
# "csrc/quantization/fp8/common.cu"
...
...
@@ -251,10 +255,8 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp"
"csrc/attention/attention_with_mask_kernels.cu"
"csrc/attention/attention_with_mask_kernels_opt.cu"
"csrc/attention/attention_with_mask_kernels_opt_tc.cu"
)
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp"
)
if
(
VLLM_GPU_LANG STREQUAL
"CUDA"
)
SET
(
CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL
"Enable only the header library"
)
...
...
@@ -295,7 +297,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
...
...
@@ -474,6 +475,33 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set
(
FP4_ARCHS
)
endif
()
#
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# to compile MoE kernels that use its output.
cuda_archs_loose_intersection
(
SCALED_MM_ARCHS
"9.0a;"
"
${
CUDA_ARCHS
}
"
)
if
(
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS
)
set
(
SRCS
"csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu"
)
set_gencode_flags_for_srcs
(
SRCS
"
${
SRCS
}
"
CUDA_ARCHS
"
${
SCALED_MM_ARCHS
}
"
)
list
(
APPEND VLLM_EXT_SRC
"
${
SRCS
}
"
)
list
(
APPEND VLLM_GPU_FLAGS
"-DENABLE_CUTLASS_MOE_SM90=1"
)
message
(
STATUS
"Building grouped_mm_c3x for archs:
${
SCALED_MM_ARCHS
}
"
)
else
()
if
(
NOT
${
CMAKE_CUDA_COMPILER_VERSION
}
VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS
)
message
(
STATUS
"Not building grouped_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
"if you intend on running FP8 quantized MoE models on Hopper."
)
else
()
message
(
STATUS
"Not building grouped_mm_c3x as no compatible archs found "
"in CUDA target architectures"
)
endif
()
endif
()
#
# Machete kernels
...
...
@@ -546,11 +574,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# if CUDA endif
endif
()
if
(
VLLM_GPU_LANG STREQUAL
"HIP"
)
list
(
APPEND VLLM_EXT_SRC
"csrc/custom_all_reduce.cu"
)
endif
()
message
(
STATUS
"Enabling C extension."
)
define_gpu_extension_target
(
_C
...
...
Dockerfile.cpu
deleted
100644 → 0
View file @
bb94d2e5
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
# Stage "cpu-test-1": OS packages, Python runtime libraries and build requirements.
FROM ubuntu:22.04 AS cpu-test-1
# Route C++ compilation through ccache; the cache directory set here is the one
# mounted as a build cache by the wheel-build step further down.
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
# Install the toolchain and runtime libraries, then register gcc-12/g++-12 as
# the gcc/g++ alternatives.
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp==2025.0.1
# Preload tcmalloc and the Intel OpenMP runtime for processes started in the image.
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
# Set the core-dump size limit to 0 for shells that read ~/.bashrc.
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
# Extra pip index; defaults to the PyTorch CPU wheel index and can be overridden
# with --build-arg PIP_EXTRA_INDEX_URL=...
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
# Install build requirements; the requirements file is bind-mounted from the
# build context rather than copied into the image.
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip upgrade pip && \
pip install -r requirements/build.txt
# Stage "build": install CPU requirements, copy the source tree and build/install the vLLM wheel.
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
# When GIT_REPO_CHECK is set to anything other than 0, run tools/check_repo.sh
# with the .git directory bind-mounted.
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
# Build the wheel with VLLM_TARGET_DEVICE=cpu, install it, then remove dist/.
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
pip install dist/*.whl && \
rm -rf dist
WORKDIR /workspace/
# Symlink tests, examples and benchmarks from the source tree into /workspace.
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -e tests/vllm_test_utils
# Containers run the vllm.entrypoints.openai.api_server module by default.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
Prev
1
2
3
4
5
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment