Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import sys import sys
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import os import os
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path from pathlib import Path
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
LM eval harness on model to compare vs HF baseline computed offline. LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml Configs are found in configs/$MODEL.yaml
......
...@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do ...@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do
### Visualizing the results ### Visualizing the results
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running. If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
import os import os
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse import argparse
import json import json
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient from lmdeploy.serve.openai.api_client import APIClient
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime import datetime
import json import json
......
steps: steps:
- label: "Build wheel - CUDA 12.8" - label: "Build wheel - CUDA 12.8"
id: build-wheel-cuda-12-8
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
...@@ -11,6 +12,7 @@ steps: ...@@ -11,6 +12,7 @@ steps:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6" - label: "Build wheel - CUDA 12.6"
id: build-wheel-cuda-12-6
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
...@@ -28,6 +30,7 @@ steps: ...@@ -28,6 +30,7 @@ steps:
- label: "Build wheel - CUDA 11.8" - label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel # depends_on: block-build-cu118-wheel
id: build-wheel-cuda-11-8
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
...@@ -44,6 +47,7 @@ steps: ...@@ -44,6 +47,7 @@ steps:
- label: "Build release image" - label: "Build release image"
depends_on: block-release-image-build depends_on: block-release-image-build
id: build-release-image
agents: agents:
queue: cpu_queue_postmerge queue: cpu_queue_postmerge
commands: commands:
...@@ -51,6 +55,18 @@ steps: ...@@ -51,6 +55,18 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- build-release-image
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image" - label: "Build and publish TPU release image"
depends_on: ~ depends_on: ~
if: build.env("NIGHTLY") == "1" if: build.env("NIGHTLY") == "1"
...@@ -70,9 +86,10 @@ steps: ...@@ -70,9 +86,10 @@ steps:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- input: "Provide Release version here" - input: "Provide Release version here"
id: input-release-version
fields: fields:
- text: "What is the release version?" - text: "What is the release version?"
key: "release-version" key: release-version
- block: "Build CPU release image" - block: "Build CPU release image"
key: block-cpu-release-image-build key: block-cpu-release-image-build
......
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
\ No newline at end of file
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 ci.log"
exit 1
fi
INPUT_FILE="$1"
# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
...@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then ...@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi fi
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests #ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \ commands="${commands} \
......
...@@ -7,6 +7,7 @@ set -ex ...@@ -7,6 +7,7 @@ set -ex
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
if [[ -n "$container_id" ]]; then if [[ -n "$container_id" ]]; then
podman stop --all -t0
podman rm -f "$container_id" || true podman rm -f "$container_id" || true
fi fi
podman system prune -f podman system prune -f
...@@ -37,7 +38,7 @@ function cpu_tests() { ...@@ -37,7 +38,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]" pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
} }
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
......
...@@ -6,72 +6,70 @@ set -ex ...@@ -6,72 +6,70 @@ set -ex
# allow to bind to different cores # allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95} CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1} NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup # Setup cleanup
remove_docker_container() { remove_docker_container() {
set -e; set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Try building the docker image # Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() { function cpu_tests() {
set -e set -e
export NUMA_NODE=$2 export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# offline inference # offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test # Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -v -s tests/kernels/test_cache.py -m cpu_model pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model pytest -v -s tests/models/language/generation -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/multimodal/generation \
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model --ignore=tests/models/multimodal/generation/test_mllama.py \
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" --ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test # Run compressed-tensor test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test # Run AWQ test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ VLLM_USE_V1=0 pytest -s -v \
tests/quantization/test_ipex_quant.py" tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test # Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v -k cpu_model \ pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py" tests/basic_correctness/test_chunked_prefill.py"
# online serving # online serving
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \ python3 benchmarks/benchmark_serving.py \
...@@ -83,7 +81,7 @@ function cpu_tests() { ...@@ -83,7 +81,7 @@ function cpu_tests() {
--tokenizer facebook/opt-125m" --tokenizer facebook/opt-125m"
# Run multi-lora tests # Run multi-lora tests
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e set -e
pytest -s -v \ pytest -s -v \
tests/lora/test_qwen2vl.py" tests/lora/test_qwen2vl.py"
...@@ -91,4 +89,4 @@ function cpu_tests() { ...@@ -91,4 +89,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins. # All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER" timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
...@@ -2,102 +2,184 @@ ...@@ -2,102 +2,184 @@
set -xu set -xu
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup. remove_docker_container() {
remove_docker_container() { docker rm -f tpu-test || true; } docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run. # Remove the container that might not be cleaned up in the previous run.
remove_docker_container remove_docker_container
# Build the docker image.
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
}
cleanup_docker
# For HF_TOKEN. # For HF_TOKEN.
source /etc/environment source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \ docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ vllm-tpu /bin/bash -c '
&& python3 -m pip install pytest pytest-asyncio tpu-info \ set -e # Exit immediately if a command exits with a non-zero status.
&& python3 -m pip install lm_eval[api]==0.4.4 \ set -u # Treat unset variables as an error.
&& export VLLM_XLA_CACHE_PATH= \
&& export VLLM_USE_V1=1 \ echo "--- Starting script inside Docker container ---"
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
&& echo HARDWARE \ # Create results directory
&& tpu-info \ RESULTS_DIR=$(mktemp -d)
&& { \ # If mktemp fails, set -e will cause the script to exit.
echo TEST_0: Running test_perf.py; \ echo "Results will be stored in: $RESULTS_DIR"
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
echo TEST_0_EXIT_CODE: \$?; \ # Install dependencies
} & \ echo "--- Installing Python dependencies ---"
{ \ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
echo TEST_1: Running test_compilation.py; \ && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \ && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
echo TEST_1_EXIT_CODE: \$?; \ echo "--- Python dependencies installed ---"
} & \ export VLLM_USE_V1=1
{ \ export VLLM_XLA_CHECK_RECOMPILATION=1
echo TEST_2: Running test_basic.py; \ export VLLM_XLA_CACHE_PATH=
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \ echo "Using VLLM V1"
echo TEST_2_EXIT_CODE: \$?; \
} & \ echo "--- Hardware Information ---"
{ \ tpu-info
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ echo "--- Starting Tests ---"
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ set +e
echo TEST_3_EXIT_CODE: \$?; \ overall_script_exit_code=0
} & \
{ \ # --- Test Definitions ---
echo TEST_4: Running test_quantization_accuracy.py; \ # If a test fails, this function will print logs and will not cause the main script to exit.
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \ run_test() {
echo TEST_4_EXIT_CODE: \$?; \ local test_num=$1
} & \ local test_name=$2
{ \ local test_command=$3
echo TEST_5: Running examples/offline_inference/tpu.py; \ local log_file="$RESULTS_DIR/test_${test_num}.log"
python3 /workspace/vllm/examples/offline_inference/tpu.py; \ local actual_exit_code
echo TEST_5_EXIT_CODE: \$?; \
} & \ echo "--- TEST_$test_num: Running $test_name ---"
{ \
echo TEST_6: Running test_tpu_model_runner.py; \ # Execute the test command.
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \ eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
echo TEST_6_EXIT_CODE: \$?; \ actual_exit_code=$?
} & \
{ \ echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
echo TEST_7: Running test_sampler.py; \ echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
echo TEST_7_EXIT_CODE: \$?; \ if [ "$actual_exit_code" -ne 0 ]; then
} & \ echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
{ \ echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
echo TEST_8: Running test_topk_topp_sampler.py; \ if [ -f "$log_file" ]; then
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \ cat "$log_file" >&2
echo TEST_8_EXIT_CODE: \$?; \ else
} & \ echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
{ \ fi
echo TEST_9: Running test_multimodal.py; \ echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \ return "$actual_exit_code" # Return the failure code
echo TEST_9_EXIT_CODE: \$?; \ else
} & \ echo "TEST_$test_num ($test_name) PASSED."
{ \ return 0 # Return success
echo TEST_10: Running test_pallas.py; \ fi
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \ }
echo TEST_10_EXIT_CODE: \$?; \
} & \ # Helper function to call run_test and update the overall script exit code
{ \ run_and_track_test() {
echo TEST_11: Running test_struct_output_generate.py; \ local test_num_arg="$1"
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \ local test_name_arg="$2"
echo TEST_11_EXIT_CODE: \$?; \ local test_command_arg="$3"
} & \
{ \ # Run the test
echo TEST_12: Running test_moe_pallas.py; \ run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \ local test_specific_exit_code=$?
echo TEST_12_EXIT_CODE: \$?; \
} & \ # If the test failed, set the overall script exit code to 1
# Disable the TPU LoRA tests until the feature is activated if [ "$test_specific_exit_code" -ne 0 ]; then
# & { \ # No need for extra echo here, run_test already logged the failure.
# echo TEST_13: Running test_moe_pallas.py; \ overall_script_exit_code=1
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \ fi
# echo TEST_13_EXIT_CODE: \$?; \ }
# } & \
wait \ # --- Actual Test Execution ---
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \ run_and_track_test 0 "test_perf.py" \
" "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
run_and_track_test 1 "test_compilation.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
run_and_track_test 6 "test_tpu_model_runner.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
run_and_track_test 7 "test_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
run_and_track_test 8 "test_topk_topp_sampler.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
run_and_track_test 9 "test_multimodal.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
else
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
fi
exit "$overall_script_exit_code"
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
# Capture the exit code of the docker run command
DOCKER_RUN_EXIT_CODE=$?
# The trap will run for cleanup.
# Exit the main script with the Docker run command's exit code.
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
exit "$DOCKER_RUN_EXIT_CODE"
else
echo "Docker run command completed successfully."
exit 0
fi
# TODO: This test fails because it uses RANDOM_SEED sampling # TODO: This test fails because it uses RANDOM_SEED sampling
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 path/to/test.py::test_name"
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
exit 1
fi
TEST=$1
COUNT=1
while pytest -sv "$TEST"; do
COUNT=$((COUNT + 1))
echo "RUN NUMBER ${COUNT}"
done
#!/bin/bash
set -euo pipefail
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment