Commit 41199996 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.12.0' into v0.12.0-dev

parents 31021d81 4fd9d6a8
[
{
"test_name": "throughput_llama8B_tp1",
"test_name": "throughput_llama8B_tp2",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
......
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": ""
}
},
{
"test_name": "throughput_llama70B_tp4",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": ""
}
},
{
"test_name": "throughput_mixtral8x7B_tp2",
"environment_variables": {
"PT_HPU_LAZY_MODE": 1,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
"VLLM_CONTIGUOUS_PA": 1,
"VLLM_DEFRAG": 1
},
"parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"tensor_parallel_size": 2,
"load_format": "dummy",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 1000,
"backend": "vllm",
"max-model-len": 2048,
"max-num-seqs": 512,
"async-scheduling": ""
}
}
]
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed
[tool.ruff]
line-length = 88
[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
[tool.ruff.lint]
select = [
# pycodestyle
"E",
# Pyflakes
"F",
# pyupgrade
"UP",
# flake8-bugbear
"B",
# flake8-simplify
"SIM",
# isort
"I",
# flake8-logging-format
"G",
]
ignore = [
# star imports
"F405", "F403",
# lambda expression assignment
"E731",
# Loop control variable not used within loop body
"B007",
# f-string format
"UP032",
# Can remove once 3.10+ is the minimum Python version
"UP007",
]
[tool.ruff.format]
docstring-code-format = true
steps:
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
# aarch64 + CUDA builds
- label: "Build arm64 wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
......@@ -8,53 +8,55 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.8"
# aarch64 build
- label: "Build arm64 CPU wheel"
depends_on: ~
id: build-wheel-cuda-12-8
id: build-wheel-arm64-cpu
agents:
queue: cpu_queue_postmerge
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-6
id: build-wheel-cuda-12-9
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
- label: "Build wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-cuda-12-9
id: build-wheel-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# Build release images (12.9)
- label: "Build release image (x86)"
depends_on: ~
id: build-release-image-x86
......@@ -62,13 +64,12 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
......@@ -76,7 +77,7 @@ steps:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
......@@ -95,31 +96,12 @@ steps:
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
agents:
queue: tpu_queue_postmerge
commands:
- "yes | docker system prune -a"
- "git fetch --all"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
- input: "Provide Release version here"
id: input-release-version
fields:
......@@ -136,12 +118,28 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- block: "Build arm64 CPU release image"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build and publish arm64 CPU release image"
depends_on: block-arm64-cpu-release-image-build
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
......@@ -150,11 +148,16 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker push vllm/vllm-openai:nightly"
- "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
- "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
- "docker push vllm/vllm-openai:nightly-x86_64"
- "docker push vllm/vllm-openai:nightly-aarch64"
- "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
- "docker manifest push vllm/vllm-openai:nightly"
- "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh"
plugins:
......@@ -163,3 +166,4 @@ steps:
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
......@@ -2,22 +2,29 @@
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')
if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev"
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
To download the wheel (by commit):
\`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download the wheel (by version):
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
......@@ -38,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
\ No newline at end of file
EOF
......@@ -8,20 +8,41 @@ set -ex
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# Get DockerHub token from environment
# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
echo "Error: DOCKERHUB_TOKEN environment variable is not set"
exit 1
fi
if [ -z "$DOCKERHUB_USERNAME" ]; then
echo "Error: DOCKERHUB_USERNAME environment variable is not set"
exit 1
fi
# Get DockerHub bearer token
echo "Getting DockerHub bearer token..."
set +x
BEARER_TOKEN=$(curl -s -X POST \
-H "Content-Type: application/json" \
-d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
"https://hub.docker.com/v2/users/login" | jq -r '.token')
set -x
if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
echo "Error: Failed to get DockerHub bearer token"
exit 1
fi
# Function to get all tags from DockerHub
get_all_tags() {
local page=1
local all_tags=""
while true; do
local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
set +x
local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
"$REPO_API_URL?page=$page&page_size=100")
set -x
# Get both last_updated timestamp and tag name, separated by |
local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
......@@ -43,7 +64,9 @@ delete_tag() {
echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
set +x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x
if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
......
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# do not complain about line length (for docstring)
# ruff: noqa: E501
import argparse
import json
import re
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
from urllib.parse import quote
if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.")
INDEX_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<meta name="pypi:repository-version" content="1.0">
<body>
{items}
</body>
</html>
"""
@dataclass
class WheelFileInfo:
package_name: str
version: str
build_tag: str | None
python_tag: str
abi_tag: str
platform_tag: str
variant: str | None
filename: str
def parse_from_filename(file: str) -> WheelFileInfo:
"""
Parse wheel file name to extract metadata.
The format of wheel names:
{package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
All versions could contain a variant like '+cu129' or '.cpu' or `.rocm` (or not).
Example:
vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
"""
wheel_file_re = re.compile(
r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
)
match = wheel_file_re.match(file)
if not match:
raise ValueError(f"Invalid wheel file name: {file}")
package_name = match.group("package_name")
version = match.group("version")
build_tag = match.group("build_tag")
python_tag = match.group("python_tag")
abi_tag = match.group("abi_tag")
platform_tag = match.group("platform_tag")
# extract variant from version
variant = None
if "dev" in version:
ver_after_dev = version.split("dev")[-1]
if "." in ver_after_dev:
variant = ver_after_dev.split(".")[-1]
version = version.removesuffix("." + variant)
else:
if "+" in version:
version, variant = version.split("+")
return WheelFileInfo(
package_name=package_name,
version=version,
build_tag=build_tag,
python_tag=python_tag,
abi_tag=abi_tag,
platform_tag=platform_tag,
variant=variant,
filename=file,
)
def generate_project_list(subdir_names: list[str]) -> str:
"""
Generate project list HTML content linking to each project & variant sub-directory.
"""
href_tags = []
for name in sorted(subdir_names):
name = name.strip("/").strip(".")
href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
def generate_package_index_and_metadata(
wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
) -> tuple[str, str]:
"""
Generate package index HTML content for a specific package, linking to actual wheel files.
"""
href_tags = []
metadata = []
for file in sorted(wheel_files, key=lambda x: x.filename):
relative_path = (
wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
)
# handle with '+' in URL, and avoid double-encoding '/' and already-encoded '%2B'
# NOTE: this is AWS S3 specific behavior!
file_path_quoted = quote(relative_path.as_posix(), safe=":%/")
href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>')
file_meta = asdict(file)
file_meta["path"] = file_path_quoted
metadata.append(file_meta)
index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
metadata_str = json.dumps(metadata, indent=2)
return index_str, metadata_str
def generate_index_and_metadata(
whl_files: list[str],
wheel_base_dir: Path,
index_base_dir: Path,
default_variant: str | None = None,
alias_to_default: str | None = None,
):
"""
Generate index for all wheel files.
Args:
whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
wheel_base_dir (Path): Base directory for wheel files.
index_base_dir (Path): Base directory to store index files.
default_variant (str | None): The default variant name, if any.
alias_to_default (str | None): Alias variant name for the default variant, if any.
First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content
as the default variant index, but the links are adjusted accordingly.
Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant sub-directory
index.html
vllm/
index.html
metadata.json
cu129/ # cu129 is actually the alias to default variant
index.html
vllm/
index.html
metadata.json
cu130/ # cu130 variant sub-directory
index.html
vllm/
index.html
metadata.json
...
metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
[
{
"package_name": "vllm",
"version": "0.10.2rc2",
"build_tag": null,
"python_tag": "cp38",
"abi_tag": "abi3",
"platform_tag": "manylinux2014_aarch64",
"variant": "cu129",
"filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
"path": "../vllm-0.10.2rc2%2Bcu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL and URL-encoded
},
...
]
"""
parsed_files = [parse_from_filename(f) for f in whl_files]
if not parsed_files:
print("No wheel files found, skipping index generation.")
return
# Group by variant
variant_to_files: dict[str, list[WheelFileInfo]] = {}
for file in parsed_files:
variant = file.variant or "default"
if variant not in variant_to_files:
variant_to_files[variant] = []
variant_to_files[variant].append(file)
print(f"Found variants: {list(variant_to_files.keys())}")
# sanity check for default variant
if default_variant:
if "default" in variant_to_files:
raise ValueError(
"All wheel files must have variant suffixes when `default_variant` is specified."
)
if default_variant not in variant_to_files:
raise ValueError(
f"Default variant '{default_variant}' not found among wheel files."
)
if alias_to_default:
if "default" not in variant_to_files:
# e.g. only some wheels are uploaded to S3 currently
print(
"[WARN] Alias to default variant specified, but no default variant found."
)
elif alias_to_default in variant_to_files:
raise ValueError(
f"Alias variant name '{alias_to_default}' already exists among wheel files."
)
else:
variant_to_files[alias_to_default] = variant_to_files["default"].copy()
print(f"Alias variant '{alias_to_default}' created for default variant.")
# Generate index for each variant
subdir_names = set()
for variant, files in variant_to_files.items():
if variant == "default":
variant_dir = index_base_dir
else:
variant_dir = index_base_dir / variant
subdir_names.add(variant)
variant_dir.mkdir(parents=True, exist_ok=True)
# gather all package names in this variant
packages = set(f.package_name for f in files)
if variant == "default":
# these packages should also appear in the "project list"
# generate after all variants are processed
subdir_names = subdir_names.union(packages)
else:
# generate project list for this variant directly
project_list_str = generate_project_list(sorted(packages))
with open(variant_dir / "index.html", "w") as f:
f.write(project_list_str)
for package in packages:
# filter files belonging to this package only
package_files = [f for f in files if f.package_name == package]
package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata(
package_files, wheel_base_dir, package_dir
)
with open(package_dir / "index.html", "w") as f:
f.write(index_str)
with open(package_dir / "metadata.json", "w") as f:
f.write(metadata_str)
# Generate top-level project list index
project_list_str = generate_project_list(sorted(subdir_names))
with open(index_base_dir / "index.html", "w") as f:
f.write(project_list_str)
if __name__ == "__main__":
"""
Arguments:
--version <version> : version string for the current build (e.g., commit hash)
--current-objects <path_to_json> : path to JSON file containing current S3 objects listing in this version directory
--output-dir <output_directory> : directory to store generated index files
--alias-to-default <alias_variant_name> : (optional) alias variant name for the default variant
"""
parser = argparse.ArgumentParser(
description="Process nightly build wheel files to generate indices."
)
parser.add_argument(
"--version",
type=str,
required=True,
help="Version string for the current build (e.g., commit hash)",
)
parser.add_argument(
"--current-objects",
type=str,
required=True,
help="Path to JSON file containing current S3 objects listing in this version directory",
)
parser.add_argument(
"--output-dir",
type=str,
required=True,
help="Directory to store generated index files",
)
parser.add_argument(
"--alias-to-default",
type=str,
default=None,
help="Alias variant name for the default variant",
)
args = parser.parse_args()
version = args.version
if "/" in version or "\\" in version:
raise ValueError("Version string must not contain slashes.")
current_objects_path = Path(args.current_objects)
output_dir = Path(args.output_dir)
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)
# Read current objects JSON
with open(current_objects_path) as f:
current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
# current_objects looks like from list_objects_v2 S3 API:
"""
"Contents": [
{
"Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
"LastModified": "2025-11-28T14:00:32+00:00",
"ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
"ChecksumAlgorithm": [
"CRC64NVME"
],
"ChecksumType": "FULL_OBJECT",
"Size": 435649349,
"StorageClass": "STANDARD"
},
...
]
"""
# Extract wheel file keys
wheel_files = []
for item in current_objects.get("Contents", []):
key: str = item["Key"]
if key.endswith(".whl"):
wheel_files.append(key.split("/")[-1]) # only the filename is used
print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
# Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{version}/<wheel files>
# s3://vllm-wheels/<anything>/<index files>
wheel_base_dir = Path(output_dir).parent / version
index_base_dir = Path(output_dir)
generate_index_and_metadata(
whl_files=wheel_files,
wheel_base_dir=wheel_base_dir,
index_base_dir=index_base_dir,
default_variant=None,
alias_to_default=args.alias_to_default,
)
print(f"Successfully generated index and metadata in {output_dir}")
......@@ -59,7 +59,7 @@ while true; do
fi
done
echo "--- Pulling container"
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
......@@ -78,17 +78,13 @@ HF_MOUNT="/root/.cache/huggingface"
commands=$@
echo "Commands:$commands"
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
fi
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
fi
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
......@@ -173,19 +169,28 @@ fi
PARALLEL_JOB_COUNT=8
MYPYTHONPATH=".."
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
# Test that we're launching on the machine that has
# proper access to GPUs
render_gid=$(getent group render | cut -d: -f3)
if [[ -z "$render_gid" ]]; then
echo "Error: 'render' group not found. This is required for GPU access." >&2
exit 1
fi
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
# assign job count as the number of shards used
commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
......@@ -217,8 +222,8 @@ else
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
......
This diff is collapsed.
......@@ -25,25 +25,30 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
set -e
export TORCH_COMPILE_DISABLE=1
set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
pip install sentence-transformers datamodel_code_generator
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pip install sentence-transformers datamodel_code_generator tblib
# Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
}
# All of CPU tests are expected to be finished less than 40 mins.
export container_id
export -f cpu_tests
timeout 40m bash -c cpu_tests
timeout 120m bash -c cpu_tests
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment