Commit 61b1a9d8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1' into v0.18.1-ori

parents 0da93439 a26e8dc7
......@@ -12,7 +12,7 @@ steps:
depends_on: ~
id: build-wheel-arm64-cuda-12-9
agents:
queue: arm64_cpu_queue_postmerge
queue: arm64_cpu_queue_release
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
......@@ -27,7 +27,7 @@ steps:
depends_on: ~
id: build-wheel-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
queue: arm64_cpu_queue_release
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
......@@ -42,7 +42,7 @@ steps:
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
queue: arm64_cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
......@@ -55,7 +55,7 @@ steps:
depends_on: ~
id: build-wheel-x86-cuda-12-9
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
......@@ -68,7 +68,7 @@ steps:
depends_on: ~
id: build-wheel-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
......@@ -81,7 +81,7 @@ steps:
depends_on: ~
id: build-wheel-x86-cpu
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
......@@ -97,7 +97,7 @@ steps:
depends_on: ~
id: build-release-image-x86
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
......@@ -110,7 +110,7 @@ steps:
depends_on: ~
id: build-release-image-arm64
agents:
queue: arm64_cpu_queue_postmerge
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
......@@ -120,7 +120,7 @@ steps:
depends_on: ~
id: build-release-image-x86-cuda-13-0
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
......@@ -133,13 +133,57 @@ steps:
depends_on: ~
id: build-release-image-arm64-cuda-13-0
agents:
queue: arm64_cpu_queue_postmerge
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130"
- label: "Build release image - x86_64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-x86-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- label: "Build release image - aarch64 - CUDA 12.9 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-arm64-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-ubuntu2404"
- label: "Build release image - x86_64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-x86-cuda-13-0-ubuntu2404
agents:
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- label: "Build release image - aarch64 - CUDA 13.0 - Ubuntu 24.04"
depends_on: ~
id: build-release-image-arm64-cuda-13-0-ubuntu2404
agents:
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg UBUNTU_VERSION=24.04 --build-arg GDRCOPY_OS_VERSION=Ubuntu24_04 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0 12.1' --build-arg INSTALL_KV_CONNECTORS=true --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404 --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)-cu130-ubuntu2404"
- block: "Build release image for x86_64 CPU"
key: block-cpu-release-image-build
depends_on: ~
......@@ -148,8 +192,9 @@ steps:
depends_on:
- block-cpu-release-image-build
- input-release-version
id: build-release-image-x86-cpu
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_X86=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
......@@ -163,11 +208,12 @@ steps:
depends_on: ~
- label: "Build release image - arm64 - CPU"
depends_on:
depends_on:
- block-arm64-cpu-release-image-build
- input-release-version
id: build-release-image-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
queue: arm64_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
......@@ -185,7 +231,7 @@ steps:
- build-release-image-arm64
id: create-multi-arch-manifest
agents:
queue: small_cpu_queue_postmerge
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
......@@ -196,7 +242,7 @@ steps:
- create-multi-arch-manifest
id: annotate-release-workflow
agents:
queue: small_cpu_queue_postmerge
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/annotate-release.sh"
......@@ -206,18 +252,67 @@ steps:
- build-release-image-arm64-cuda-13-0
id: create-multi-arch-manifest-cuda-13-0
agents:
queue: small_cpu_queue_postmerge
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
- label: "Create multi-arch manifest - CUDA 12.9 - Ubuntu 24.04"
depends_on:
- build-release-image-x86-ubuntu2404
- build-release-image-arm64-ubuntu2404
id: create-multi-arch-manifest-ubuntu2404
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
- label: "Create multi-arch manifest - CUDA 13.0 - Ubuntu 24.04"
depends_on:
- build-release-image-x86-cuda-13-0-ubuntu2404
- build-release-image-arm64-cuda-13-0-ubuntu2404
id: create-multi-arch-manifest-cuda-13-0-ubuntu2404
agents:
queue: small_cpu_queue_release
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404 --amend"
- "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
- block: "Confirm publishing release images to DockerHub"
key: block-publish-release-images-dockerhub
depends_on:
- create-multi-arch-manifest
- create-multi-arch-manifest-cuda-13-0
- build-release-image-x86-cpu
- build-release-image-arm64-cpu
- build-rocm-release-image
- label: "Publish release images to DockerHub"
key: publish-release-images-dockerhub
depends_on:
- block-publish-release-images-dockerhub
agents:
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/push-release-builds.sh"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
- label: "Publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
......@@ -235,7 +330,7 @@ steps:
- create-multi-arch-manifest-cuda-13-0
if: build.env("NIGHTLY") == "1"
agents:
queue: small_cpu_queue_postmerge
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/push-nightly-builds.sh cu130"
# Clean up old nightly builds (keep only last 14)
......@@ -262,7 +357,7 @@ steps:
- block-upload-release-wheels
id: upload-release-wheels
agents:
queue: small_cpu_queue_postmerge
queue: small_cpu_queue_release
commands:
- "bash .buildkite/scripts/upload-release-wheels-pypi.sh"
......@@ -344,7 +439,7 @@ steps:
- step: input-rocm-config
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
# Set configuration and check cache
- |
......@@ -486,7 +581,7 @@ steps:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
timeout_in_minutes: 180
commands:
# Download artifacts and prepare Docker image
......@@ -596,7 +691,7 @@ steps:
- step: build-rocm-vllm-wheel
allow_failure: false
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
timeout_in_minutes: 60
commands:
# Download all wheel artifacts and run upload
......@@ -645,7 +740,7 @@ steps:
- step: input-release-version
allow_failure: true
agents:
queue: small_cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "bash .buildkite/scripts/annotate-rocm-release.sh"
env:
......@@ -662,7 +757,7 @@ steps:
depends_on: block-generate-root-index-rocm-wheels
id: generate-root-index-rocm-wheels
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
commands:
- "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env:
......@@ -676,7 +771,7 @@ steps:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
queue: cpu_queue_release
timeout_in_minutes: 60
commands:
- |
......
#!/bin/bash
set -euo pipefail
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't always include tags)
echo "Fetching latest tags from origin..."
git fetch --tags --force origin
# Derive release version from the git tag on the current commit.
# The pipeline must be triggered on a tagged commit (e.g. v0.18.1).
RELEASE_VERSION=$(git describe --exact-match --tags "${BUILDKITE_COMMIT}" 2>/dev/null || true)
if [ -z "${RELEASE_VERSION}" ]; then
echo "[FATAL] Commit ${BUILDKITE_COMMIT} has no exact git tag. " \
"Release images must be published from a tagged commit."
exit 1
fi
# Strip leading 'v' for use in Docker tags (e.g. v0.18.1 -> 0.18.1)
PURE_VERSION="${RELEASE_VERSION#v}"
echo "========================================"
echo "Publishing release images"
echo " Commit: ${BUILDKITE_COMMIT}"
echo " Release version: ${RELEASE_VERSION}"
echo "========================================"
set -x
# ---- CUDA (default, CUDA 12.9) ----
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" "vllm/vllm-openai:latest-x86_64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64" "vllm/vllm-openai:v${PURE_VERSION}-x86_64"
docker push "vllm/vllm-openai:latest-x86_64"
docker push "vllm/vllm-openai:v${PURE_VERSION}-x86_64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" "vllm/vllm-openai:latest-aarch64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64" "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
docker push "vllm/vllm-openai:latest-aarch64"
docker push "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
docker manifest rm "vllm/vllm-openai:latest" || true
docker manifest create "vllm/vllm-openai:latest" "vllm/vllm-openai:latest-x86_64" "vllm/vllm-openai:latest-aarch64"
docker manifest push "vllm/vllm-openai:latest"
docker manifest rm "vllm/vllm-openai:v${PURE_VERSION}" || true
docker manifest create "vllm/vllm-openai:v${PURE_VERSION}" "vllm/vllm-openai:v${PURE_VERSION}-x86_64" "vllm/vllm-openai:v${PURE_VERSION}-aarch64"
docker manifest push "vllm/vllm-openai:v${PURE_VERSION}"
# ---- CUDA 13.0 ----
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" "vllm/vllm-openai:latest-x86_64-cu130"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130"
docker push "vllm/vllm-openai:latest-x86_64-cu130"
docker push "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" "vllm/vllm-openai:latest-aarch64-cu130"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
docker push "vllm/vllm-openai:latest-aarch64-cu130"
docker push "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
docker manifest rm "vllm/vllm-openai:latest-cu130" || true
docker manifest create "vllm/vllm-openai:latest-cu130" "vllm/vllm-openai:latest-x86_64-cu130" "vllm/vllm-openai:latest-aarch64-cu130"
docker manifest push "vllm/vllm-openai:latest-cu130"
docker manifest rm "vllm/vllm-openai:v${PURE_VERSION}-cu130" || true
docker manifest create "vllm/vllm-openai:v${PURE_VERSION}-cu130" "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130" "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"
docker manifest push "vllm/vllm-openai:v${PURE_VERSION}-cu130"
# ---- ROCm ----
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm"
docker pull "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:latest"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm" "vllm/vllm-openai-rocm:v${PURE_VERSION}"
docker push "vllm/vllm-openai-rocm:latest"
docker push "vllm/vllm-openai-rocm:v${PURE_VERSION}"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:latest-base"
docker tag "public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" "vllm/vllm-openai-rocm:v${PURE_VERSION}-base"
docker push "vllm/vllm-openai-rocm:latest-base"
docker push "vllm/vllm-openai-rocm:v${PURE_VERSION}-base"
# ---- CPU ----
# CPU images in ECR are tagged with the full version including 'v' (e.g. v0.18.1),
# matching the value from the Buildkite release-version metadata input.
docker pull "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}"
docker pull "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}"
docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:latest-x86_64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64"
docker push "vllm/vllm-openai-cpu:latest-x86_64"
docker push "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:latest-arm64"
docker tag "public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
docker push "vllm/vllm-openai-cpu:latest-arm64"
docker push "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
docker manifest rm "vllm/vllm-openai-cpu:latest" || true
docker manifest create "vllm/vllm-openai-cpu:latest" "vllm/vllm-openai-cpu:latest-x86_64" "vllm/vllm-openai-cpu:latest-arm64"
docker manifest push "vllm/vllm-openai-cpu:latest"
docker manifest rm "vllm/vllm-openai-cpu:v${PURE_VERSION}" || true
docker manifest create "vllm/vllm-openai-cpu:v${PURE_VERSION}" "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64" "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"
docker manifest push "vllm/vllm-openai-cpu:v${PURE_VERSION}"
echo "========================================"
echo "Successfully published release images for ${RELEASE_VERSION}"
echo "========================================"
......@@ -24,6 +24,7 @@
ARG CUDA_VERSION=12.9.1
ARG PYTHON_VERSION=3.12
ARG UBUNTU_VERSION=22.04
# By parameterizing the base images, we allow third-party to use their own
# base images. One use case is hermetic builds with base images stored in
......@@ -38,7 +39,7 @@ ARG PYTHON_VERSION=3.12
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION}
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
......@@ -111,6 +112,10 @@ RUN apt-get update -y \
gcc-10 \
g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
# Install python dev headers if available (needed for cmake FindPython on Ubuntu 24.04
# which ships cmake 3.28 and requires Development.SABIModule; silently skipped on
# Ubuntu 20.04/22.04 where python3.x-dev is not available without a PPA)
&& (apt-get install -y --no-install-recommends python${PYTHON_VERSION}-dev 2>/dev/null || true) \
&& rm -rf /var/lib/apt/lists/* \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
......@@ -507,7 +512,6 @@ RUN apt-get update -y \
software-properties-common \
curl \
sudo \
python3-pip \
ffmpeg \
libsm6 \
libxext6 \
......@@ -535,6 +539,7 @@ RUN apt-get update -y \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& rm -f /usr/lib/python${PYTHON_VERSION}/EXTERNALLY-MANAGED \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
......@@ -593,6 +598,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config
# Pre-download FlashInfer TRTLLM BMM headers for air-gapped environments.
# At runtime, MoE JIT compilation downloads these from edge.urm.nvidia.com
# which fails without internet. This step caches them at build time.
RUN python3 <<'PYEOF'
from flashinfer.jit import env as jit_env
from flashinfer.jit.cubin_loader import download_trtllm_headers, get_cubin
from flashinfer.artifacts import ArtifactPath, CheckSumHash
download_trtllm_headers(
'bmm',
jit_env.FLASHINFER_CUBIN_DIR / 'flashinfer' / 'trtllm' / 'batched_gemm' / 'trtllmGen_bmm_export',
f'{ArtifactPath.TRTLLM_GEN_BMM}/include/trtllmGen_bmm_export',
ArtifactPath.TRTLLM_GEN_BMM,
get_cubin(f'{ArtifactPath.TRTLLM_GEN_BMM}/checksums.txt', CheckSumHash.TRTLLM_GEN_BMM),
)
print('FlashInfer TRTLLM BMM headers downloaded successfully')
PYEOF
# ============================================================
# OPENAI API SERVER DEPENDENCIES
# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
......
......@@ -33,6 +33,10 @@ group "default" {
targets = ["openai"]
}
group "all" {
targets = ["openai", "openai-ubuntu2404"]
}
# Base targets
target "_common" {
......@@ -74,3 +78,29 @@ target "openai" {
tags = ["vllm:openai"]
output = ["type=docker"]
}
# Ubuntu 24.04 targets
target "test-ubuntu2404" {
inherits = ["_common", "_labels"]
target = "test"
tags = ["vllm:test-ubuntu24.04"]
args = {
UBUNTU_VERSION = "24.04"
GDRCOPY_OS_VERSION = "Ubuntu24_04"
FLASHINFER_AOT_COMPILE = "true"
}
output = ["type=docker"]
}
target "openai-ubuntu2404" {
inherits = ["_common", "_labels"]
target = "vllm-openai"
tags = ["vllm:openai-ubuntu24.04"]
args = {
UBUNTU_VERSION = "24.04"
GDRCOPY_OS_VERSION = "Ubuntu24_04"
FLASHINFER_AOT_COMPILE = "true"
}
output = ["type=docker"]
}
......@@ -7,6 +7,9 @@
"PYTHON_VERSION": {
"default": "3.12"
},
"UBUNTU_VERSION": {
"default": "22.04"
},
"BUILD_BASE_IMAGE": {
"default": "nvidia/cuda:12.9.1-devel-ubuntu20.04"
},
......
model_name: "Qwen/Qwen3.5-35B-A3B"
accuracy_threshold: 0.86
accuracy_threshold: 0.84
tolerance: 0.03
num_questions: 1319
num_fewshot: 5
server_args: >-
......
model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
accuracy_threshold: 0.86
accuracy_threshold: 0.79
tolerance: 0.03
num_questions: 1319
num_fewshot: 5
server_args: >-
......
model_name: "nvidia/Qwen3.5-397B-A17B-NVFP4"
accuracy_threshold: 0.88
tolerance: 0.03
num_questions: 1319
num_fewshot: 5
server_args: >-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel
Qwen3.5-35B-A3B-DEP2.yaml
Qwen3.5-35B-A3B-FP8-DEP2.yaml
Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
......@@ -19,8 +19,6 @@ from vllm.platforms import current_platform
from .gsm8k_eval import evaluate_gsm8k
TOL = 0.08 # Absolute tolerance for accuracy comparison
def run_gsm8k_eval(eval_config: dict, server_url: str) -> dict:
"""Run GSM8K evaluation using our isolated script."""
......@@ -109,20 +107,20 @@ def test_gsm8k_correctness(config_filename):
measured_metric = results["accuracy"]
expected_metric = eval_config["accuracy_threshold"]
tol = eval_config.get("tolerance", 0.08)
print(f"GSM8K Results for {eval_config['model_name']}:")
print(f" Measured metric: {measured_metric:.4f}")
print(f" Expected metric: {expected_metric:.4f}")
print(f" Tolerance: {TOL:.4f}")
print(f" Tolerance: {tol:.4f}")
print(f" Questions: {results['num_questions']}")
print(f" Invalid rate: {results['invalid_rate']:.3f}")
print(f" Latency: {results['latency']:.1f}s")
print(f" QPS: {results['questions_per_second']:.1f}")
# Verify metric is within tolerance
assert measured_metric >= expected_metric - TOL, (
assert measured_metric >= expected_metric - tol, (
f"GSM8K metric too low: {measured_metric:.4f} < "
f"{expected_metric:.4f} - {TOL:.4f} = {expected_metric - TOL:.4f}"
f"{expected_metric:.4f} - {tol:.4f} = {expected_metric - tol:.4f}"
)
print(f"✅ GSM8K test passed for {eval_config['model_name']}")
......@@ -791,6 +791,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
trust_remote_code=True,
revision="refs/pr/17",
),
"FireRedASR2ForConditionalGeneration": _HfExamplesInfo(
"allendou/FireRedASR2-LLM-vllm",
......
......@@ -30,7 +30,7 @@ class AttentionConfig:
use_cudnn_prefill: bool = False
"""Whether to use cudnn prefill."""
use_trtllm_ragged_deepseek_prefill: bool = False
use_trtllm_ragged_deepseek_prefill: bool = True
"""Whether to use TRTLLM ragged deepseek prefill."""
use_trtllm_attention: bool | None = None
......
......@@ -682,6 +682,25 @@ class VllmConfig:
self.model_config, self.load_config
)
if (
self.quant_config is not None
and self.model_config is not None
and hasattr(self.quant_config, "use_deep_gemm")
and self.quant_config.use_deep_gemm is None
):
from vllm.utils.deep_gemm import should_auto_disable_deep_gemm
model_type = getattr(self.model_config.hf_text_config, "model_type", None)
if should_auto_disable_deep_gemm(model_type):
self.quant_config.use_deep_gemm = False
logger.warning_once(
"Auto-disabled DeepGemm for model_type=%s on Blackwell. "
"DeepGemm E8M0 scale format causes accuracy degradation "
"for this architecture. Falling back to CUTLASS. "
"To disable DeepGemm globally, set VLLM_USE_DEEP_GEMM=0.",
model_type,
)
from vllm.v1.executor.abstract import Executor
executor_backend = self.parallel_config.distributed_executor_backend
......
......@@ -135,6 +135,7 @@ class Fp8Config(QuantizationConfig):
f"{activation_scheme} activation scheme."
)
self.weight_block_size = weight_block_size
self.use_deep_gemm: bool | None = None
@classmethod
def get_name(cls) -> QuantizationMethods:
......@@ -291,7 +292,10 @@ class Fp8LinearMethod(LinearMethodBase):
self.use_marlin = False
self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled()
self.use_deep_gemm = is_deep_gemm_supported()
if self.quant_config.use_deep_gemm is not None:
self.use_deep_gemm = self.quant_config.use_deep_gemm
else:
self.use_deep_gemm = is_deep_gemm_supported()
self.weight_block_size = self.quant_config.weight_block_size
self.block_quant = self.weight_block_size is not None
......@@ -305,6 +309,7 @@ class Fp8LinearMethod(LinearMethodBase):
act_quant_group_shape=GroupShape(1, self.weight_block_size[0]),
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported,
use_deep_gemm=self.use_deep_gemm,
)
else:
# Use per-token quantization for better perf if dynamic and cutlass
......@@ -440,7 +445,7 @@ class Fp8LinearMethod(LinearMethodBase):
del layer.input_scale
return
if self.block_quant:
if self.block_quant and self.use_deep_gemm:
maybe_post_process_fp8_weight_block(layer)
def apply(
......
......@@ -91,6 +91,7 @@ class QuantFP8(CustomOp):
if (
self.is_group_quant
and self.use_ue8m0
and self.use_deep_gemm_supported
and (DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0)
):
......
......@@ -356,10 +356,14 @@ class W8A8BlockFp8LinearOp:
act_quant_group_shape: GroupShape,
cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
use_aiter_and_is_supported: bool = False,
use_deep_gemm: bool | None = None,
):
self.weight_group_shape = weight_group_shape
self.act_quant_group_shape = act_quant_group_shape
self.is_deep_gemm_supported = is_deep_gemm_supported()
if use_deep_gemm is not None:
self.is_deep_gemm_supported = use_deep_gemm
else:
self.is_deep_gemm_supported = is_deep_gemm_supported()
self.is_hopper = current_platform.is_device_capability(90)
self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
self.is_flashinfer_supported = is_flashinfer_fp8_blockscale_gemm_supported()
......
......@@ -23,6 +23,24 @@ from vllm.platforms import current_platform
from vllm.utils.import_utils import has_deep_gemm
from vllm.utils.math_utils import cdiv
_DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES: set[str] = {
"qwen3_5_text",
"qwen3_5_moe_text",
}
def should_auto_disable_deep_gemm(model_type: str | None) -> bool:
"""Check if DeepGemm should be auto-disabled for this model on Blackwell.
Returns True if the model is known to have accuracy degradation with
DeepGemm's E8M0 scale format on Blackwell GPUs (SM100+).
"""
if model_type is None:
return False
if not current_platform.is_device_capability_family(100):
return False
return model_type in _DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES
class DeepGemmQuantScaleFMT(Enum):
# Float32 scales in Float32 tensor
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment