Unverified Commit 60af7b96 authored by TJian's avatar TJian Committed by GitHub
Browse files

[Releases] [ROCm] Enable Nightly Docker Image and Wheel Releases for ROCm (#37283)


Signed-off-by: default avatartjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: default avatarHongxia Yang <hongxiay.yang@amd.com>
parent bdc1719e
This diff is collapsed.
...@@ -8,6 +8,8 @@ if [ -z "${RELEASE_VERSION}" ]; then ...@@ -8,6 +8,8 @@ if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev" RELEASE_VERSION="1.0.0.dev"
fi fi
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit): To download the wheel (by commit):
\`\`\` \`\`\`
...@@ -33,7 +35,7 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 ...@@ -33,7 +35,7 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
...@@ -74,7 +76,7 @@ docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RE ...@@ -74,7 +76,7 @@ docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RE
docker push vllm/vllm-openai-rocm:latest docker push vllm/vllm-openai-rocm:latest
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base docker push vllm/vllm-openai-rocm:latest-base
......
...@@ -5,20 +5,21 @@ ...@@ -5,20 +5,21 @@
# Generate Buildkite annotation for ROCm wheel release # Generate Buildkite annotation for ROCm wheel release
set -ex set -ex
# Get build configuration from meta-data # Extract build configuration from Dockerfile.rocm_base (single source of truth)
# Extract ROCm version dynamically from Dockerfile.rocm_base # Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0" # BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown") ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12") PYTHON_VERSION=$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//')
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") PYTORCH_ROCM_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')
# TODO: Enable the nightly build for ROCm
# Get release version, default to 1.0.0.dev for nightly/per-commit builds # Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "") RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then if [ -z "${RELEASE_VERSION}" ]; then
RELEASE_VERSION="1.0.0.dev" RELEASE_VERSION="1.0.0.dev"
fi fi
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
# S3 URLs # S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}" S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}" S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
...@@ -96,7 +97,7 @@ To download and upload the image: ...@@ -96,7 +97,7 @@ To download and upload the image:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
docker push vllm/vllm-openai-rocm:latest-base docker push vllm/vllm-openai-rocm:latest-base
......
...@@ -15,8 +15,6 @@ ...@@ -15,8 +15,6 @@
# #
# Environment variables: # Environment variables:
# S3_BUCKET - S3 bucket name (default: vllm-wheels) # S3_BUCKET - S3 bucket name (default: vllm-wheels)
# PYTHON_VERSION - Python version (affects cache key)
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
# #
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base, # Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
# so changes to ROCm version are captured by the Dockerfile hash. # so changes to ROCm version are captured by the Dockerfile hash.
...@@ -36,13 +34,7 @@ generate_cache_key() { ...@@ -36,13 +34,7 @@ generate_cache_key() {
fi fi
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16) local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
# Include key build args that affect the output echo "${dockerfile_hash}"
# These should match the ARGs in Dockerfile.rocm_base that change the build output
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
echo "${dockerfile_hash}-${args_hash}"
} }
CACHE_KEY=$(generate_cache_key) CACHE_KEY=$(generate_cache_key)
...@@ -52,9 +44,6 @@ case "${1:-}" in ...@@ -52,9 +44,6 @@ case "${1:-}" in
check) check)
echo "Checking cache for key: ${CACHE_KEY}" >&2 echo "Checking cache for key: ${CACHE_KEY}" >&2
echo "Cache path: ${CACHE_PATH}" >&2 echo "Cache path: ${CACHE_PATH}" >&2
echo "Variables used in cache key:" >&2
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
# Check if cache exists by listing objects # Check if cache exists by listing objects
# We look for at least one .whl file # We look for at least one .whl file
...@@ -104,14 +93,16 @@ case "${1:-}" in ...@@ -104,14 +93,16 @@ case "${1:-}" in
echo "Cache key: ${CACHE_KEY}" echo "Cache key: ${CACHE_KEY}"
echo "Cache path: ${CACHE_PATH}" echo "Cache path: ${CACHE_PATH}"
echo "" echo ""
mkdir -p artifacts/rocm-base-wheels mkdir -p artifacts/rocm-base-wheels
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
# Use sync with include/exclude to only download .whl files
aws s3 sync "${CACHE_PATH}" artifacts/rocm-base-wheels/ \
--exclude "*" \
--include "*.whl"
echo "" echo ""
echo "Downloaded wheels:" echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \; find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
echo "" echo ""
echo "Total: $WHEEL_COUNT wheels" echo "Total: $WHEEL_COUNT wheels"
......
...@@ -4,16 +4,19 @@ set -ex ...@@ -4,16 +4,19 @@ set -ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix # This script uses DockerHub API to list and delete old tags with specified prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX] # Usage: cleanup-nightly-builds.sh [TAG_PREFIX] [REPO]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-" # Example: cleanup-nightly-builds.sh "nightly-"
# Example: cleanup-nightly-builds.sh "cu130-nightly-"
# Example: cleanup-nightly-builds.sh "nightly-" "vllm/vllm-openai-rocm"
# Get tag prefix from argument, default to "nightly-" if not provided # Get tag prefix and repo from arguments
TAG_PREFIX="${1:-nightly-}" TAG_PREFIX="${1:-nightly-}"
REPO="${2:-vllm/vllm-openai}"
echo "Cleaning up tags with prefix: $TAG_PREFIX" echo "Cleaning up tags with prefix: $TAG_PREFIX in repository: $REPO"
# DockerHub API endpoint for vllm/vllm-openai repository # DockerHub API endpoint for the repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" REPO_API_URL="https://hub.docker.com/v2/repositories/${REPO}/tags"
# Get DockerHub credentials from environment # Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then if [ -z "$DOCKERHUB_TOKEN" ]; then
...@@ -70,7 +73,7 @@ delete_tag() { ...@@ -70,7 +73,7 @@ delete_tag() {
local tag_name="$1" local tag_name="$1"
echo "Deleting tag: $tag_name" echo "Deleting tag: $tag_name"
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" local delete_url="https://hub.docker.com/v2/repositories/${REPO}/tags/$tag_name"
set +x set +x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x set -x
......
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Push ROCm nightly base image and nightly image from ECR
# to Docker Hub as vllm/vllm-openai-rocm:base-nightly and vllm/vllm-openai-rocm:nightly
# and vllm/vllm-openai-rocm:base-nightly-<commit> and vllm/vllm-openai-rocm:nightly-<commit>.
# Run when NIGHTLY=1 after build-rocm-release-image has pushed to ECR.
#
# Local testing (no push to Docker Hub):
# BUILDKITE_COMMIT=<commit-with-rocm-image-in-ecr> DRY_RUN=1 bash .buildkite/scripts/push-nightly-builds-rocm.sh
# Requires: AWS CLI configured (for ECR public login), Docker. For full run: Docker Hub login.
set -ex
# Use BUILDKITE_COMMIT from env (required; set to a commit that has ROCm image in ECR for local test)
BUILDKITE_COMMIT="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA that has the ROCm image in ECR (e.g. from a previous release pipeline run)}"
DRY_RUN="${DRY_RUN:-0}"
# Get the base image ECR tag (set by build-rocm-release-image pipeline step)
BASE_ORIG_TAG="$(buildkite-agent meta-data get rocm-base-ecr-tag 2>/dev/null || echo "")"
if [ -z "$BASE_ORIG_TAG" ]; then
echo "WARNING: rocm-base-ecr-tag metadata not found, falling back to commit-based tag"
BASE_ORIG_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"
fi
ORIG_TAG="${BUILDKITE_COMMIT}-rocm"
BASE_TAG_NAME="base-nightly"
TAG_NAME="nightly"
BASE_TAG_NAME_COMMIT="base-nightly-${BUILDKITE_COMMIT}"
TAG_NAME_COMMIT="nightly-${BUILDKITE_COMMIT}"
echo "Pushing ROCm base image from ECR: $BASE_ORIG_TAG"
echo "Pushing ROCm release image from ECR tag: $ORIG_TAG to Docker Hub as $TAG_NAME and $TAG_NAME_COMMIT"
[[ "$DRY_RUN" == "1" ]] && echo "[DRY_RUN] Skipping push to Docker Hub"
# Login to ECR and pull the image built by build-rocm-release-image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull "$BASE_ORIG_TAG"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG"
# Tag for Docker Hub (base-nightly and base-nightly-<commit>, nightly and nightly-<commit>)
docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME"
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"
if [[ "$DRY_RUN" == "1" ]]; then
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
echo "[DRY_RUN] Local tags created. Exiting without push."
exit 0
fi
# Push to Docker Hub (docker-login plugin runs before this step in CI)
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
docker push vllm/vllm-openai-rocm:"$TAG_NAME"
docker push vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"
echo "Pushed vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
echo "Pushed vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment