Commit 6fa64fbe authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0rc2' into v0.14.0rc2-ori

parents 7aa5c03c 7f42dc20
...@@ -169,6 +169,24 @@ steps: ...@@ -169,6 +169,24 @@ steps:
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- block: "Build ROCm release image"
key: block-rocm-release-image-build
depends_on: ~
- label: "Build release image (ROCm)"
depends_on: block-rocm-release-image-build
id: build-release-image-rocm
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
# Build base image first
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --tag rocm/vllm-dev:base-$BUILDKITE_COMMIT --target final --progress plain -f docker/Dockerfile.rocm_base ."
# Build vLLM ROCm image using the base
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg BASE_IMAGE=rocm/vllm-dev:base-$BUILDKITE_COMMIT --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm --target vllm-openai --progress plain -f docker/Dockerfile.rocm ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-rocm"
- label: "Build and publish nightly multi-arch image to DockerHub" - label: "Build and publish nightly multi-arch image to DockerHub"
depends_on: depends_on:
- create-multi-arch-manifest - create-multi-arch-manifest
...@@ -196,3 +214,365 @@ steps: ...@@ -196,3 +214,365 @@ steps:
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot" DOCKERHUB_USERNAME: "vllmbot"
# =============================================================================
# ROCm Release Pipeline (x86_64 only)
# =============================================================================
#
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
# To build a specific version, trigger the build from that branch/tag.
#
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
#
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
#
# =============================================================================
# ROCm Input Step - Collect build configuration (manual trigger only)
- input: "ROCm Wheel Release Build Configuration"
key: input-rocm-config
depends_on: ~
if: build.source == "ui"
fields:
- text: "Python Version"
key: "rocm-python-version"
default: "3.12"
hint: "Python version (e.g., 3.12)"
- text: "GPU Architectures"
key: "rocm-pytorch-rocm-arch"
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
hint: "Semicolon-separated GPU architectures"
- select: "Upload Wheels to S3"
key: "rocm-upload-wheels"
default: "true"
options:
- label: "No - Build only (nightly/dev)"
value: "false"
- label: "Yes - Upload to S3 (release)"
value: "true"
- select: "Force Rebuild Base Wheels"
key: "rocm-force-rebuild"
default: "false"
hint: "Ignore S3 cache and rebuild base wheels from scratch"
options:
- label: "No - Use cached wheels if available"
value: "false"
- label: "Yes - Rebuild even if cache exists"
value: "true"
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
- label: ":rocm: Build ROCm Base Wheels"
id: build-rocm-base-wheels
depends_on:
- step: input-rocm-config
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
agents:
queue: cpu_queue_postmerge
commands:
# Set configuration and check cache
- |
set -euo pipefail
# Get values from meta-data (set by input step) or use defaults
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
# Check for force rebuild flag
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
fi
echo "========================================"
echo "ROCm Base Wheels Build Configuration"
echo "========================================"
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
echo "========================================"
# Save resolved config for later jobs
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
# Check S3 cache for pre-built wheels
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
echo ""
echo "Cache key: $${CACHE_KEY}"
echo "Cache path: $${CACHE_PATH}"
# Save cache key for downstream jobs
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
CACHE_STATUS="miss"
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
else
echo "Force rebuild requested, skipping cache check"
fi
if [ "$${CACHE_STATUS}" = "hit" ]; then
echo ""
echo "CACHE HIT! Downloading pre-built wheels..."
echo ""
.buildkite/scripts/cache-rocm-base-wheels.sh download
# Set the S3 path for the cached Docker image (for Job 2 to download)
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
# Mark that we used cache (for Docker image handling)
buildkite-agent meta-data set "rocm-used-cache" "true"
echo ""
echo "Cache download complete. Skipping Docker build."
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
else
echo ""
echo "CACHE MISS. Building from scratch..."
echo ""
# Build full base image (for later vLLM build)
DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--load \
.
# Build debs_wheel_release stage for wheel extraction
DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
--target debs_wheel_release \
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--load \
.
# Extract wheels from Docker image
mkdir -p artifacts/rocm-base-wheels
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
docker rm $${container_id}
echo "Extracted base wheels:"
ls -lh artifacts/rocm-base-wheels/
# Upload wheels to S3 cache for future builds
echo ""
echo "Uploading wheels to S3 cache..."
.buildkite/scripts/cache-rocm-base-wheels.sh upload
# Export base Docker image for reuse in vLLM build
mkdir -p artifacts/rocm-docker-image
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
echo "Docker image size:"
ls -lh artifacts/rocm-docker-image/
# Upload large Docker image to S3 (also cached by cache key)
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
# Save the S3 path for downstream jobs
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
# Mark that we did NOT use cache
buildkite-agent meta-data set "rocm-used-cache" "false"
echo ""
echo "Build complete. Wheels cached for future builds."
fi
artifact_paths:
- "artifacts/rocm-base-wheels/*.whl"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 2: Build vLLM ROCm Wheel
- label: ":python: Build vLLM ROCm Wheel"
id: build-rocm-vllm-wheel
depends_on:
- step: build-rocm-base-wheels
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 180
commands:
# Download artifacts and prepare Docker image
- |
set -euo pipefail
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
# This fixes version detection when tags are moved/force-pushed
echo "Fetching latest tags from origin..."
git fetch --tags --force origin
# Log tag information for debugging version detection
echo "========================================"
echo "Git Tag Verification"
echo "========================================"
echo "Current HEAD: $(git rev-parse HEAD)"
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
echo ""
echo "Recent tags (pointing to commits near HEAD):"
git tag -l --sort=-creatordate | head -5
echo "setuptools_scm version detection:"
pip install -q setuptools_scm 2>/dev/null || true
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
echo "========================================"
# Download wheel artifacts from current build
echo "Downloading wheel artifacts from current build"
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
# Download Docker image from S3 (too large for Buildkite artifacts)
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
echo "ERROR: rocm-docker-image-s3-path metadata not found"
echo "This should have been set by the build-rocm-base-wheels job"
exit 1
fi
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
mkdir -p artifacts/rocm-docker-image
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
# Load base Docker image and capture the tag
echo "Loading base Docker image..."
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
echo "$${LOAD_OUTPUT}"
# Extract the actual loaded image tag from "Loaded image: <tag>" output
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
if [ -z "$${BASE_IMAGE_TAG}" ]; then
echo "ERROR: Failed to extract image tag from docker load output"
echo "Load output was: $${LOAD_OUTPUT}"
exit 1
fi
echo "Loaded base image: $${BASE_IMAGE_TAG}"
# Prepare base wheels for Docker build context
mkdir -p docker/context/base-wheels
touch docker/context/base-wheels/.keep
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
echo "Base wheels for vLLM build:"
ls -lh docker/context/base-wheels/
# Get GPU architectures from meta-data
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
echo "========================================"
echo "Building vLLM wheel with:"
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
echo "========================================"
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
DOCKER_BUILDKIT=1 docker build \
--file docker/Dockerfile.rocm \
--target export_vllm_wheel_release \
--output type=local,dest=rocm-dist \
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg REMOTE_VLLM=0 \
--build-arg GIT_REPO_CHECK=1 \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
.
echo "Built vLLM wheel:"
ls -lh rocm-dist/*.whl
# Copy wheel to artifacts directory
mkdir -p artifacts/rocm-vllm-wheel
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
echo "Final vLLM wheel:"
ls -lh artifacts/rocm-vllm-wheel/
artifact_paths:
- "artifacts/rocm-vllm-wheel/*.whl"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 3: Upload Wheels to S3
- label: ":s3: Upload ROCm Wheels to S3"
id: upload-rocm-wheels
depends_on:
- step: build-rocm-vllm-wheel
allow_failure: false
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 60
commands:
# Download all wheel artifacts and run upload
- |
set -euo pipefail
# Check if upload is enabled (from env var, meta-data, or release branch)
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
# Try to get from meta-data (input form)
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
fi
echo "========================================"
echo "Upload check:"
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
echo "========================================"
# Skip upload if not enabled
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
exit 0
fi
echo "Upload enabled, proceeding..."
# Download artifacts from current build
echo "Downloading artifacts from current build"
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
# Run upload script
bash .buildkite/scripts/upload-rocm-wheels.sh
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"
# ROCm Job 4: Annotate ROCm Wheel Release
- label: ":memo: Annotate ROCm wheel release"
id: annotate-rocm-release
depends_on:
- step: upload-rocm-wheels
allow_failure: true
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-rocm-release.sh"
env:
S3_BUCKET: "vllm-wheels"
...@@ -32,6 +32,7 @@ To download and upload the image: ...@@ -32,6 +32,7 @@ To download and upload the image:
\`\`\` \`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
...@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 ...@@ -45,6 +46,12 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai:rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:latest-rocm
docker tag vllm/vllm-openai:rocm vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker push vllm/vllm-openai:latest-rocm
docker push vllm/vllm-openai:v${RELEASE_VERSION}-rocm
docker manifest rm vllm/vllm-openai:latest docker manifest rm vllm/vllm-openai:latest
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
......
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Generate Buildkite annotation for ROCm wheel release
set -ex
# Get build configuration from meta-data
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
# S3 URLs
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
## :rocm: ROCm Wheel Release
### Build Configuration
| Setting | Value |
|---------|-------|
| **ROCm Version** | ${ROCM_VERSION} |
| **Python Version** | ${PYTHON_VERSION} |
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
| **Branch** | \`${BUILDKITE_BRANCH}\` |
| **Commit** | \`${BUILDKITE_COMMIT}\` |
### :package: Installation
**Install from this build (by commit):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
# Example:
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
\`\`\`
**Install from nightly (if published):**
\`\`\`bash
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
\`\`\`
### :floppy_disk: Download Wheels Directly
\`\`\`bash
# List all ROCm wheels
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
# Download specific wheels
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
\`\`\`
### :gear: Included Packages
- **vllm**: vLLM with ROCm support
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
- **triton_rocm**: Triton built for ROCm
- **torchvision**: TorchVision for ROCm PyTorch
- **amdsmi**: AMD SMI Python bindings
### :warning: Notes
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
- Platform: Linux x86_64 only
EOF
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Cache helper for ROCm base wheels
#
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
#
# Usage:
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
# cache-rocm-base-wheels.sh download - Download wheels from cache
# cache-rocm-base-wheels.sh key - Output the cache key
#
# Environment variables:
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
# PYTHON_VERSION - Python version (affects cache key)
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
#
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
# so changes to ROCm version are captured by the Dockerfile hash.
set -euo pipefail
BUCKET="${S3_BUCKET:-vllm-wheels}"
DOCKERFILE="docker/Dockerfile.rocm_base"
CACHE_PREFIX="rocm/cache"
# Generate hash from Dockerfile content + build args
generate_cache_key() {
# Include Dockerfile content
if [[ ! -f "$DOCKERFILE" ]]; then
echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
exit 1
fi
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
# Include key build args that affect the output
# These should match the ARGs in Dockerfile.rocm_base that change the build output
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
echo "${dockerfile_hash}-${args_hash}"
}
CACHE_KEY=$(generate_cache_key)
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
case "${1:-}" in
check)
echo "Checking cache for key: ${CACHE_KEY}" >&2
echo "Cache path: ${CACHE_PATH}" >&2
echo "Variables used in cache key:" >&2
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
# Check if cache exists by listing objects
# We look for at least one .whl file
echo "Running: aws s3 ls ${CACHE_PATH}" >&2
S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
echo "S3 ls output:" >&2
echo "$S3_OUTPUT" | head -5 >&2
if echo "$S3_OUTPUT" | grep -q "\.whl"; then
echo "hit"
else
echo "miss"
fi
;;
upload)
echo "========================================"
echo "Uploading wheels to cache"
echo "========================================"
echo "Cache key: ${CACHE_KEY}"
echo "Cache path: ${CACHE_PATH}"
echo ""
if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
exit 1
fi
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
exit 1
fi
echo "Uploading $WHEEL_COUNT wheels..."
aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
echo ""
echo "Cache upload complete!"
echo "========================================"
;;
download)
echo "========================================"
echo "Downloading wheels from cache"
echo "========================================"
echo "Cache key: ${CACHE_KEY}"
echo "Cache path: ${CACHE_PATH}"
echo ""
mkdir -p artifacts/rocm-base-wheels
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
echo ""
echo "Downloaded wheels:"
ls -lh artifacts/rocm-base-wheels/
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
echo ""
echo "Total: $WHEEL_COUNT wheels"
echo "========================================"
;;
key)
echo "${CACHE_KEY}"
;;
path)
echo "${CACHE_PATH}"
;;
*)
echo "Usage: $0 {check|upload|download|key|path}" >&2
echo "" >&2
echo "Commands:" >&2
echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
echo " key - Output the cache key" >&2
echo " path - Output the full S3 cache path" >&2
exit 1
;;
esac
...@@ -16,6 +16,18 @@ from urllib.parse import quote ...@@ -16,6 +16,18 @@ from urllib.parse import quote
import regex as re import regex as re
def normalize_package_name(name: str) -> str:
"""
Normalize package name according to PEP 503.
https://peps.python.org/pep-0503/#normalized-names
Replace runs of underscores, hyphens, and periods with a single hyphen,
and lowercase the result.
"""
return re.sub(r"[-_.]+", "-", name).lower()
if not sys.version_info >= (3, 12): if not sys.version_info >= (3, 12):
raise RuntimeError("This script requires Python 3.12 or higher.") raise RuntimeError("This script requires Python 3.12 or higher.")
...@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo: ...@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
version = version.removesuffix("." + variant) version = version.removesuffix("." + variant)
else: else:
if "+" in version: if "+" in version:
version, variant = version.split("+") version_part, suffix = version.split("+", 1)
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
# Git hashes and other suffixes are NOT variants
if suffix.startswith(("rocm", "cu", "cpu")):
variant = suffix
version = version_part
# Otherwise keep the full version string (variant stays None)
return WheelFileInfo( return WheelFileInfo(
package_name=package_name, package_name=package_name,
...@@ -206,6 +224,26 @@ def generate_index_and_metadata( ...@@ -206,6 +224,26 @@ def generate_index_and_metadata(
print("No wheel files found, skipping index generation.") print("No wheel files found, skipping index generation.")
return return
# For ROCm builds: inherit variant from vllm wheel
# All ROCm wheels should share the same variant as vllm
rocm_variant = None
for file in parsed_files:
if (
file.package_name == "vllm"
and file.variant
and file.variant.startswith("rocm")
):
rocm_variant = file.variant
print(f"Detected ROCm variant from vllm: {rocm_variant}")
break
# Apply ROCm variant to all wheels without a variant
if rocm_variant:
for file in parsed_files:
if file.variant is None:
file.variant = rocm_variant
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
# Group by variant # Group by variant
variant_to_files: dict[str, list[WheelFileInfo]] = {} variant_to_files: dict[str, list[WheelFileInfo]] = {}
for file in parsed_files: for file in parsed_files:
...@@ -256,8 +294,8 @@ def generate_index_and_metadata( ...@@ -256,8 +294,8 @@ def generate_index_and_metadata(
variant_dir.mkdir(parents=True, exist_ok=True) variant_dir.mkdir(parents=True, exist_ok=True)
# gather all package names in this variant # gather all package names in this variant (normalized per PEP 503)
packages = set(f.package_name for f in files) packages = set(normalize_package_name(f.package_name) for f in files)
if variant == "default": if variant == "default":
# these packages should also appear in the "project list" # these packages should also appear in the "project list"
# generate after all variants are processed # generate after all variants are processed
...@@ -269,8 +307,10 @@ def generate_index_and_metadata( ...@@ -269,8 +307,10 @@ def generate_index_and_metadata(
f.write(project_list_str) f.write(project_list_str)
for package in packages: for package in packages:
# filter files belonging to this package only # filter files belonging to this package only (compare normalized names)
package_files = [f for f in files if f.package_name == package] package_files = [
f for f in files if normalize_package_name(f.package_name) == package
]
package_dir = variant_dir / package package_dir = variant_dir / package
package_dir.mkdir(parents=True, exist_ok=True) package_dir.mkdir(parents=True, exist_ok=True)
index_str, metadata_str = generate_package_index_and_metadata( index_str, metadata_str = generate_package_index_and_metadata(
...@@ -341,8 +381,13 @@ if __name__ == "__main__": ...@@ -341,8 +381,13 @@ if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
version = args.version version = args.version
if "/" in version or "\\" in version: # Allow rocm/ prefix, reject other slashes and all backslashes
raise ValueError("Version string must not contain slashes.") if "\\" in version:
raise ValueError("Version string must not contain backslashes.")
if "/" in version and not version.startswith("rocm/"):
raise ValueError(
"Version string must not contain slashes (except for 'rocm/' prefix)."
)
current_objects_path = Path(args.current_objects) current_objects_path = Path(args.current_objects)
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
if not output_dir.exists(): if not output_dir.exists():
...@@ -393,8 +438,23 @@ if __name__ == "__main__": ...@@ -393,8 +438,23 @@ if __name__ == "__main__":
# Generate index and metadata, assuming wheels and indices are stored as: # Generate index and metadata, assuming wheels and indices are stored as:
# s3://vllm-wheels/{wheel_dir}/<wheel files> # s3://vllm-wheels/{wheel_dir}/<wheel files>
# s3://vllm-wheels/<anything>/<index files> # s3://vllm-wheels/<anything>/<index files>
wheel_dir = args.wheel_dir or version #
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/") # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
# - rocm/{commit}/ (same as wheels)
# - rocm/nightly/
# - rocm/{version}/
# All these are under the "rocm/" prefix, so relative paths should be
# relative to "rocm/", not the bucket root.
if args.wheel_dir:
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
wheel_dir = args.wheel_dir.strip().rstrip("/")
elif version.startswith("rocm/"):
# For rocm/commit, wheel_base_dir should be just the commit part
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
wheel_dir = version.split("/", 1)[1]
else:
wheel_dir = version
wheel_base_dir = Path(output_dir).parent / wheel_dir
index_base_dir = Path(output_dir) index_base_dir = Path(output_dir)
generate_index_and_metadata( generate_index_and_metadata(
......
#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Upload ROCm wheels to S3 with proper index generation
#
# Required environment variables:
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
# S3_BUCKET (default: vllm-wheels)
#
# S3 path structure:
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
set -ex
# ======== Configuration ========
BUCKET="${S3_BUCKET:-vllm-wheels}"
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
INDICES_OUTPUT_DIR="rocm-indices"
PYTHON="${PYTHON_PROG:-python3}"
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
MANYLINUX_VERSION="manylinux_2_35"
echo "========================================"
echo "ROCm Wheel Upload Configuration"
echo "========================================"
echo "S3 Bucket: $BUCKET"
echo "S3 Path: $ROCM_SUBPATH"
echo "Commit: $BUILDKITE_COMMIT"
echo "Branch: $BUILDKITE_BRANCH"
echo "========================================"
# ======== Part 0: Setup Python ========
# Detect if python3.12+ is available
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
if [[ "$has_new_python" -eq 0 ]]; then
# Use new python from docker
# Use --user to ensure files are created with correct ownership (not root)
docker pull python:3-slim
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
fi
echo "Using python interpreter: $PYTHON"
echo "Python version: $($PYTHON --version)"
# ======== Part 1: Collect and prepare wheels ========
# Collect all wheels
mkdir -p all-rocm-wheels
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
echo "Total wheels to upload: $WHEEL_COUNT"
if [ "$WHEEL_COUNT" -eq 0 ]; then
echo "ERROR: No wheels found to upload!"
exit 1
fi
# Rename linux to manylinux in wheel filenames
for wheel in all-rocm-wheels/*.whl; do
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
mv -- "$wheel" "$new_wheel"
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
fi
done
echo ""
echo "Wheels to upload:"
ls -lh all-rocm-wheels/
# ======== Part 2: Upload wheels to S3 ========
echo ""
echo "Uploading wheels to $S3_COMMIT_PREFIX"
for wheel in all-rocm-wheels/*.whl; do
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
done
# ======== Part 3: Generate and upload indices ========
# List existing wheels in commit directory
echo ""
echo "Generating indices..."
obj_json="rocm-objects.json"
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
mkdir -p "$INDICES_OUTPUT_DIR"
# Use the existing generate-nightly-index.py
# HACK: Replace regex module with stdlib re (same as CUDA script)
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
$PYTHON .buildkite/scripts/generate-nightly-index.py \
--version "$ROCM_SUBPATH" \
--current-objects "$obj_json" \
--output-dir "$INDICES_OUTPUT_DIR" \
--comment "ROCm commit $BUILDKITE_COMMIT"
# Upload indices to commit directory
echo "Uploading indices to $S3_COMMIT_PREFIX"
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
# Update rocm/nightly/ if on main branch and not a PR
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
echo "Updating rocm/nightly/ index..."
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
fi
# Extract version from vLLM wheel and update version-specific index
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
if [ -n "$VLLM_WHEEL" ]; then
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version in wheel: $VERSION"
PURE_VERSION="${VERSION%%+*}"
PURE_VERSION="${PURE_VERSION%%.rocm}"
echo "Pure version: $PURE_VERSION"
if [[ "$VERSION" != *"dev"* ]]; then
echo "Updating rocm/$PURE_VERSION/ index..."
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
fi
fi
# ======== Part 4: Summary ========
echo ""
echo "========================================"
echo "ROCm Wheel Upload Complete!"
echo "========================================"
echo ""
echo "Wheels available at:"
echo " s3://$BUCKET/$ROCM_SUBPATH/"
echo ""
echo "Install command (by commit):"
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
echo ""
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
echo "Install command (nightly):"
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
fi
echo ""
echo "Wheel count: $WHEEL_COUNT"
echo "========================================"
...@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0" ...@@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0"
ARG COMMON_WORKDIR=/app ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base ARG BASE_IMAGE=rocm/vllm-dev:base
# Sccache configuration (only used in release pipeline)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
ARG ARG_PYTORCH_ROCM_ARCH ARG ARG_PYTORCH_ROCM_ARCH
...@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 ...@@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl apt-transport-https ca-certificates wget curl
# Remove sccache
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
ARG USE_SCCACHE
RUN if [ "$USE_SCCACHE" != "1" ]; then \
apt-get purge -y sccache || true; \
python3 -m pip uninstall -y sccache || true; \
rm -f "$(which sccache)" || true; \
fi
# Install UV # Install UV
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
...@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" ...@@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts # Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
if command -v sccache >/dev/null 2>&1; then \
echo "sccache already installed, skipping installation"; \
sccache --version; \
else \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
fi; \
fi
# Set sccache environment variables only when USE_SCCACHE=1
# This prevents S3 config from leaking into images when sccache is not used
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR} WORKDIR ${COMMON_WORKDIR}
...@@ -51,7 +97,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm ...@@ -51,7 +97,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# ----------------------- # -----------------------
# vLLM build stages # vLLM build stages
FROM fetch_vllm AS build_vllm FROM fetch_vllm AS build_vllm
# Build vLLM # Build vLLM (setup.py auto-detects sccache in PATH)
RUN cd vllm \ RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \ && python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \ && python3 setup.py clean --all \
...@@ -67,6 +113,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ ...@@ -67,6 +113,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# RIXL/UCX build stages
FROM base AS build_rixl
ARG RIXL_BRANCH="f33a5599"
ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
ARG UCX_BRANCH="da3fac2a"
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
ENV ROCM_PATH=/opt/rocm
ENV UCX_HOME=/usr/local/ucx
ENV RIXL_HOME=/usr/local/rixl
ENV RIXL_BENCH_HOME=/usr/local/rixl_bench
# RIXL build system dependences and RDMA support
RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
libcpprest-dev \
libaio-dev \
librdmacm1 \
librdmacm-dev \
libibverbs1 \
libibverbs-dev \
ibverbs-utils \
rdmacm-utils \
ibverbs-providers \
&& rm -rf /var/lib/apt/lists/*
RUN uv pip install --system meson auditwheel patchelf tomlkit
RUN cd /usr/local/src && \
git clone ${UCX_REPO} && \
cd ucx && \
git checkout ${UCX_BRANCH} && \
./autogen.sh && \
mkdir build && cd build && \
../configure \
--prefix=/usr/local/ucx \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-devel-headers \
--with-rocm=/opt/rocm \
--with-verbs \
--with-dm \
--enable-mt && \
make -j && \
make install
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
RUN git clone ${RIXL_REPO} /opt/rixl && \
cd /opt/rixl && \
git checkout ${RIXL_BRANCH} && \
meson setup build --prefix=${RIXL_HOME} \
-Ducx_path=${UCX_HOME} \
-Drocm_path=${ROCM_PATH} && \
cd build && \
ninja && \
ninja install
# Generate RIXL wheel
RUN cd /opt/rixl && mkdir -p /app/install && \
./contrib/build-wheel.sh \
--output-dir /app/install \
--rocm-dir ${ROCM_PATH} \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
# -----------------------
# vLLM wheel release build stage (for building distributable wheels)
# This stage pins dependencies to custom ROCm wheel versions and handles version detection
FROM fetch_vllm AS build_vllm_wheel_release
ARG COMMON_WORKDIR
# Create /install directory for custom wheels
RUN mkdir -p /install
# Copy custom ROCm wheels from docker/context if they exist
# COPY ensures Docker cache is invalidated when wheels change
# .keep file ensures directory always exists for COPY to work
COPY docker/context/base-wheels/ /tmp/base-wheels/
# This is how we know if we are building for a wheel release or not.
# If there are not wheels found there, we are not building for a wheel release.
# So we exit with an error. To skip this stage.
RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
echo "Found custom wheels - copying to /install"; \
cp /tmp/base-wheels/*.whl /install/ && \
echo "Copied custom wheels:"; \
ls -lh /install/; \
else \
echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
echo "Wheel releases require pre-built ROCm wheels."; \
exit 1; \
fi
# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
ARG GIT_REPO_CHECK=0
RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
echo "Running repository checks..."; \
cd vllm && bash tools/check_repo.sh; \
fi
# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
# This ensures setuptools_scm sees clean repo state for version detection
RUN --mount=type=bind,source=.git,target=vllm/.git \
cd vllm \
&& pip install setuptools_scm \
&& VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
&& echo "Detected vLLM version: ${VLLM_VERSION}" \
&& echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
# Fail if git-based package dependencies are found in requirements files
# (uv doesn't handle git+ URLs well, and packages should be distributed on PyPI)
# Extra notes: pip install is able to handle git+ URLs, but uv doesn't.
RUN echo "Checking for git-based packages in requirements files..." \
&& echo "Checking common.txt for git-based packages:" \
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \
echo "ERROR: Git-based packages found in common.txt:"; \
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \
echo "Please publish these packages to PyPI instead of using git dependencies."; \
exit 1; \
else \
echo " ✓ No git-based packages found in common.txt"; \
fi \
&& echo "Checking rocm.txt for git-based packages:" \
&& if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \
echo "ERROR: Git-based packages found in rocm.txt:"; \
grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \
echo "Please publish these packages to PyPI instead of using git dependencies."; \
exit 1; \
else \
echo " ✓ No git-based packages found in rocm.txt"; \
fi \
&& echo "All requirements files are clean - no git-based packages found"
# Pin vLLM dependencies to exact versions of custom ROCm wheels
# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
&& python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
# Install dependencies using custom wheels from /install
RUN cd vllm \
&& echo "Building vLLM with custom wheels from /install" \
&& python3 -m pip install --find-links /install -r requirements/rocm.txt \
&& python3 setup.py clean --all
# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
# (setup.py auto-detects sccache in PATH)
RUN --mount=type=bind,source=.git,target=vllm/.git \
cd vllm \
&& export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
&& echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm_wheel_release
ARG COMMON_WORKDIR
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# ----------------------- # -----------------------
# Test vLLM image # Test vLLM image
FROM base AS test FROM base AS test
...@@ -159,3 +377,7 @@ ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf" ...@@ -159,3 +377,7 @@ ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt RUN echo "VLLM_BASE_IMAGE=${BASE_IMAGE}" >> ${COMMON_WORKDIR}/versions.txt
CMD ["/bin/bash"] CMD ["/bin/bash"]
#Set entrypoint for vllm-openai official images
FROM final As vllm-openai
ENTRYPOINT ["vllm", "serve"]
...@@ -14,16 +14,13 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git" ...@@ -14,16 +14,13 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git"
ARG MORI_BRANCH="2d02c6a9" ARG MORI_BRANCH="2d02c6a9"
ARG MORI_REPO="https://github.com/ROCm/mori.git" ARG MORI_REPO="https://github.com/ROCm/mori.git"
#TODO: When patch has been upstreamed, switch to the main repo/branch # Sccache configuration (only used in release pipeline)
# ARG RIXL_BRANCH="<TODO>" ARG USE_SCCACHE
# ARG RIXL_REPO="https://github.com/ROCm/RIXL.git" ARG SCCACHE_DOWNLOAD_URL
ARG RIXL_BRANCH="50d63d94" ARG SCCACHE_ENDPOINT
ARG RIXL_REPO="https://github.com/vcave/RIXL.git" ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
# Needed by RIXL ARG SCCACHE_REGION_NAME=us-west-2
ARG ETCD_BRANCH="7c6e714f" ARG SCCACHE_S3_NO_CREDENTIALS=0
ARG ETCD_REPO="https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git"
ARG UCX_BRANCH="da3fac2a"
ARG UCX_REPO="https://github.com/ROCm/ucx.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
...@@ -64,6 +61,49 @@ RUN apt-get update -y \ ...@@ -64,6 +61,49 @@ RUN apt-get update -y \
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& SCCACHE_ARCH="x86_64" \
&& SCCACHE_VERSION="v0.8.1" \
&& SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
&& curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \
&& tar -xzf /tmp/sccache.tar.gz -C /tmp \
&& mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
&& chmod +x /usr/bin/sccache \
&& rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
&& sccache --version; \
fi
# Setup sccache for HIP compilation via HIP_CLANG_PATH
# This creates wrapper scripts in a separate directory and points HIP to use them
# This avoids modifying the original ROCm binaries which can break detection
# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm)
# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1
RUN if [ "$USE_SCCACHE" = "1" ]; then \
echo "Setting up sccache wrappers for HIP compilation..." \
&& mkdir -p /opt/sccache-wrappers \
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \
&& chmod +x /opt/sccache-wrappers/clang++ \
&& printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \
&& chmod +x /opt/sccache-wrappers/clang \
&& echo "sccache wrappers created in /opt/sccache-wrappers"; \
fi
# Set sccache environment variables only when USE_SCCACHE=1
# This prevents S3 config from leaking into images when sccache is not used
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
### ###
### Triton Build ### Triton Build
...@@ -100,22 +140,42 @@ ARG PYTORCH_AUDIO_BRANCH ...@@ -100,22 +140,42 @@ ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO ARG PYTORCH_AUDIO_REPO
ARG USE_SCCACHE
RUN git clone ${PYTORCH_REPO} pytorch RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \ RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
&& pip install -r requirements.txt && git submodule update --init --recursive \ && pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \ && python3 tools/amd_build/build_amd.py \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache \
&& sccache --show-stats; \
fi \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl && pip install dist/*.whl
RUN git clone ${PYTORCH_VISION_REPO} vision RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
fi \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl && pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \ RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& pip install -r requirements.txt \ && pip install -r requirements.txt \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& export CMAKE_C_COMPILER_LAUNCHER=sccache \
&& export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \
fi \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& pip install dist/*.whl && pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \
...@@ -230,13 +290,19 @@ RUN cd /opt/rixl && mkdir -p /app/install && \ ...@@ -230,13 +290,19 @@ RUN cd /opt/rixl && mkdir -p /app/install && \
FROM base AS build_fa FROM base AS build_fa
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN git clone ${FA_REPO} RUN git clone ${FA_REPO}
RUN cd flash-attention \ RUN cd flash-attention \
&& git checkout ${FA_BRANCH} \ && git checkout ${FA_BRANCH} \
&& git submodule update --init \ && git submodule update --init \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist && if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi
RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
...@@ -246,6 +312,7 @@ RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install ...@@ -246,6 +312,7 @@ RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install
FROM base AS build_aiter FROM base AS build_aiter
ARG AITER_BRANCH ARG AITER_BRANCH
ARG AITER_REPO ARG AITER_REPO
ARG USE_SCCACHE
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
pip install /install/*.whl pip install /install/*.whl
RUN git clone --recursive ${AITER_REPO} RUN git clone --recursive ${AITER_REPO}
...@@ -253,13 +320,37 @@ RUN cd aiter \ ...@@ -253,13 +320,37 @@ RUN cd aiter \
&& git checkout ${AITER_BRANCH} \ && git checkout ${AITER_BRANCH} \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& pip install -r requirements.txt && pip install -r requirements.txt
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN pip install pyyaml && cd aiter \
&& if [ "$USE_SCCACHE" = "1" ]; then \
export HIP_CLANG_PATH=/opt/sccache-wrappers \
&& sccache --show-stats; \
fi \
&& PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \
&& if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \
&& ls /app/aiter/dist/*.whl
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
### ###
### Final Build ### Final Build
### ###
# Wheel release stage -
# only includes dependencies used by wheel release pipeline
FROM base AS debs_wheel_release
RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
cp /install/*.whl /app/debs
# Full debs stage - includes Mori (used by Docker releases)
FROM base AS debs FROM base AS debs
RUN mkdir /app/debs RUN mkdir /app/debs
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
......
...@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`: ...@@ -47,7 +47,7 @@ The key parameters for chunked processing are in the `--pooler-config`:
```json ```json
{ {
"pooling_type": "auto", "pooling_type": "auto",
"normalize": true, "use_activation": true,
"enable_chunked_processing": true, "enable_chunked_processing": true,
"max_embed_len": 3072000 "max_embed_len": 3072000
} }
......
...@@ -14,7 +14,7 @@ Prerequisites: ...@@ -14,7 +14,7 @@ Prerequisites:
# MEAN pooling (processes all chunks, recommended for complete coverage) # MEAN pooling (processes all chunks, recommended for complete coverage)
vllm serve intfloat/multilingual-e5-large \ vllm serve intfloat/multilingual-e5-large \
--pooler-config \ --pooler-config \
'{"pooling_type": "MEAN", "normalize": true, ' \ '{"pooling_type": "MEAN", "use_activation": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 3072000}' \ '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
--served-model-name multilingual-e5-large \ --served-model-name multilingual-e5-large \
--trust-remote-code \ --trust-remote-code \
...@@ -24,7 +24,7 @@ Prerequisites: ...@@ -24,7 +24,7 @@ Prerequisites:
# OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks) # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
vllm serve BAAI/bge-large-en-v1.5 \ vllm serve BAAI/bge-large-en-v1.5 \
--pooler-config \ --pooler-config \
'{"pooling_type": "CLS", "normalize": true, ' \ '{"pooling_type": "CLS", "use_activation": true, ' \
'"enable_chunked_processing": true, "max_embed_len": 1048576}' \ '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
--served-model-name bge-large-en-v1.5 \ --served-model-name bge-large-en-v1.5 \
--trust-remote-code \ --trust-remote-code \
......
...@@ -96,7 +96,7 @@ echo "" ...@@ -96,7 +96,7 @@ echo ""
echo "🔧 Starting server with enhanced chunked processing configuration..." echo "🔧 Starting server with enhanced chunked processing configuration..."
# Build pooler config JSON # Build pooler config JSON
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}" POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"use_activation\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
# Start vLLM server with enhanced chunked processing # Start vLLM server with enhanced chunked processing
vllm serve "$MODEL_NAME" \ vllm serve "$MODEL_NAME" \
......
...@@ -80,6 +80,8 @@ num2words==0.5.14 ...@@ -80,6 +80,8 @@ num2words==0.5.14
pqdm==0.2.0 pqdm==0.2.0
# via lm-eval # via lm-eval
# Required for fastsafetensors test
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
# Required for suffix decoding test # Required for suffix decoding test
arctic-inference == 0.1.1 arctic-inference == 0.1.1
# Required for Nemotron test # Required for Nemotron test
......
...@@ -15,5 +15,5 @@ setuptools-scm>=8 ...@@ -15,5 +15,5 @@ setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3 runai-model-streamer[s3,gcs]==0.15.3
# conch-triton-kernels==1.2.1 # conch-triton-kernels==1.2.1
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459 timm>=1.0.17
grpcio-tools>=1.76.0 grpcio-tools>=1.76.0
\ No newline at end of file
...@@ -53,7 +53,9 @@ def test_token_embed(llm: LLM): ...@@ -53,7 +53,9 @@ def test_token_embed(llm: LLM):
def test_pooling_params(llm: LLM): def test_pooling_params(llm: LLM):
def get_outputs(normalize): def get_outputs(normalize):
outputs = llm.embed( outputs = llm.embed(
prompts, pooling_params=PoolingParams(normalize=normalize), use_tqdm=False prompts,
pooling_params=PoolingParams(use_activation=normalize),
use_tqdm=False,
) )
return torch.tensor([x.outputs.embedding for x in outputs]) return torch.tensor([x.outputs.embedding for x in outputs])
......
...@@ -216,7 +216,7 @@ def server_with_chunked_processing(): ...@@ -216,7 +216,7 @@ def server_with_chunked_processing():
"512", # Set smaller max_model_len to trigger chunking mechanism "512", # Set smaller max_model_len to trigger chunking mechanism
"--pooler-config", "--pooler-config",
( (
'{"pooling_type": "MEAN", "normalize": true, ' '{"pooling_type": "MEAN", "use_activation": true, '
'"enable_chunked_processing": true, "max_embed_len": 10000}' '"enable_chunked_processing": true, "max_embed_len": 10000}'
), ),
"--gpu-memory-utilization", "--gpu-memory-utilization",
......
...@@ -236,17 +236,14 @@ class TestModel: ...@@ -236,17 +236,14 @@ class TestModel:
"use_activation": use_activation, "use_activation": use_activation,
}, },
) )
if response.status_code != 200:
return response
outputs = response.json() outputs = response.json()
return torch.tensor([x["score"] for x in outputs["data"]]) return torch.tensor([x["score"] for x in outputs["data"]])
if model["is_cross_encoder"]: default = get_outputs(use_activation=None)
default = get_outputs(use_activation=None) w_activation = get_outputs(use_activation=True)
w_activation = get_outputs(use_activation=True) wo_activation = get_outputs(use_activation=False)
wo_activation = get_outputs(use_activation=False)
if model["is_cross_encoder"]:
assert torch.allclose(default, w_activation, atol=1e-2), ( assert torch.allclose(default, w_activation, atol=1e-2), (
"Default should use activation." "Default should use activation."
) )
...@@ -256,9 +253,3 @@ class TestModel: ...@@ -256,9 +253,3 @@ class TestModel:
assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), ( assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
"w_activation should be close to activation(wo_activation)." "w_activation should be close to activation(wo_activation)."
) )
else:
get_outputs(use_activation=None)
# The activation parameter only works for the is_cross_encoder model
response = get_outputs(use_activation=True)
assert response.status_code == 400
...@@ -48,7 +48,7 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -48,7 +48,7 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
# asserts on the pooling config files # asserts on the pooling config files
assert model_config.pooler_config.seq_pooling_type == "CLS" assert model_config.pooler_config.seq_pooling_type == "CLS"
assert model_config.pooler_config.tok_pooling_type == "ALL" assert model_config.pooler_config.tok_pooling_type == "ALL"
assert model_config.pooler_config.normalize assert model_config.pooler_config.use_activation
# asserts on the tokenizer loaded # asserts on the tokenizer loaded
assert model_config.tokenizer == "BAAI/bge-base-en-v1.5" assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
...@@ -93,7 +93,7 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -93,7 +93,7 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
# asserts on the pooling config files # asserts on the pooling config files
assert model_config.pooler_config.seq_pooling_type == "MEAN" assert model_config.pooler_config.seq_pooling_type == "MEAN"
assert model_config.pooler_config.tok_pooling_type == "ALL" assert model_config.pooler_config.tok_pooling_type == "ALL"
assert model_config.pooler_config.normalize assert model_config.pooler_config.use_activation
# asserts on the tokenizer loaded # asserts on the tokenizer loaded
assert model_config.tokenizer == "intfloat/multilingual-e5-base" assert model_config.tokenizer == "intfloat/multilingual-e5-base"
......
...@@ -66,7 +66,7 @@ def test_embed_models_using_normalize( ...@@ -66,7 +66,7 @@ def test_embed_models_using_normalize(
model, model,
max_model_len=512, max_model_len=512,
dtype=dtype, dtype=dtype,
pooler_config=PoolerConfig(normalize=False), pooler_config=PoolerConfig(use_activation=False),
) as vllm_model: ) as vllm_model:
wo_normalize = torch.tensor(vllm_model.embed(example_prompts)) wo_normalize = torch.tensor(vllm_model.embed(example_prompts))
...@@ -74,7 +74,7 @@ def test_embed_models_using_normalize( ...@@ -74,7 +74,7 @@ def test_embed_models_using_normalize(
model, model,
max_model_len=512, max_model_len=512,
dtype=dtype, dtype=dtype,
pooler_config=PoolerConfig(normalize=True), pooler_config=PoolerConfig(use_activation=True),
) as vllm_model: ) as vllm_model:
w_normalize = torch.tensor(vllm_model.embed(example_prompts)) w_normalize = torch.tensor(vllm_model.embed(example_prompts))
...@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize( ...@@ -146,7 +146,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model, model,
max_model_len=512, max_model_len=512,
dtype=dtype, dtype=dtype,
pooler_config=PoolerConfig(normalize=False), pooler_config=PoolerConfig(use_activation=False),
) as vllm_model: ) as vllm_model:
wo_normalize = vllm_model.token_embed(example_prompts) wo_normalize = vllm_model.token_embed(example_prompts)
...@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize( ...@@ -154,7 +154,7 @@ def test_multi_vector_retrieval_models_using_normalize(
model, model,
max_model_len=512, max_model_len=512,
dtype=dtype, dtype=dtype,
pooler_config=PoolerConfig(normalize=True), pooler_config=PoolerConfig(use_activation=True),
) as vllm_model: ) as vllm_model:
w_normalize = vllm_model.token_embed(example_prompts) w_normalize = vllm_model.token_embed(example_prompts)
......
...@@ -160,7 +160,7 @@ def test_get_pooling_config(): ...@@ -160,7 +160,7 @@ def test_get_pooling_config():
model_config = ModelConfig(model_id) model_config = ModelConfig(model_id)
assert model_config.pooler_config is not None assert model_config.pooler_config is not None
assert model_config.pooler_config.normalize assert model_config.pooler_config.use_activation
assert model_config.pooler_config.seq_pooling_type == "MEAN" assert model_config.pooler_config.seq_pooling_type == "MEAN"
assert model_config.pooler_config.tok_pooling_type == "ALL" assert model_config.pooler_config.tok_pooling_type == "ALL"
......
...@@ -18,7 +18,7 @@ EMBEDDING_MODELS = [ ...@@ -18,7 +18,7 @@ EMBEDDING_MODELS = [
] ]
classify_parameters = ["use_activation"] classify_parameters = ["use_activation"]
embed_parameters = ["dimensions", "normalize"] embed_parameters = ["dimensions", "use_activation"]
step_pooling_parameters = ["step_tag_id", "returned_token_ids"] step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
...@@ -42,17 +42,17 @@ def test_embed(): ...@@ -42,17 +42,17 @@ def test_embed():
task = "embed" task = "embed"
model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS")) model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
pooling_params = PoolingParams(normalize=None) pooling_params = PoolingParams(use_activation=None)
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=True) pooling_params = PoolingParams(use_activation=True)
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=False) pooling_params = PoolingParams(use_activation=False)
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
invalid_parameters = classify_parameters + step_pooling_parameters invalid_parameters = classify_parameters + step_pooling_parameters
for p in invalid_parameters: for p in set(invalid_parameters) - set(embed_parameters):
with pytest.raises(ValueError): with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True}) pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
...@@ -98,7 +98,7 @@ def test_classify(task): ...@@ -98,7 +98,7 @@ def test_classify(task):
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
invalid_parameters = embed_parameters + step_pooling_parameters invalid_parameters = embed_parameters + step_pooling_parameters
for p in invalid_parameters: for p in set(invalid_parameters) - set(classify_parameters):
with pytest.raises(ValueError): with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True}) pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
...@@ -111,20 +111,20 @@ def test_token_embed(pooling_type: str): ...@@ -111,20 +111,20 @@ def test_token_embed(pooling_type: str):
pooler_config=PoolerConfig(tok_pooling_type=pooling_type) pooler_config=PoolerConfig(tok_pooling_type=pooling_type)
) )
pooling_params = PoolingParams(normalize=None) pooling_params = PoolingParams(use_activation=None)
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=True) pooling_params = PoolingParams(use_activation=True)
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
pooling_params = PoolingParams(normalize=False) pooling_params = PoolingParams(use_activation=False)
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
invalid_parameters = classify_parameters invalid_parameters = classify_parameters
if pooling_type != "STEP": if pooling_type != "STEP":
invalid_parameters = classify_parameters + step_pooling_parameters invalid_parameters = classify_parameters + step_pooling_parameters
for p in invalid_parameters: for p in set(invalid_parameters) - set(embed_parameters):
with pytest.raises(ValueError): with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True}) pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
...@@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str): ...@@ -150,7 +150,7 @@ def test_token_classify(pooling_type: str):
if pooling_type != "STEP": if pooling_type != "STEP":
invalid_parameters = embed_parameters + step_pooling_parameters invalid_parameters = embed_parameters + step_pooling_parameters
for p in invalid_parameters: for p in set(invalid_parameters) - set(classify_parameters):
with pytest.raises(ValueError): with pytest.raises(ValueError):
pooling_params = PoolingParams(**{p: True}) pooling_params = PoolingParams(**{p: True})
pooling_params.verify(task=task, model_config=model_config) pooling_params.verify(task=task, model_config=model_config)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment