Unverified Commit b569620f authored by Andrey Talman's avatar Andrey Talman Committed by GitHub
Browse files

[CI] Add PyTorch nightly build and test pipeline (#37226)


Signed-off-by: default avataratalman <atalman@fb.com>
parent 65b98089
#!/bin/bash
set -euo pipefail
# Build a vLLM test image with PyTorch nightly installed.
# Called by the pipeline generator's "vLLM Against PyTorch Nightly" group.
if [[ $# -lt 5 ]]; then
echo "Usage: $0 <registry> <repo> <commit> <branch> <image_tag>"
exit 1
fi
REGISTRY=$1
REPO=$2
BUILDKITE_COMMIT=$3
BRANCH=$4
IMAGE_TAG=$5
# --- Arguments ---
echo "--- :mag: Arguments"
echo "REGISTRY: ${REGISTRY}"
echo "REPO: ${REPO}"
echo "BUILDKITE_COMMIT: ${BUILDKITE_COMMIT}"
echo "BRANCH: ${BRANCH}"
echo "IMAGE_TAG: ${IMAGE_TAG}"
# --- ECR login ---
echo "--- :key: ECR login"
aws ecr-public get-login-password --region us-east-1 \
| docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 \
| docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
# --- Set up buildx ---
echo "--- :docker: Setting up buildx"
docker buildx create --name vllm-builder --driver docker-container --use || true
docker buildx inspect --bootstrap
docker buildx ls
# --- Skip if image already exists ---
echo "--- :mag: Checking if image already exists"
if docker manifest inspect "$IMAGE_TAG" >/dev/null 2>&1; then
echo "Image found: $IMAGE_TAG — skipping build"
exit 0
fi
echo "Image not found, proceeding with build..."
# --- CUDA 13.0 for nightly builds ---
# Nightly CI uses CUDA 13.0 while regular CI stays on CUDA 12.9
NIGHTLY_CUDA_VERSION="13.0.0"
NIGHTLY_BUILD_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-devel-ubuntu22.04"
NIGHTLY_FINAL_BASE_IMAGE="nvidia/cuda:${NIGHTLY_CUDA_VERSION}-base-ubuntu22.04"
echo "--- :docker: Building torch nightly image (CUDA ${NIGHTLY_CUDA_VERSION})"
docker buildx build --file docker/Dockerfile \
--build-arg max_jobs=16 \
--build-arg buildkite_commit="$BUILDKITE_COMMIT" \
--build-arg USE_SCCACHE=1 \
--build-arg PYTORCH_NIGHTLY=1 \
--build-arg CUDA_VERSION="${NIGHTLY_CUDA_VERSION}" \
--build-arg BUILD_BASE_IMAGE="${NIGHTLY_BUILD_BASE_IMAGE}" \
--build-arg FINAL_BASE_IMAGE="${NIGHTLY_FINAL_BASE_IMAGE}" \
--build-arg torch_cuda_arch_list="8.0 8.9 9.0 10.0 12.0" \
--tag "$IMAGE_TAG" \
--push \
--target test \
--progress plain .
echo "--- :white_check_mark: Torch nightly image build complete: $IMAGE_TAG"
......@@ -13,10 +13,11 @@ steps:
commands:
# Run a subset of model initialization tests
- pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
mirror:
torch_nightly: {}
- label: Basic Models Tests (Extra Initialization) %N
timeout_in_minutes: 45
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/test_initialization.py
......@@ -27,6 +28,8 @@ steps:
# test.) Also run if model initialization test file is modified
- pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
mirror:
torch_nightly: {}
- label: Basic Models Tests (Other)
timeout_in_minutes: 45
......
......@@ -4,7 +4,6 @@ depends_on:
steps:
- label: Language Models Tests (Standard)
timeout_in_minutes: 25
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language
......@@ -12,10 +11,11 @@ steps:
# Test standard language models, excluding a subset of slow tests
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and (not slow_test)'
mirror:
torch_nightly: {}
- label: Language Models Tests (Extra Standard) %N
timeout_in_minutes: 45
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
- tests/models/language/pooling/test_embedding.py
......@@ -27,10 +27,11 @@ steps:
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
mirror:
torch_nightly: {}
- label: Language Models Tests (Hybrid) %N
timeout_in_minutes: 75
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/models/language/generation
......@@ -42,6 +43,8 @@ steps:
# Shard hybrid language model tests
- pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
parallelism: 2
mirror:
torch_nightly: {}
- label: Language Models Test (Extended Generation) # 80min
timeout_in_minutes: 110
......@@ -62,7 +65,7 @@ steps:
- image-build-amd
commands:
- uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment