Unverified Commit a9078ed0 authored by Pavithra Vijayakrishnan's avatar Pavithra Vijayakrishnan Committed by GitHub
Browse files

ci: Adding nightly pipeline workflow (#4204)


Signed-off-by: default avatarpvijayakrish <pvijayakrish@nvidia.com>
Signed-off-by: default avatarPavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com>
parent d03c0976
......@@ -49,6 +49,12 @@ inputs:
torch_backend:
description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
required: false
enable_kvbm:
description: 'Enable KVBM support (optional)'
required: false
dynamo_base_image:
description: 'Pre-built Dynamo base image to use instead of building from scratch'
required: false
outputs:
image_tag:
......@@ -72,14 +78,9 @@ runs:
aws ecr get-login-password --region ${{ inputs.aws_default_region }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
- name: Login to NGC
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
shell: bash
run: |
echo "${{ inputs.ngc_ci_access_token }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
- name: Cleanup
if: always()
shell: bash
run: |
docker system prune -af
uses: ./.github/actions/docker-login
with:
ngc_ci_access_token: ${{ inputs.ngc_ci_access_token }}
- name: Build image
id: build
shell: bash
......@@ -125,6 +126,12 @@ runs:
if [ -n "${{ inputs.torch_backend }}" ]; then
EXTRA_ARGS+=" --build-arg TORCH_BACKEND=${{ inputs.torch_backend }}"
fi
if [ -n "${{ inputs.dynamo_base_image }}" ]; then
EXTRA_ARGS+=" --dynamo-base-image ${{ inputs.dynamo_base_image }}"
fi
if [ -n "${{ inputs.enable_kvbm }}" ]; then
EXTRA_ARGS+=" --build-arg ENABLE_KVBM=${{ inputs.enable_kvbm }}"
fi
# Execute build and capture output (show on console AND save to file)
./container/build.sh --tag "$IMAGE_TAG" \
......@@ -289,7 +296,7 @@ runs:
uses: actions/upload-artifact@v4
if: always()
with:
name: build-metrics-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
name: build-metrics-${{ inputs.framework }}-${{ inputs.target }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
path: build-metrics/build-${{ inputs.framework }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}.json
retention-days: 7
# Composite action that logs the Docker CLI in to up to three registries.
# Each login step runs only when every input it needs is non-empty, so a
# caller supplies credentials just for the registries it uses.
#
# Credentials are handed to the shell through step-level `env` rather than
# being expanded into the script text with ${{ }}: a value containing quotes,
# `$` or backticks would otherwise be re-parsed by bash.
name: 'Docker Login'
description: 'Login to multiple container registries (ECR, NGC, ACR)'
inputs:
  ngc_ci_access_token:
    description: 'NGC CI Access Token'
    required: false
  aws_default_region:
    description: 'AWS Default Region'
    required: false
  aws_account_id:
    description: 'AWS Account ID'
    required: false
  azure_acr_hostname:
    description: 'Azure ACR hostname'
    required: false
  azure_acr_user:
    description: 'Azure ACR user'
    required: false
  azure_acr_password:
    description: 'Azure ACR password'
    required: false
runs:
  using: "composite"
  steps:
    # ECR: requires both the region and the account id.
    - name: ECR Login
      shell: bash
      if: ${{ inputs.aws_default_region != '' && inputs.aws_account_id != '' }}
      env:
        # Deliberately not named AWS_DEFAULT_REGION/AWS_REGION so the step does
        # not change how the AWS CLI resolves its own configuration.
        ECR_REGION: ${{ inputs.aws_default_region }}
        ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
      run: |
        set -euo pipefail
        aws ecr get-login-password --region "${ECR_REGION}" | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
    # NGC: the username is the literal string $oauthtoken (single-quoted on purpose).
    - name: NGC Login
      if: ${{ inputs.ngc_ci_access_token != '' }}
      shell: bash
      env:
        NGC_CI_ACCESS_TOKEN: ${{ inputs.ngc_ci_access_token }}
      run: |
        set -euo pipefail
        printf '%s\n' "${NGC_CI_ACCESS_TOKEN}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
    # ACR: requires hostname, user and password.
    - name: ACR Login
      shell: bash
      if: ${{ inputs.azure_acr_hostname != '' && inputs.azure_acr_user != '' && inputs.azure_acr_password != '' }}
      env:
        AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
        AZURE_ACR_USER: ${{ inputs.azure_acr_user }}
        AZURE_ACR_PASSWORD: ${{ inputs.azure_acr_password }}
      run: |
        set -euo pipefail
        printf '%s\n' "${AZURE_ACR_PASSWORD}" | docker login "${AZURE_ACR_HOSTNAME}" --username "${AZURE_ACR_USER}" --password-stdin
name: 'Docker Tag and Push'
description: 'Tag and Push Docker Images'
inputs:
local_image:
description: 'Local Image Name:Tag'
required: true
push_tag:
description: 'Target Name:Tag'
push_tags:
description: 'Target Name:Tag (newline-separated list for multiple tags)'
required: true
aws_push:
description: 'Push to AWS Boolean'
......@@ -38,37 +39,48 @@ inputs:
required: false
outputs:
image_tag:
description: 'Image Tag'
value: ${{ inputs.push_tag }}
image_tags:
description: 'Image Tags'
value: ${{ inputs.push_tags }}
runs:
using: "composite"
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: ACR Login
shell: bash
if: ${{ inputs.azure_push == 'true' }}
run: |
echo "${{ inputs.azure_acr_password }}" | docker login ${{ inputs.azure_acr_hostname }} --username ${{ inputs.azure_acr_user }} --password-stdin
- name: ECR Tag and Push
shell: bash
if: ${{ inputs.aws_push == 'true' }}
env:
LOCAL_IMAGE: ${{ inputs.local_image }}
PUSH_TAG: ${{ inputs.push_tag }}
PUSH_TAGS: ${{ inputs.push_tags }}
ECR_HOSTNAME: ${{ inputs.aws_account_id }}.dkr.ecr.${{ inputs.aws_default_region }}.amazonaws.com
run: |
docker tag ${LOCAL_IMAGE} ${ECR_HOSTNAME}/${PUSH_TAG}
docker push ${ECR_HOSTNAME}/${PUSH_TAG}
set -euo pipefail
while IFS= read -r TAG; do
if [ -z "$TAG" ]; then
continue
fi
echo "Tagging and pushing: ${ECR_HOSTNAME}/${TAG}"
docker tag "${LOCAL_IMAGE}" "${ECR_HOSTNAME}/${TAG}"
docker push "${ECR_HOSTNAME}/${TAG}"
done <<< "$PUSH_TAGS"
- name: ACR Tag and Push
shell: bash
if: ${{ inputs.azure_push == 'true' }}
env:
LOCAL_IMAGE: ${{ inputs.local_image }}
PUSH_TAG: ${{ inputs.push_tag }}
PUSH_TAGS: ${{ inputs.push_tags }}
AZURE_ACR_HOSTNAME: ${{ inputs.azure_acr_hostname }}
run: |
docker tag ${LOCAL_IMAGE} ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
docker push ${AZURE_ACR_HOSTNAME}/${PUSH_TAG}
set -euo pipefail
while IFS= read -r TAG; do
if [ -z "$TAG" ]; then
continue
fi
echo "Tagging and pushing: ${AZURE_ACR_HOSTNAME}/${TAG}"
docker tag "${LOCAL_IMAGE}" "${AZURE_ACR_HOSTNAME}/${TAG}"
docker push "${AZURE_ACR_HOSTNAME}/${TAG}"
done <<< "$PUSH_TAGS"
......@@ -24,6 +24,10 @@ inputs:
description: 'Platform architecture (amd64, arm64)'
required: false
default: 'amd64'
dry_run:
description: 'Run pytest in dry-run mode (collect tests only, do not execute)'
required: false
default: 'false'
runs:
......@@ -54,21 +58,32 @@ runs:
# Run pytest with detailed output and JUnit XML
set +e # Don't exit on test failures
# Determine docker runtime flags and pytest command based on dry_run mode
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
echo "🔍 Running pytest in dry-run mode (collect-only, no GPU required)"
GPU_FLAGS=""
PYTEST_CMD="pytest -v --collect-only -m \"${{ inputs.pytest_marks }}\""
else
echo "🚀 Running pytest in normal mode"
PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
# Detect GPU availability and conditionally add GPU flags
GPU_FLAGS=""
if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
echo "GPU detected, enabling GPU runtime"
echo "GPU detected, enabling GPU runtime"
GPU_FLAGS="--runtime=nvidia --gpus all"
else
echo "No GPU detected, running in CPU-only mode"
echo "⚠️ No GPU detected, running in CPU-only mode"
fi
fi
docker run ${GPU_FLAGS} --rm -w /workspace \
# Run without --rm so we can copy results even if container crashes (example SIGSEGV exit 139)
docker run ${GPU_FLAGS} -w /workspace \
--cpus=${NUM_CPUS} \
--network host \
--name ${{ env.CONTAINER_ID }}_pytest \
${{ inputs.image_tag }} \
bash -c "mkdir -p /workspace/test-results && pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 -m \"${{ inputs.pytest_marks }}\""
bash -c "mkdir -p /workspace/test-results && ${PYTEST_CMD}"
TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
......@@ -92,6 +107,13 @@ runs:
STR_TEST_TYPE=$(echo "${{ inputs.test_type }}" | tr ', ' '_')
echo "STR_TEST_TYPE=${STR_TEST_TYPE}" >> $GITHUB_ENV
# Skip XML processing if in dry-run mode
if [[ "${{ inputs.dry_run }}" == "true" ]]; then
echo "✅ Dry-run mode: Test collection completed"
echo "⏭️ No JUnit XML generated (dry-run mode)"
exit 0
fi
# Check for JUnit XML file and determine test status
JUNIT_FILE="test-results/pytest_test_report.xml"
......@@ -133,7 +155,7 @@ runs:
- name: Upload Test Results
uses: actions/upload-artifact@v4
if: always() # Always upload test results, even if tests failed
if: always() && inputs.dry_run != 'true' # Skip upload in dry-run mode
with:
name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}
path: |
......
......@@ -72,11 +72,10 @@ jobs:
with:
driver: docker
- name: Login to ECR
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
run: |
aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME}
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Linter
shell: bash
env:
......@@ -120,7 +119,7 @@ jobs:
uses: ./.github/actions/docker-tag-push
with:
local_image: dynamo-operator:latest
push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
aws_push: 'false'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
......@@ -165,11 +164,18 @@ jobs:
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tag: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
......@@ -223,11 +229,18 @@ jobs:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
......@@ -281,11 +294,18 @@ jobs:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tag: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
# OPS-1145: Switch aws_push to true
aws_push: 'false'
azure_push: 'true'
......
......@@ -33,8 +33,9 @@ jobs:
uses: docker/setup-buildx-action@v3
- name: Login to NGC
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
run: |
echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
uses: ./.github/actions/docker-login
with:
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Define Image Tag
id: define_image_tag
run: |
......
name: Nightly CI
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Nightly CI pipeline
# Triggers: a daily schedule plus manual runs via workflow_dispatch.
on:
schedule:
# GitHub evaluates cron expressions in UTC, so the local start time shifts by
# one hour when US Pacific time is on daylight saving.
- cron: '0 8 * * *' # Daily at 08:00 UTC (12:00 AM PST / 1:00 AM PDT)
workflow_dispatch:
permissions:
contents: read
defaults:
run:
shell: bash --noprofile --norc -eo pipefail {0}
env:
REGISTRY_IMAGE: ai-dynamo/dynamo
NIGHTLY_IMAGE_PREFIX: nightly
############################## BUILD JOBS ##############################
jobs:
vllm:
build-amd64:
name: Build ${{ matrix.framework }} (amd64)
runs-on: cpu-amd-m5-4xlarge
timeout-minutes: 120
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: gpu-l40-amd64 }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: vllm (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
framework: [vllm, trtllm, sglang]
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
steps:
- name: Checkout code
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 #v4.3.1
- name: Build vLLM Docker Image
id: build-vllm
- uses: actions/checkout@v4
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Pull existing images for cache
shell: bash
continue-on-error: true
run: |
echo "Attempting to pull existing images for layer caching..."
docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64" || echo "Framework image not found in cache"
docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64" || echo "Runtime image not found in cache"
echo "Cache pull completed"
- name: Build Framework Image
id: build_framework
uses: ./.github/actions/docker-build
with:
framework: vllm
framework: ${{ matrix.framework }}
target: framework
platform: linux/amd64
base_image_tag: ''
runtime_image_tag: ''
cuda_version: ''
torch_backend: ''
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
image_tag: framework-${{ matrix.framework }}-amd64:${{ github.run_id }}
- name: Tag and Push Framework Images
uses: ./.github/actions/docker-tag-push
with:
local_image: framework-${{ matrix.framework }}-amd64:${{ github.run_id }}
push_tags: |
${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64
${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64-run-${{ github.run_id }}
aws_push: 'true'
azure_push: 'false'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
- name: Build Runtime Image
id: build_runtime
uses: ./.github/actions/docker-build
with:
framework: ${{ matrix.framework }}
target: runtime
platform: linux/${{ matrix.platform.arch }}
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
platform: linux/amd64
base_image_tag: ''
runtime_image_tag: ''
cuda_version: ''
torch_backend: ''
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
......@@ -36,70 +98,77 @@ jobs:
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
image_tag: nightly-vllm-${{ matrix.platform.arch }}
- name: Tag and Push vLLM Nightly Image
image_tag: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}
- name: Tag and Push Runtime Images
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-vllm.outputs.image_tag }}
# Tag the image nightly
push_tag: ai-dynamo/dynamo:nightly-vllm-${{ matrix.platform.arch }}
aws_push: 'false'
local_image: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}
push_tags: |
${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64
${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64-run-${{ github.run_id }}
aws_push: 'true'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Run unit tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: nightly-vllm-${{ matrix.platform.arch }}
pytest_marks: "vllm and unit"
framework: "vllm"
test_type: "unit"
platform_arch: ${{ matrix.platform.arch }}
- name: Run e2e tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: nightly-vllm-${{ matrix.platform.arch }}
pytest_marks: "nightly and vllm and gpu_1"
framework: "vllm"
test_type: "e2e"
platform_arch: ${{ matrix.platform.arch }}
####################
# Framework Builds #
####################
vllm-framework:
build-arm64:
name: Build ${{ matrix.framework }} (arm64)
runs-on: cpu-arm-r8g-4xlarge
timeout-minutes: 120
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: cpu-amd-m5-4xlarge }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: vllm-framework (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
include:
- framework: vllm
base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
cuda_version: '129'
torch_backend: 'cu129'
- framework: trtllm
base_image_tag: '25.06-py3'
runtime_image_tag: ''
cuda_version: '129'
torch_backend: 'cu129'
- framework: sglang
base_image_tag: ''
runtime_image_tag: ''
cuda_version: ''
torch_backend: ''
env:
FRAMEWORK: vllm
steps: &framework-build-steps
- name: Checkout code
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 #v4.3.1
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
steps:
- uses: actions/checkout@v4
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
ref: main
- name: Build Image
id: build-image
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
- name: Pull existing images for cache
shell: bash
continue-on-error: true
run: |
echo "Attempting to pull existing images for layer caching..."
docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64" || echo "Framework image not found in cache"
docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64" || echo "Runtime image not found in cache"
echo "Cache pull completed"
- name: Build Framework Image
id: build_framework
uses: ./.github/actions/docker-build
with:
framework: ${{ env.FRAMEWORK }}
framework: ${{ matrix.framework }}
target: framework
platform: linux/${{ matrix.platform.arch }}
# Ternary operations that are specific to vllm/arm64, empty str for all other combinations
base_image_tag: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && '12.9.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && '129' || '' }}
torch_backend: ${{ (matrix.platform.arch == 'arm64' && env.FRAMEWORK == 'vllm') && 'cu129' || '' }}
platform: linux/arm64
base_image_tag: ${{ matrix.base_image_tag }}
runtime_image_tag: ${{ matrix.runtime_image_tag }}
cuda_version: ${{ matrix.cuda_version }}
torch_backend: ${{ matrix.torch_backend }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
......@@ -107,39 +176,630 @@ jobs:
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Docker Tag and Push
image_tag: framework-${{ matrix.framework }}-arm64:${{ github.run_id }}
- name: Tag and Push Framework Images
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tag: ai-dynamo/dynamo:main-${{ env.FRAMEWORK }}-framework-${{ matrix.platform.arch }}
local_image: framework-${{ matrix.framework }}-arm64:${{ github.run_id }}
push_tags: |
${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64
${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64-run-${{ github.run_id }}
aws_push: 'true'
azure_push: 'false'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
- name: Build Runtime Image
id: build_runtime
uses: ./.github/actions/docker-build
with:
framework: ${{ matrix.framework }}
target: runtime
platform: linux/arm64
base_image_tag: ${{ matrix.base_image_tag }}
runtime_image_tag: ${{ matrix.runtime_image_tag }}
cuda_version: ${{ matrix.cuda_version }}
torch_backend: ${{ matrix.torch_backend }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
image_tag: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}
- name: Tag and Push Runtime Images
uses: ./.github/actions/docker-tag-push
with:
local_image: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}
push_tags: |
${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64
${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64-run-${{ github.run_id }}
aws_push: 'true'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
sglang-framework:
############################## TEST JOBS ##############################
# Unit-test job: one leg per framework x architecture. Each leg first confirms
# that the matching build job of this run succeeded, then pulls the nightly
# image from ECR and runs the pytest action against it.
unit-tests:
  name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-unit
  needs: [build-amd64, build-arm64]
  # always(): start even if some build leg failed; the check_build step below
  # decides per matrix entry whether this leg may proceed.
  if: always()
  runs-on: ${{ matrix.arch.runner }}
  timeout-minutes: 45
  strategy:
    fail-fast: false
    matrix:
      framework: [vllm, trtllm, sglang]
      arch:
        - arch: amd64
          runner: gpu-l40-amd64
        - arch: arm64
          runner: cpu-arm-r8g-4xlarge
  steps:
    - uses: actions/checkout@v4
    - name: Check if build succeeded
      id: check_build
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set +x
        echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
        # Must match the `name:` of the corresponding build job leg.
        BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
        # Query GitHub API for job status using curl (token from env to avoid log exposure).
        # The call is tested directly by `if !`: the shell runs with -e, so a
        # separate `$?` check after a plain assignment would never be reached.
        # curl's stderr is left on the step log so JOBS holds only the response body.
        if ! JOBS=$(curl -s -S -L --fail-with-body \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
          echo "Error: Failed to query GitHub API"
          exit 1
        fi
        # Find the specific build job and check its conclusion
        BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
        echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
        # An empty result (job not found) is treated the same as a failed build.
        if [ "$BUILD_STATUS" != "success" ]; then
          echo "Build failed or did not complete successfully. Failing tests."
          exit 1
        fi
        echo "Build succeeded. Proceeding with tests."
    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
    - name: Pull nightly image
      shell: bash
      env:
        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
      run: |
        # Re-tag locally under the short name that the pytest step passes as image_tag.
        docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}"
        docker tag "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}" "${IMAGE_TAG}"
    - name: Run Unit Tests
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        pytest_marks: "unit and (nightly or post_merge or pre_merge)"
        framework: ${{ matrix.framework }}
        test_type: unit
        platform_arch: ${{ matrix.arch.arch }}
        cpu_limit: '8'
        # arm64 legs only collect tests (dry run); amd64 legs execute them.
        dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
integration-tests:
name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-integ
needs: [build-amd64, build-arm64]
if: always()
runs-on: ${{ matrix.arch.runner }}
timeout-minutes: ${{ matrix.arch.timeout }}
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: cpu-amd-m5-4xlarge }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: sglang-framework (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
framework: [vllm, trtllm, sglang]
arch:
- arch: amd64
runner: gpu-l40-amd64
timeout: 90
- arch: arm64
runner: cpu-arm-r8g-4xlarge
timeout: 90
steps:
- uses: actions/checkout@v4
- name: Check if build succeeded
id: check_build
env:
FRAMEWORK: sglang
steps: *framework-build-steps
trtllm-framework:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
  # Gate this matrix leg on the matching build job of the current run.
  set +x
  echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
  # Must match the `name:` of the corresponding build job leg.
  BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
  # Test the curl call directly with `if !`: the shell runs with -e, so a
  # separate `$?` check after a plain assignment would never be reached.
  # curl's stderr stays on the step log so JOBS holds only the response body.
  if ! JOBS=$(curl -s -S -L --fail-with-body \
    -H "Authorization: Bearer ${GITHUB_TOKEN}" \
    -H "Accept: application/vnd.github.v3+json" \
    "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
    echo "Error: Failed to query GitHub API"
    exit 1
  fi
  BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
  echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
  # An empty result (job not found) is treated the same as a failed build.
  if [ "$BUILD_STATUS" != "success" ]; then
    echo "Build failed or did not complete successfully. Marking tests as failed."
    exit 1
  fi
  echo "Build succeeded. Proceeding with tests."
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull nightly image
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
run: |
docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
- name: Run Integration Tests
uses: ./.github/actions/pytest
with:
image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
pytest_marks: "integration and (nightly or post_merge or pre_merge)"
framework: ${{ matrix.framework }}
test_type: integration
platform_arch: ${{ matrix.arch.arch }}
dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
e2e-single-gpu-tests:
name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-1gpu-e2e
needs: [build-amd64, build-arm64]
if: always()
runs-on: ${{ matrix.arch.runner }}
timeout-minutes: ${{ matrix.arch.timeout }}
strategy:
fail-fast: false
matrix:
platform:
- { arch: amd64, runner: cpu-amd-m5-4xlarge }
- { arch: arm64, runner: cpu-arm-r8g-4xlarge }
name: trtllm-framework (${{ matrix.platform.arch }})
runs-on: ${{ matrix.platform.runner }}
framework: [vllm, trtllm, sglang]
arch:
- arch: amd64
runner: gpu-l40-amd64
timeout: 120
- arch: arm64
runner: cpu-arm-r8g-4xlarge
timeout: 120
steps:
- uses: actions/checkout@v4
# Gate this matrix leg on the matching build job of the current run.
- name: Check if build succeeded
  id: check_build
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    set +x
    echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
    # Must match the `name:` of the corresponding build job leg.
    BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
    # Test the curl call directly with `if !`: the shell runs with -e, so a
    # separate `$?` check after a plain assignment would never be reached.
    # curl's stderr stays on the step log so JOBS holds only the response body.
    if ! JOBS=$(curl -s -S -L --fail-with-body \
      -H "Authorization: Bearer ${GITHUB_TOKEN}" \
      -H "Accept: application/vnd.github.v3+json" \
      "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
      echo "Error: Failed to query GitHub API"
      # Fail rather than emit a `skip` output: no later step in this job reads
      # one, so exiting 0 would let the tests run without a verified build.
      exit 1
    fi
    BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
    echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
    # An empty result (job not found) is treated the same as a failed build.
    if [ "$BUILD_STATUS" != "success" ]; then
      echo "Build failed or did not complete successfully. Failing tests."
      exit 1
    fi
    echo "Build succeeded. Proceeding with tests."
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull nightly image
shell: bash
env:
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
run: |
docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
- name: Run E2E Tests (gpu_1)
uses: ./.github/actions/pytest
with:
image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
pytest_marks: "${{ matrix.framework }} and e2e and gpu_1"
framework: ${{ matrix.framework }}
test_type: e2e-single-gpu
platform_arch: ${{ matrix.arch.arch }}
dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
e2e-multi-gpu-tests:
name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-2gpu-e2e
needs: [build-amd64, build-arm64]
if: always()
runs-on: ${{ matrix.arch.runner }}
timeout-minutes: ${{ matrix.arch.timeout }}
strategy:
fail-fast: false
matrix:
framework: [vllm, trtllm, sglang]
arch:
- arch: amd64
runner: gpu-l40-amd64
timeout: 150
- arch: arm64
runner: cpu-arm-r8g-4xlarge
timeout: 150
steps:
- uses: actions/checkout@v4
# Gate this matrix leg on the matching build job of the current run.
- name: Check if build succeeded
  id: check_build
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    set +x
    echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
    # Must match the `name:` of the corresponding build job leg.
    BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
    # Test the curl call directly with `if !`: the shell runs with -e, so a
    # separate `$?` check after a plain assignment would never be reached.
    # curl's stderr stays on the step log so JOBS holds only the response body.
    if ! JOBS=$(curl -s -S -L --fail-with-body \
      -H "Authorization: Bearer ${GITHUB_TOKEN}" \
      -H "Accept: application/vnd.github.v3+json" \
      "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
      echo "Error: Failed to query GitHub API"
      # Fail rather than emit a `skip` output: no later step in this job reads
      # one, so exiting 0 would let the tests run without a verified build.
      exit 1
    fi
    BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
    echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
    # An empty result (job not found) is treated the same as a failed build.
    if [ "$BUILD_STATUS" != "success" ]; then
      echo "Build failed or did not complete successfully. Marking tests as failed."
      exit 1
    fi
    echo "Build succeeded. Proceeding with tests."
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
- name: Pull nightly image
shell: bash
env:
FRAMEWORK: trtllm
steps: *framework-build-steps
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
run: |
docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
- name: Run E2E Tests (gpu_2)
uses: ./.github/actions/pytest
with:
image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
pytest_marks: "(nightly or post_merge or pre_merge) and e2e and gpu_2"
framework: ${{ matrix.framework }}
test_type: e2e-multi-gpu
platform_arch: ${{ matrix.arch.arch }}
dry_run: 'true'
# component-tests:
# name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-${{ matrix.component }}
# needs: [build-amd64, build-arm64]
# if: always()
# runs-on: ${{ matrix.arch.runner }}
# timeout-minutes: ${{ matrix.arch.timeout }}
# strategy:
# fail-fast: false
# matrix:
# framework: [vllm, trtllm, sglang]
# arch:
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 90
# component: router
# marks: "nightly and router"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 90
# component: planner
# marks: "nightly and planner"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 150
# component: kvbm
# marks: "nightly and (kvbm or kvbm_v2)"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 60
# component: router
# marks: "nightly and router"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 60
# component: planner
# marks: "nightly and planner"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 150
# component: kvbm
# marks: "nightly and (kvbm or kvbm_v2)"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 90
# component: router
# marks: "nightly and router"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 90
# component: planner
# marks: "nightly and planner"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 150
# component: kvbm
# marks: "nightly and (kvbm or kvbm_v2)"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 60
# component: router
# marks: "nightly and router"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 60
# component: planner
# marks: "nightly and planner"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 150
# component: kvbm
# marks: "nightly and (kvbm or kvbm_v2)"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 90
# component: router
# marks: "nightly and router"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 90
# component: planner
# marks: "nightly and planner"
# - arch: amd64
# runner: gpu-l40-amd64
# timeout: 150
# component: kvbm
# marks: "nightly and (kvbm or kvbm_v2)"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 60
# component: router
# marks: "nightly and router"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 60
# component: planner
# marks: "nightly and planner"
# - arch: arm64
# runner: cpu-arm-r8g-4xlarge
# timeout: 150
# component: kvbm
# marks: "nightly and (kvbm or kvbm_v2)"
# steps:
# - uses: actions/checkout@v4
# - name: Check if build succeeded
# id: check_build
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# run: |
# set +x
# echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
# if [ "${{ matrix.arch.arch }}" = "amd64" ]; then
# BUILD_JOB_NAME="Build ${{ matrix.framework }} (amd64)"
# else
# BUILD_JOB_NAME="Build ${{ matrix.framework }} (arm64)"
# fi
# JOBS=$(curl -s -S -L --fail-with-body \
# -H "Authorization: Bearer ${GITHUB_TOKEN}" \
# -H "Accept: application/vnd.github.v3+json" \
# "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" 2>&1)
# if [ $? -ne 0 ]; then
# echo "Error: Failed to query GitHub API"
# echo "skip=true" >> $GITHUB_OUTPUT
# exit 0
# fi
# BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
# echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
# if [ "$BUILD_STATUS" != "success" ]; then
# echo "Build failed or did not complete successfully. Marking tests as failed."
# exit 1
# fi
# echo "Build succeeded. Proceeding with tests."
# - name: Login to Container Registries
# uses: ./.github/actions/docker-login
# with:
# aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
# aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
# - name: Pull nightly image
# shell: bash
# env:
# ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
# IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
# run: |
# docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
# docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
# - name: Run Component Tests (${{ matrix.component }})
# uses: ./.github/actions/pytest
# with:
# image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
# pytest_marks: "${{ matrix.marks }}"
# framework: ${{ matrix.framework }}
# test_type: component-${{ matrix.component }}
# platform_arch: ${{ matrix.arch.arch }}
############################## RESULTS SUMMARY ##############################
# Collects the conclusion, runner and duration of every job in this workflow
# run into a Markdown table (results.md), prints it, appends it to the job
# summary, and uploads it as the `nightly-results-summary` artifact.
results-summary:
  name: Results Summary
  runs-on: ubuntu-latest
  if: always()
  needs: [build-amd64, build-arm64, unit-tests, integration-tests, e2e-single-gpu-tests, e2e-multi-gpu-tests] # component-tests
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Gather job metadata
      id: gather
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set +x
        # pipefail so that a failure anywhere in a pipeline is not masked by
        # the exit status of its last command.
        set -eo pipefail
        echo "# Nightly CI Results Summary" > results.md
        echo "" >> results.md
        echo "| Stage | Status | Runner | Duration (min) | Artifacts |" >> results.md
        echo "|-------|--------|--------|----------------|-----------|" >> results.md
        # Fetch to a file and check curl's status explicitly; errors stay
        # visible on stderr instead of yielding a silently empty table.
        if ! curl -s -S -L --fail-with-body \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          -o jobs.json \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"; then
          echo "Error: Failed to query GitHub API for job metadata" >&2
          echo "" >> results.md
          echo "_Job metadata could not be fetched from the GitHub API._" >> results.md
          exit 1
        fi
        jq -c '.jobs[] | {id, name, runner_name, status, conclusion, started_at, completed_at}' jobs.json > jobs.jsonl
        # `IFS= read -r` keeps each JSON line intact; plain `read` would strip
        # the backslashes that JSON escaping relies on.
        while IFS= read -r job_entry; do
          job_id=$(echo "$job_entry" | jq -r '.id')
          name=$(echo "$job_entry" | jq -r '.name')
          runner=$(echo "$job_entry" | jq -r '.runner_name')
          # Jobs that have not finished have a null conclusion; fall back to
          # their status (e.g. in_progress) rather than printing "null".
          status=$(echo "$job_entry" | jq -r '.conclusion // .status')
          started=$(echo "$job_entry" | jq -r '.started_at')
          completed=$(echo "$job_entry" | jq -r '.completed_at')
          minutes="N/A"
          if [[ "$started" != "null" && "$completed" != "null" ]]; then
            start_epoch=$(date -d "$started" +%s)
            end_epoch=$(date -d "$completed" +%s)
            minutes=$(( (end_epoch - start_epoch)/60 ))
          fi
          artifact_link="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#job-$job_id"
          printf "| %s | %s | %s | %s | [Log & Artifacts](%s) |\n" "$name" "$status" "$runner" "$minutes" "$artifact_link" >> results.md
        done < jobs.jsonl
        echo "" >> results.md
        echo "---" >> results.md
    - name: Display workflow summary
      run: cat results.md
    - name: Upload results summary as job summary
      run: cat results.md >> "$GITHUB_STEP_SUMMARY"
    - name: Upload results as artifact for Slack
      uses: actions/upload-artifact@v4
      # always(): upload whatever was written even if an earlier step failed.
      if: always()
      with:
        name: nightly-results-summary
        path: results.md
        retention-days: 7
############################## SLACK NOTIFICATION ##############################
# Posts a pass/fail summary of the nightly run to a Slack incoming webhook.
# Runs only for scheduled runs on non-fork repositories, and only when the
# webhook secret is configured. The step is continue-on-error, so a failed
# notification does not fail the job.
notify-slack:
  name: Notify Slack
  runs-on: cpu-amd-m5-4xlarge
  if: always() && github.event_name == 'schedule' && !github.event.repository.fork
  needs: results-summary
  permissions:
    contents: read
    # The step below lists this run's jobs through the Actions REST API.
    actions: read
  env:
    # Secrets cannot be referenced in step-level `if:`, so expose a boolean.
    HAS_SLACK_WEBHOOK: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL != '' }}
  steps:
    - name: Send Slack notification
      if: env.HAS_SLACK_WEBHOOK == 'true'
      continue-on-error: true
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      run: |
        set -euo pipefail
        JOBS_JSON=$(mktemp)
        trap 'rm -f "$JOBS_JSON"' EXIT
        # -f: treat HTTP error responses as failures so the branch below
        # fires instead of jq choking on an error document later.
        if ! curl -fsSL \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
          > "$JOBS_JSON"; then
          echo "Error: Failed to fetch job data from GitHub API"
          exit 1
        fi
        if [ ! -s "$JOBS_JSON" ]; then
          echo "Error: No job data received"
          exit 1
        fi
        # A job counts as not passed when it failed, timed out, or was
        # cancelled; checking only "failure" would report a run with
        # timed-out jobs as a success.
        NOT_PASSED='.jobs[] | select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "cancelled")'
        # Only finished jobs are counted; this excludes the notify job
        # itself, which is still running and has a null conclusion.
        TOTAL_JOBS=$(jq '[.jobs[] | select(.conclusion != null)] | length' "$JOBS_JSON")
        SUCCESS_COUNT=$(jq '[.jobs[] | select(.conclusion == "success")] | length' "$JOBS_JSON")
        FAILED_COUNT=$(jq "[${NOT_PASSED}] | length" "$JOBS_JSON")
        if [ "$FAILED_COUNT" -eq 0 ]; then
          STATUS="Success ✅"
        else
          STATUS="Failed ❌"
        fi
        # Main message with summary
        SUMMARY_TEXT="*Nightly CI Pipeline - ${STATUS}*"$'\n'"Summary: ${SUCCESS_COUNT}/${TOTAL_JOBS} jobs passed"$'\n'"<${RUN_URL}|View Workflow Summary>"
        if [ "$FAILED_COUNT" -eq 0 ]; then
          # Success - simple message
          PAYLOAD=$(jq -n \
            --arg text "$SUMMARY_TEXT" \
            '{text: $text}')
        else
          # Failed - message with blocks, listing each job and its conclusion
          FAILED_JOBS=$(jq -r "${NOT_PASSED} | \"• \" + .name + \" (\" + .conclusion + \")\"" "$JOBS_JSON")
          FAILED_JOBS_TEXT="*Failed Jobs (${FAILED_COUNT}):*"$'\n'"${FAILED_JOBS}"
          PAYLOAD=$(jq -n \
            --arg summary "$SUMMARY_TEXT" \
            --arg failed "$FAILED_JOBS_TEXT" \
            '{
              text: $summary,
              blocks: [
                {
                  type: "section",
                  text: {
                    type: "mrkdwn",
                    text: $summary
                  }
                },
                {
                  type: "section",
                  text: {
                    type: "mrkdwn",
                    text: $failed
                  }
                }
              ]
            }')
        fi
        if curl -sSf -X POST -H "Content-Type: application/json" -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"; then
          echo "Slack notification sent successfully"
        else
          echo "Warning: Failed to send Slack notification"
          exit 1
        fi
......@@ -384,10 +384,15 @@ RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "$ENABLE_KVBM" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /workspace/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache .
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$KVBM_WHEEL"; \
fi && \
cd /workspace/benchmarks && \
UV_GIT_LFS=1 uv pip install --no-cache .
# Setup launch banner in common directory accessible to all users
RUN --mount=type=bind,source=./container/launch_message/runtime.txt,target=/opt/dynamo/launch_message.txt \
......
......@@ -376,6 +376,7 @@ RUN chmod 755 /opt/dynamo/.launch_screen && \
USER dynamo
# Copy tests, benchmarks, deploy and components for CI with correct ownership
COPY --chown=dynamo: pyproject.toml /workspace/
COPY --chown=dynamo: tests /workspace/tests
COPY --chown=dynamo: examples /workspace/examples
COPY --chown=dynamo: benchmarks /workspace/benchmarks
......@@ -477,7 +478,7 @@ RUN curl --retry 3 --retry-delay 2 -LSso /usr/local/bin/clang-format https://git
&& rm -rf clangd_18.1.3 clangd.zip
# Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/
COPY README.md hatch_build.py /workspace/
RUN python3 -m pip install --no-deps -e .
# Install Python development packages
......
......@@ -334,12 +334,17 @@ RUN uv pip install \
--no-cache \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install --no-cache /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /workspace/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache .
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "${ENABLE_KVBM}" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install --no-cache "$KVBM_WHEEL"; \
fi && \
cd /workspace/benchmarks && \
UV_GIT_LFS=1 uv pip install --no-cache .
# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
......@@ -352,7 +357,8 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
--requirement /tmp/requirements.test.txt \
cupy-cuda13x
# Copy tests, benchmarks, deploy and components for CI
# Copy tests, benchmarks, deploy and components for CI with correct ownership
COPY --chown=dynamo: pyproject.toml /workspace/
COPY --chown=dynamo: tests /workspace/tests
COPY --chown=dynamo: examples /workspace/examples
COPY --chown=dynamo: deploy /workspace/deploy
......@@ -442,7 +448,7 @@ COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
RUN uv pip install --no-cache maturin[patchelf]
# Editable install of dynamo
COPY pyproject.toml README.md hatch_build.py /workspace/
COPY README.md hatch_build.py /workspace/
RUN uv pip install --no-cache --no-deps -e .
CMD []
......@@ -275,12 +275,17 @@ COPY --chown=dynamo: --from=dynamo_base /opt/dynamo/wheelhouse/ /opt/dynamo/whee
RUN uv pip install \
/opt/dynamo/wheelhouse/ai_dynamo_runtime*.whl \
/opt/dynamo/wheelhouse/ai_dynamo*any.whl \
/opt/dynamo/wheelhouse/nixl/nixl*.whl \
&& if [ "${ENABLE_KVBM}" = "true" ]; then \
uv pip install /opt/dynamo/wheelhouse/kvbm*.whl; \
fi \
&& cd /workspace/benchmarks \
&& UV_GIT_LFS=1 uv pip install --no-cache .
/opt/dynamo/wheelhouse/nixl/nixl*.whl && \
if [ "${ENABLE_KVBM}" = "true" ]; then \
KVBM_WHEEL=$(ls /opt/dynamo/wheelhouse/kvbm*.whl 2>/dev/null | head -1); \
if [ -z "$KVBM_WHEEL" ]; then \
echo "ERROR: ENABLE_KVBM is true but no KVBM wheel found in wheelhouse" >&2; \
exit 1; \
fi; \
uv pip install "$KVBM_WHEEL"; \
fi && \
cd /workspace/benchmarks && \
UV_GIT_LFS=1 uv pip install --no-cache .
# Install common and test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
......
......@@ -898,7 +898,7 @@ if [[ -z "${DEV_IMAGE_INPUT:-}" ]]; then
# Use BuildKit for enhanced metadata
if [ -z "$RUN_PREFIX" ]; then
if docker buildx version &>/dev/null; then
docker buildx build --progress=plain --load -f "${SOURCE_DIR}/Dockerfile" --target runtime $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${BASE_BUILD_LOG}"
docker buildx build --builder default --progress=plain --load -f "${SOURCE_DIR}/Dockerfile" --target runtime $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${BASE_BUILD_LOG}"
BUILD_EXIT_CODE=${PIPESTATUS[0]}
else
DOCKER_BUILDKIT=1 docker build --progress=plain -f "${SOURCE_DIR}/Dockerfile" --target runtime $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO --tag $DYNAMO_BASE_IMAGE $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${BASE_BUILD_LOG}"
......@@ -924,7 +924,7 @@ if [[ -z "${DEV_IMAGE_INPUT:-}" ]]; then
# Use BuildKit for enhanced metadata
if [ -z "$RUN_PREFIX" ]; then
if docker buildx version &>/dev/null; then
docker buildx build --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${FRAMEWORK_BUILD_LOG}"
docker buildx build --builder default --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${FRAMEWORK_BUILD_LOG}"
BUILD_EXIT_CODE=${PIPESTATUS[0]}
else
DOCKER_BUILDKIT=1 docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${FRAMEWORK_BUILD_LOG}"
......@@ -946,7 +946,7 @@ if [[ -z "${DEV_IMAGE_INPUT:-}" ]]; then
# Use BuildKit for enhanced metadata
if [ -z "$RUN_PREFIX" ]; then
if docker buildx version &>/dev/null; then
docker buildx build --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"
docker buildx build --builder default --progress=plain --load -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"
BUILD_EXIT_CODE=${PIPESTATUS[0]}
else
DOCKER_BUILDKIT=1 docker build --progress=plain -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $CACHE_TO $TAG $LATEST_TAG $BUILD_CONTEXT_ARG $BUILD_CONTEXT $NO_CACHE 2>&1 | tee "${SINGLE_BUILD_LOG}"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment