Unverified commit 3dcc53d5, authored by Pavithra Vijayakrishnan and committed by GitHub
Browse files

ci: automate dynamo rcs (#6572)


Signed-off-by: pvijayakrish <pvijayakrish@nvidia.com>
parent 96fc1ab6
...@@ -10,6 +10,17 @@ on: ...@@ -10,6 +10,17 @@ on:
- "pull-request/[0-9]+" - "pull-request/[0-9]+"
# Note: release/* branches are handled by release.yml which calls this workflow # Note: release/* branches are handled by release.yml which calls this workflow
workflow_call: workflow_call:
inputs:
skip_change_detection:
description: 'Skip changed-files detection and always build (used by release pipeline)'
required: false
type: boolean
default: false
image_prefix:
description: 'Optional prefix for image tags (e.g., release-0.9.0). When set, images are also tagged as {prefix}-frontend-{arch}.'
required: false
type: string
default: ''
secrets: secrets:
AWS_ACCOUNT_ID: AWS_ACCOUNT_ID:
required: true required: true
...@@ -25,6 +36,8 @@ on: ...@@ -25,6 +36,8 @@ on:
required: true required: true
AZURE_ACR_PASSWORD: AZURE_ACR_PASSWORD:
required: true required: true
CI_TOKEN:
required: false
SCCACHE_S3_BUCKET: SCCACHE_S3_BUCKET:
required: true required: true
...@@ -59,7 +72,7 @@ jobs: ...@@ -59,7 +72,7 @@ jobs:
build-epp-image: build-epp-image:
name: Build EPP Image name: Build EPP Image
needs: changed-files needs: changed-files
if: needs.changed-files.outputs.frontend == 'true' if: needs.changed-files.outputs.frontend == 'true' || inputs.skip_change_detection == true
runs-on: prod-builder-v3 runs-on: prod-builder-v3
outputs: outputs:
epp_image_ref: ${{ steps.build-epp-image.outputs.epp_image_ref }} epp_image_ref: ${{ steps.build-epp-image.outputs.epp_image_ref }}
...@@ -111,7 +124,7 @@ jobs: ...@@ -111,7 +124,7 @@ jobs:
build-frontend-image: build-frontend-image:
name: Build Frontend Image name: Build Frontend Image
needs: [changed-files, build-epp-image] needs: [changed-files, build-epp-image]
if: needs.changed-files.outputs.frontend == 'true' if: needs.changed-files.outputs.frontend == 'true' || inputs.skip_change_detection == true
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
...@@ -196,6 +209,7 @@ jobs: ...@@ -196,6 +209,7 @@ jobs:
${{ matrix.arch == 'amd64' && steps.calculate-target-tag.outputs.azure_target_image_uri || '' }} ${{ matrix.arch == 'amd64' && steps.calculate-target-tag.outputs.azure_target_image_uri || '' }}
${{ github.ref_name == 'main' && format('{0}:main-frontend-{1}', steps.calculate-target-tag.outputs.ecr_image_base, matrix.arch) || '' }} ${{ github.ref_name == 'main' && format('{0}:main-frontend-{1}', steps.calculate-target-tag.outputs.ecr_image_base, matrix.arch) || '' }}
${{ github.ref_name == 'main' && format('{0}:main-frontend-{1}-{2}', steps.calculate-target-tag.outputs.ecr_image_base, github.sha, matrix.arch) || '' }} ${{ github.ref_name == 'main' && format('{0}:main-frontend-{1}-{2}', steps.calculate-target-tag.outputs.ecr_image_base, github.sha, matrix.arch) || '' }}
${{ inputs.image_prefix != '' && format('{0}:{1}-frontend-{2}', steps.calculate-target-tag.outputs.ecr_image_base, inputs.image_prefix, matrix.arch) || '' }}
- name: Show summary - name: Show summary
shell: bash shell: bash
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Reusable CI Test Suite Workflow
# This workflow is called by nightly-ci.yml and post-merge-ci.yml
# to run the full test suite with configurable parameters.
name: CI Test Suite
# Trigger: this workflow only runs when invoked from another workflow
# (workflow_call). Callers supply the inputs and secrets declared below.
on:
  workflow_call:
    inputs:
      # Compared against 'nightly' to gate the fault-tolerance job.
      pipeline_type:
        description: 'Type of pipeline: nightly or post_merge'
        required: true
        type: string
      # Selects whether the 'nightly' pytest mark is added to test selection.
      include_nightly_marks:
        description: 'Include nightly pytest marks in test selection'
        required: true
        type: boolean
      # Exposed to every job as the IMAGE_PREFIX environment variable.
      image_prefix:
        description: 'Prefix for image tags (nightly or main)'
        required: true
        type: string
      enable_slack_notification:
        description: 'Enable Slack notifications on completion'
        required: false
        type: boolean
        default: false
      # The unit-tests job condition reads inputs.skip_tests; declaring it
      # here lets callers actually set it. Defaults to false so existing
      # callers keep running the tests.
      skip_tests:
        description: 'Skip test jobs and only build and push images'
        required: false
        type: boolean
        default: false
    secrets:
      AWS_ACCOUNT_ID:
        required: true
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCESS_KEY_ID:
        required: true
      AWS_SECRET_ACCESS_KEY:
        required: true
      NGC_CI_ACCESS_TOKEN:
        required: true
      CI_TOKEN:
        required: true
      SCCACHE_S3_BUCKET:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
      SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL:
        required: false
      SLACK_OPS_SUPPORT_GROUP_ID:
        required: false
      AZURE_AKS_CI_KUBECONFIG_B64:
        required: false
      HF_TOKEN:
        required: false
      DYNAMO_INGRESS_SUFFIX:
        required: false
# Workflow-wide settings shared by every job.

# Only read access to repository contents is requested for the token.
permissions:
  contents: read

# Default shell for every `run` step that does not set its own `shell`.
defaults:
  run:
    shell: bash --noprofile --norc -eo pipefail {0}

# IMAGE_PREFIX mirrors the caller-supplied image_prefix input;
# REGISTRY_IMAGE is the repository path used when composing image tags.
env:
  IMAGE_PREFIX: ${{ inputs.image_prefix }}
  REGISTRY_IMAGE: ai-dynamo/dynamo
############################## BUILD JOBS ##############################
jobs:
# Builds the CUDA 12.9 runtime image for each framework in the matrix on the
# amd64 builder, then pushes it under a moving tag and a run-scoped tag.
build-amd64:
  name: Build ${{ matrix.framework }} (amd64)
  runs-on: prod-builder-amd-v1
  timeout-minutes: 120
  strategy:
    # One framework failing must not cancel the other.
    fail-fast: false
    matrix:
      framework:
        - vllm
        - sglang
  env:
    # Not referenced by the steps below; presumably read by the local
    # composite actions - TODO confirm.
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  steps:
    - uses: actions/checkout@v4

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    - name: Build Runtime Image
      id: build_runtime
      uses: ./.github/actions/docker-build
      with:
        # What to build.
        framework: ${{ matrix.framework }}
        target: runtime
        platform: amd64
        cuda_version: '12.9'
        # Local tag; the push step below refers to the same name.
        image_tag: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}
        # Credentials and cache settings handed to the build action.
        ci_token: ${{ secrets.CI_TOKEN }}
        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

    - name: Tag and Push Runtime Images
      uses: ./.github/actions/docker-tag-push
      with:
        local_image: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}
        # First tag moves with every run; second is pinned to this run id.
        push_tags: |
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-amd64
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-amd64-run-${{ github.run_id }}
        aws_push: 'true'
        azure_push: 'true'
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
# Builds the runtime image for each framework on the arm64 builder, using the
# CUDA version listed per matrix entry, then pushes it under a moving tag and
# a run-scoped tag.
build-arm64:
  name: Build ${{ matrix.framework }} (arm64)
  runs-on: prod-builder-arm-v1
  timeout-minutes: 120
  strategy:
    # One framework failing must not cancel the other.
    fail-fast: false
    matrix:
      # Explicit entries so each framework carries its own CUDA version.
      include:
        - framework: vllm
          cuda_version: '12.9'
        - framework: sglang
          cuda_version: '12.9'
  env:
    # Not referenced by the steps below; presumably read by the local
    # composite actions - TODO confirm.
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  steps:
    - uses: actions/checkout@v4

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    - name: Build Runtime Image
      id: build_runtime
      uses: ./.github/actions/docker-build
      with:
        # What to build.
        framework: ${{ matrix.framework }}
        target: runtime
        platform: arm64
        cuda_version: ${{ matrix.cuda_version }}
        # Local tag; the push step below refers to the same name.
        image_tag: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}
        # Credentials and cache settings handed to the build action.
        ci_token: ${{ secrets.CI_TOKEN }}
        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

    - name: Tag and Push Runtime Images
      uses: ./.github/actions/docker-tag-push
      with:
        local_image: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}
        # First tag moves with every run; second is pinned to this run id.
        push_tags: |
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-arm64
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-arm64-run-${{ github.run_id }}
        aws_push: 'true'
        azure_push: 'true'
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
# CUDA 13 builds (vllm, trtllm and sglang; both architectures)
# Builds the CUDA 13 runtime image for vllm, trtllm and sglang on the amd64
# builder. trtllm is built against CUDA 13.1, the others against 13.0.
build-cuda13-amd64:
  name: Build ${{ matrix.framework }} CUDA13 (amd64)
  runs-on: prod-builder-amd-v1
  timeout-minutes: 120
  strategy:
    # One framework failing must not cancel the others.
    fail-fast: false
    matrix:
      framework:
        - vllm
        - trtllm
        - sglang
  env:
    # Not referenced by the steps below; presumably read by the local
    # composite actions - TODO confirm.
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  steps:
    - uses: actions/checkout@v4

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    - name: Build CUDA 13 Runtime Image
      id: build_runtime
      uses: ./.github/actions/docker-build
      with:
        # What to build.
        framework: ${{ matrix.framework }}
        target: runtime
        platform: amd64
        cuda_version: ${{ matrix.framework == 'trtllm' && '13.1' || '13.0' }}
        # Local tag; the push step below refers to the same name.
        image_tag: runtime-${{ matrix.framework }}-cuda13-amd64:${{ github.run_id }}
        # Credentials and cache settings handed to the build action.
        ci_token: ${{ secrets.CI_TOKEN }}
        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

    - name: Tag and Push CUDA 13 Runtime Images
      uses: ./.github/actions/docker-tag-push
      with:
        local_image: runtime-${{ matrix.framework }}-cuda13-amd64:${{ github.run_id }}
        # Lines 1-2: moving and run-scoped cuda13 tags.
        # Line 3: for trtllm only, also the plain `<prefix>-trtllm-amd64` tag
        # (the test jobs pull that name); it renders empty for the others.
        push_tags: |
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-cuda13-amd64
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-cuda13-amd64-run-${{ github.run_id }}
          ${{ matrix.framework == 'trtllm' && format('{0}:{1}-{2}-amd64', env.REGISTRY_IMAGE, env.IMAGE_PREFIX, matrix.framework) || '' }}
        aws_push: 'true'
        azure_push: 'true'
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
# Builds the CUDA 13 runtime image for vllm, trtllm and sglang on the arm64
# builder. trtllm is built against CUDA 13.1, the others against 13.0.
build-cuda13-arm64:
  name: Build ${{ matrix.framework }} CUDA13 (arm64)
  runs-on: prod-builder-arm-v1
  timeout-minutes: 120
  strategy:
    # One framework failing must not cancel the others.
    fail-fast: false
    matrix:
      framework:
        - vllm
        - trtllm
        - sglang
  env:
    # Not referenced by the steps below; presumably read by the local
    # composite actions - TODO confirm.
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  steps:
    - uses: actions/checkout@v4

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    - name: Build CUDA 13 Runtime Image
      id: build_runtime
      uses: ./.github/actions/docker-build
      with:
        # What to build.
        framework: ${{ matrix.framework }}
        target: runtime
        platform: arm64
        cuda_version: ${{ matrix.framework == 'trtllm' && '13.1' || '13.0' }}
        # Local tag; the push step below refers to the same name.
        image_tag: runtime-${{ matrix.framework }}-cuda13-arm64:${{ github.run_id }}
        # Credentials and cache settings handed to the build action.
        ci_token: ${{ secrets.CI_TOKEN }}
        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

    - name: Tag and Push CUDA 13 Runtime Images
      uses: ./.github/actions/docker-tag-push
      with:
        local_image: runtime-${{ matrix.framework }}-cuda13-arm64:${{ github.run_id }}
        # Lines 1-2: moving and run-scoped cuda13 tags.
        # Line 3: for trtllm only, also the plain `<prefix>-trtllm-arm64` tag
        # (the test jobs pull that name); it renders empty for the others.
        push_tags: |
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-cuda13-arm64
          ${{ env.REGISTRY_IMAGE }}:${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-cuda13-arm64-run-${{ github.run_id }}
          ${{ matrix.framework == 'trtllm' && format('{0}:{1}-{2}-arm64', env.REGISTRY_IMAGE, env.IMAGE_PREFIX, matrix.framework) || '' }}
        aws_push: 'true'
        azure_push: 'true'
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
############################## TEST JOBS ##############################
# Unit tests for every framework/architecture pair.
# The job waits for all four build jobs but runs regardless of their outcome
# (always()); its first real step then asks the GitHub API whether the one
# build job matching this matrix entry succeeded and aborts otherwise.
unit-tests:
  name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-unit
  needs:
    - build-amd64
    - build-arm64
    - build-cuda13-arm64
    - build-cuda13-amd64
  if: always() && inputs.skip_tests != true
  runs-on: ${{ matrix.arch.runner }}
  timeout-minutes: 45
  strategy:
    fail-fast: false
    matrix:
      framework:
        - vllm
        - trtllm
        - sglang
      # Each arch entry pairs the architecture name with its runner label.
      arch:
        - runner: prod-builder-amd-gpu-v1
          arch: amd64
        - runner: prod-builder-arm-v1
          arch: arm64
  steps:
    - uses: actions/checkout@v4

    # Looks up the build job for this framework/arch in the current run.
    # trtllm is matched against the "CUDA13" job name; the other frameworks
    # against the plain build job name. Exits non-zero unless the matched
    # job's conclusion is "success".
    - name: Check if build succeeded
      id: check_build
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set +x
        echo "==========================================="
        echo "DEBUG: Checking build status"
        echo "==========================================="
        echo "Framework: ${{ matrix.framework }}"
        echo "Architecture: ${{ matrix.arch.arch }}"
        echo "Repository: ${{ github.repository }}"
        echo "Run ID: ${{ github.run_id }}"
        BUILD_JOB_PATTERN="Build ${{ matrix.framework }} ${{ matrix.framework == 'trtllm' && 'CUDA13 ' || '' }}(${{ matrix.arch.arch }})"
        echo "Looking for job pattern: '$BUILD_JOB_PATTERN'"
        # Query GitHub API for job status
        echo ""
        echo "Querying GitHub API..."
        JOBS_RESPONSE=$(curl -s -S -L -w "\n%{http_code}" \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100")
        HTTP_CODE=$(echo "$JOBS_RESPONSE" | tail -n1)
        JOBS=$(echo "$JOBS_RESPONSE" | sed '$d')
        echo "HTTP Response Code: $HTTP_CODE"
        if [ "$HTTP_CODE" != "200" ]; then
          echo "Error: GitHub API returned non-200 status code"
          echo "Response: $JOBS"
          exit 1
        fi
        # Debug: Show total jobs and all job names
        TOTAL_JOBS=$(echo "$JOBS" | jq '.jobs | length')
        echo ""
        echo "Total jobs found: $TOTAL_JOBS"
        echo ""
        echo "All job names in this workflow run:"
        echo "$JOBS" | jq -r '.jobs[] | " - \(.name) [status: \(.status), conclusion: \(.conclusion)]"'
        echo ""
        # Try exact endswith match
        echo "Searching for jobs ending with: '$BUILD_JOB_PATTERN'"
        MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | endswith($pattern))] | length')
        echo "Jobs matching endswith pattern: $MATCHING_JOBS"
        if [ "$MATCHING_JOBS" -eq 0 ]; then
          echo ""
          echo "WARNING: No jobs found with endswith pattern"
          echo "Trying contains pattern instead..."
          MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | contains($pattern))] | length')
          echo "Jobs matching contains pattern: $MATCHING_JOBS"
          if [ "$MATCHING_JOBS" -gt 0 ]; then
            BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .conclusion' | head -1)
            MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .name' | head -1)
          fi
        else
          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .conclusion' | head -1)
          MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .name' | head -1)
        fi
        echo ""
        echo "==========================================="
        echo "RESULT:"
        echo " Matched job: ${MATCHED_JOB_NAME:-none}"
        echo " Build status: ${BUILD_STATUS:-not found}"
        echo "==========================================="
        # Handle various status cases
        if [ -z "$BUILD_STATUS" ] || [ "$BUILD_STATUS" = "null" ] || [ "$BUILD_STATUS" = "" ]; then
          echo ""
          echo "ERROR: Could not determine build status"
          echo "This could mean:"
          echo " 1. The build job is still running"
          echo " 2. The job name pattern doesn't match"
          echo " 3. The API response doesn't include this job yet"
          exit 1
        fi
        if [ "$BUILD_STATUS" != "success" ]; then
          echo ""
          echo "ERROR: Build did not succeed (status: $BUILD_STATUS)"
          exit 1
        fi
        echo ""
        echo "✅ Build succeeded. Proceeding with tests."

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}

    # Pulls the image from ECR and re-tags it locally without the registry
    # prefix, which is the name handed to the pytest action below.
    - name: Pull image
      shell: bash
      env:
        IMAGE_TAG: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
      run: |
        docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
        docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}

    - name: Run Unit Tests
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        framework: ${{ matrix.framework }}
        platform_arch: ${{ matrix.arch.arch }}
        test_type: unit
        # The nightly mark is added only when the caller asks for it.
        pytest_marks: ${{ inputs.include_nightly_marks && 'unit and (nightly or post_merge or pre_merge)' || 'unit and (post_merge or pre_merge)' }}
        cpu_limit: '8'
        # arm64 entries are passed dry_run 'true'.
        dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
# Integration tests for every framework/architecture pair.
# The job waits for all four build jobs but runs regardless of their outcome
# (always()); its first real step then asks the GitHub API whether the one
# build job matching this matrix entry succeeded and aborts otherwise.
integration-tests:
  name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-integ
  needs: [build-amd64, build-arm64, build-cuda13-arm64, build-cuda13-amd64]
  # Honors inputs.skip_tests the same way the unit-tests job does, so a skip
  # request no longer leaves this job running. When the caller does not set
  # the input the comparison is expected to stay true and the job runs as
  # before - NOTE(review): confirm against GitHub expression semantics.
  if: always() && inputs.skip_tests != true
  runs-on: ${{ matrix.arch.runner }}
  timeout-minutes: ${{ matrix.arch.timeout }}
  strategy:
    fail-fast: false
    matrix:
      framework: [vllm, trtllm, sglang]
      # Each arch entry carries its runner label and job timeout (minutes).
      arch:
        - arch: amd64
          runner: prod-builder-amd-gpu-v1
          timeout: 90
        - arch: arm64
          runner: prod-builder-arm-v1
          timeout: 90
  steps:
    - uses: actions/checkout@v4

    # Looks up the build job for this framework/arch in the current run.
    # trtllm is matched against the "CUDA13" job name; the other frameworks
    # against the plain build job name. Exits non-zero unless the matched
    # job's conclusion is "success".
    - name: Check if build succeeded
      id: check_build
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set +x
        echo "==========================================="
        echo "DEBUG: Checking build status"
        echo "==========================================="
        echo "Framework: ${{ matrix.framework }}"
        echo "Architecture: ${{ matrix.arch.arch }}"
        echo "Repository: ${{ github.repository }}"
        echo "Run ID: ${{ github.run_id }}"
        BUILD_JOB_PATTERN="Build ${{ matrix.framework }} ${{ matrix.framework == 'trtllm' && 'CUDA13 ' || '' }}(${{ matrix.arch.arch }})"
        echo "Looking for job pattern: '$BUILD_JOB_PATTERN'"
        # Query GitHub API for job status
        echo ""
        echo "Querying GitHub API..."
        JOBS_RESPONSE=$(curl -s -S -L -w "\n%{http_code}" \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100")
        HTTP_CODE=$(echo "$JOBS_RESPONSE" | tail -n1)
        JOBS=$(echo "$JOBS_RESPONSE" | sed '$d')
        echo "HTTP Response Code: $HTTP_CODE"
        if [ "$HTTP_CODE" != "200" ]; then
          echo "Error: GitHub API returned non-200 status code"
          echo "Response: $JOBS"
          exit 1
        fi
        # Debug: Show total jobs and all job names
        TOTAL_JOBS=$(echo "$JOBS" | jq '.jobs | length')
        echo ""
        echo "Total jobs found: $TOTAL_JOBS"
        echo ""
        echo "All job names in this workflow run:"
        echo "$JOBS" | jq -r '.jobs[] | " - \(.name) [status: \(.status), conclusion: \(.conclusion)]"'
        echo ""
        # Try exact endswith match
        echo "Searching for jobs ending with: '$BUILD_JOB_PATTERN'"
        MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | endswith($pattern))] | length')
        echo "Jobs matching endswith pattern: $MATCHING_JOBS"
        if [ "$MATCHING_JOBS" -eq 0 ]; then
          echo ""
          echo "WARNING: No jobs found with endswith pattern"
          echo "Trying contains pattern instead..."
          MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | contains($pattern))] | length')
          echo "Jobs matching contains pattern: $MATCHING_JOBS"
          if [ "$MATCHING_JOBS" -gt 0 ]; then
            BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .conclusion' | head -1)
            MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .name' | head -1)
          fi
        else
          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .conclusion' | head -1)
          MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .name' | head -1)
        fi
        echo ""
        echo "==========================================="
        echo "RESULT:"
        echo " Matched job: ${MATCHED_JOB_NAME:-none}"
        echo " Build status: ${BUILD_STATUS:-not found}"
        echo "==========================================="
        # Handle various status cases
        if [ -z "$BUILD_STATUS" ] || [ "$BUILD_STATUS" = "null" ] || [ "$BUILD_STATUS" = "" ]; then
          echo ""
          echo "ERROR: Could not determine build status"
          echo "This could mean:"
          echo " 1. The build job is still running"
          echo " 2. The job name pattern doesn't match"
          echo " 3. The API response doesn't include this job yet"
          exit 1
        fi
        if [ "$BUILD_STATUS" != "success" ]; then
          echo ""
          echo "ERROR: Build did not succeed (status: $BUILD_STATUS)"
          exit 1
        fi
        echo ""
        echo "✅ Build succeeded. Proceeding with tests."

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

    # Pulls the image from ECR and re-tags it locally without the registry
    # prefix, which is the name handed to the pytest action below.
    - name: Pull image
      shell: bash
      env:
        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        IMAGE_TAG: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
      run: |
        docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
        docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}

    - name: Run Integration Tests
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        # The nightly mark is added only when the caller asks for it.
        pytest_marks: ${{ inputs.include_nightly_marks && 'integration and (nightly or post_merge or pre_merge)' || 'integration and (post_merge or pre_merge)' }}
        framework: ${{ matrix.framework }}
        test_type: integration
        platform_arch: ${{ matrix.arch.arch }}
        # arm64 entries are passed dry_run 'true'.
        dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
# Single-GPU end-to-end tests for every framework/architecture pair.
# The job waits for all four build jobs but runs regardless of their outcome
# (always()); its first real step then asks the GitHub API whether the one
# build job matching this matrix entry succeeded and aborts otherwise.
e2e-single-gpu-tests:
  name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-1gpu-e2e
  needs: [build-amd64, build-arm64, build-cuda13-arm64, build-cuda13-amd64]
  # Honors inputs.skip_tests the same way the unit-tests job does, so a skip
  # request no longer leaves this job running. When the caller does not set
  # the input the comparison is expected to stay true and the job runs as
  # before - NOTE(review): confirm against GitHub expression semantics.
  if: always() && inputs.skip_tests != true
  runs-on: ${{ matrix.arch.runner }}
  timeout-minutes: ${{ matrix.arch.timeout }}
  strategy:
    fail-fast: false
    matrix:
      framework: [vllm, trtllm, sglang]
      # Each arch entry carries its runner label and job timeout (minutes).
      arch:
        - arch: amd64
          runner: prod-builder-amd-gpu-v1
          timeout: 180
        - arch: arm64
          runner: prod-builder-arm-v1
          timeout: 180
  steps:
    # Git LFS objects are fetched as part of the checkout.
    - uses: actions/checkout@v4
      with:
        lfs: true

    # Looks up the build job for this framework/arch in the current run.
    # trtllm is matched against the "CUDA13" job name; the other frameworks
    # against the plain build job name. Exits non-zero unless the matched
    # job's conclusion is "success".
    - name: Check if build succeeded
      id: check_build
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set +x
        echo "==========================================="
        echo "DEBUG: Checking build status"
        echo "==========================================="
        echo "Framework: ${{ matrix.framework }}"
        echo "Architecture: ${{ matrix.arch.arch }}"
        echo "Repository: ${{ github.repository }}"
        echo "Run ID: ${{ github.run_id }}"
        BUILD_JOB_PATTERN="Build ${{ matrix.framework }} ${{ matrix.framework == 'trtllm' && 'CUDA13 ' || '' }}(${{ matrix.arch.arch }})"
        echo "Looking for job pattern: '$BUILD_JOB_PATTERN'"
        # Query GitHub API for job status
        echo ""
        echo "Querying GitHub API..."
        JOBS_RESPONSE=$(curl -s -S -L -w "\n%{http_code}" \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100")
        HTTP_CODE=$(echo "$JOBS_RESPONSE" | tail -n1)
        JOBS=$(echo "$JOBS_RESPONSE" | sed '$d')
        echo "HTTP Response Code: $HTTP_CODE"
        if [ "$HTTP_CODE" != "200" ]; then
          echo "Error: GitHub API returned non-200 status code"
          echo "Response: $JOBS"
          exit 1
        fi
        # Debug: Show total jobs and all job names
        TOTAL_JOBS=$(echo "$JOBS" | jq '.jobs | length')
        echo ""
        echo "Total jobs found: $TOTAL_JOBS"
        echo ""
        echo "All job names in this workflow run:"
        echo "$JOBS" | jq -r '.jobs[] | " - \(.name) [status: \(.status), conclusion: \(.conclusion)]"'
        echo ""
        # Try exact endswith match
        echo "Searching for jobs ending with: '$BUILD_JOB_PATTERN'"
        MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | endswith($pattern))] | length')
        echo "Jobs matching endswith pattern: $MATCHING_JOBS"
        if [ "$MATCHING_JOBS" -eq 0 ]; then
          echo ""
          echo "WARNING: No jobs found with endswith pattern"
          echo "Trying contains pattern instead..."
          MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | contains($pattern))] | length')
          echo "Jobs matching contains pattern: $MATCHING_JOBS"
          if [ "$MATCHING_JOBS" -gt 0 ]; then
            BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .conclusion' | head -1)
            MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .name' | head -1)
          fi
        else
          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .conclusion' | head -1)
          MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .name' | head -1)
        fi
        echo ""
        echo "==========================================="
        echo "RESULT:"
        echo " Matched job: ${MATCHED_JOB_NAME:-none}"
        echo " Build status: ${BUILD_STATUS:-not found}"
        echo "==========================================="
        # Handle various status cases
        if [ -z "$BUILD_STATUS" ] || [ "$BUILD_STATUS" = "null" ] || [ "$BUILD_STATUS" = "" ]; then
          echo ""
          echo "ERROR: Could not determine build status"
          echo "This could mean:"
          echo " 1. The build job is still running"
          echo " 2. The job name pattern doesn't match"
          echo " 3. The API response doesn't include this job yet"
          exit 1
        fi
        if [ "$BUILD_STATUS" != "success" ]; then
          echo ""
          echo "ERROR: Build did not succeed (status: $BUILD_STATUS)"
          exit 1
        fi
        echo ""
        echo "✅ Build succeeded. Proceeding with tests."

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

    # Pulls the image from ECR and re-tags it locally without the registry
    # prefix, which is the name handed to the pytest action below.
    - name: Pull image
      shell: bash
      env:
        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        IMAGE_TAG: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
      run: |
        docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
        docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}

    - name: Run E2E Tests (gpu_1)
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        # Selection is by framework mark; include_nightly_marks is not
        # consulted for this job.
        pytest_marks: "${{ matrix.framework }} and e2e and gpu_1"
        framework: ${{ matrix.framework }}
        test_type: e2e-single-gpu
        platform_arch: ${{ matrix.arch.arch }}
        # arm64 entries are passed dry_run 'true'.
        dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}
# Two-GPU end-to-end tests for every framework/architecture pair.
# The job waits for all four build jobs but runs regardless of their outcome
# (always()); its first real step then asks the GitHub API whether the one
# build job matching this matrix entry succeeded and aborts otherwise.
e2e-multi-gpu-tests:
  name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-2gpu-e2e
  needs: [build-amd64, build-arm64, build-cuda13-arm64, build-cuda13-amd64]
  # Honors inputs.skip_tests the same way the unit-tests job does, so a skip
  # request no longer leaves this job running. When the caller does not set
  # the input the comparison is expected to stay true and the job runs as
  # before - NOTE(review): confirm against GitHub expression semantics.
  if: always() && inputs.skip_tests != true
  runs-on: ${{ matrix.arch.runner }}
  timeout-minutes: ${{ matrix.arch.timeout }}
  strategy:
    fail-fast: false
    matrix:
      framework: [vllm, trtllm, sglang]
      # Each arch entry carries its runner label and job timeout (minutes).
      arch:
        - arch: amd64
          runner: prod-builder-amd-gpu-v1
          timeout: 150
        - arch: arm64
          runner: prod-builder-arm-v1
          timeout: 150
  steps:
    # Git LFS objects are fetched as part of the checkout.
    - uses: actions/checkout@v4
      with:
        lfs: true

    # Looks up the build job for this framework/arch in the current run.
    # trtllm is matched against the "CUDA13" job name; the other frameworks
    # against the plain build job name. Exits non-zero unless the matched
    # job's conclusion is "success".
    - name: Check if build succeeded
      id: check_build
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set +x
        echo "==========================================="
        echo "DEBUG: Checking build status"
        echo "==========================================="
        echo "Framework: ${{ matrix.framework }}"
        echo "Architecture: ${{ matrix.arch.arch }}"
        echo "Repository: ${{ github.repository }}"
        echo "Run ID: ${{ github.run_id }}"
        BUILD_JOB_PATTERN="Build ${{ matrix.framework }} ${{ matrix.framework == 'trtllm' && 'CUDA13 ' || '' }}(${{ matrix.arch.arch }})"
        echo "Looking for job pattern: '$BUILD_JOB_PATTERN'"
        # Query GitHub API for job status
        echo ""
        echo "Querying GitHub API..."
        JOBS_RESPONSE=$(curl -s -S -L -w "\n%{http_code}" \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100")
        HTTP_CODE=$(echo "$JOBS_RESPONSE" | tail -n1)
        JOBS=$(echo "$JOBS_RESPONSE" | sed '$d')
        echo "HTTP Response Code: $HTTP_CODE"
        if [ "$HTTP_CODE" != "200" ]; then
          echo "Error: GitHub API returned non-200 status code"
          echo "Response: $JOBS"
          exit 1
        fi
        # Debug: Show total jobs and all job names
        TOTAL_JOBS=$(echo "$JOBS" | jq '.jobs | length')
        echo ""
        echo "Total jobs found: $TOTAL_JOBS"
        echo ""
        echo "All job names in this workflow run:"
        echo "$JOBS" | jq -r '.jobs[] | " - \(.name) [status: \(.status), conclusion: \(.conclusion)]"'
        echo ""
        # Try exact endswith match
        echo "Searching for jobs ending with: '$BUILD_JOB_PATTERN'"
        MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | endswith($pattern))] | length')
        echo "Jobs matching endswith pattern: $MATCHING_JOBS"
        if [ "$MATCHING_JOBS" -eq 0 ]; then
          echo ""
          echo "WARNING: No jobs found with endswith pattern"
          echo "Trying contains pattern instead..."
          MATCHING_JOBS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '[.jobs[] | select(.name | contains($pattern))] | length')
          echo "Jobs matching contains pattern: $MATCHING_JOBS"
          if [ "$MATCHING_JOBS" -gt 0 ]; then
            BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .conclusion' | head -1)
            MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .name' | head -1)
          fi
        else
          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .conclusion' | head -1)
          MATCHED_JOB_NAME=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | endswith($pattern)) | .name' | head -1)
        fi
        echo ""
        echo "==========================================="
        echo "RESULT:"
        echo " Matched job: ${MATCHED_JOB_NAME:-none}"
        echo " Build status: ${BUILD_STATUS:-not found}"
        echo "==========================================="
        # Handle various status cases
        if [ -z "$BUILD_STATUS" ] || [ "$BUILD_STATUS" = "null" ] || [ "$BUILD_STATUS" = "" ]; then
          echo ""
          echo "ERROR: Could not determine build status"
          echo "This could mean:"
          echo " 1. The build job is still running"
          echo " 2. The job name pattern doesn't match"
          echo " 3. The API response doesn't include this job yet"
          exit 1
        fi
        if [ "$BUILD_STATUS" != "success" ]; then
          echo ""
          echo "ERROR: Build did not succeed (status: $BUILD_STATUS)"
          exit 1
        fi
        echo ""
        echo "✅ Build succeeded. Proceeding with tests."

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

    # Pulls the image from ECR and re-tags it locally without the registry
    # prefix, which is the name handed to the pytest action below.
    - name: Pull image
      shell: bash
      env:
        ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        IMAGE_TAG: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
      run: |
        docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
        docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}

    - name: Run E2E Tests (gpu_2)
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ env.IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        # The nightly mark is added only when the caller asks for it.
        pytest_marks: ${{ inputs.include_nightly_marks && '(nightly or post_merge or pre_merge) and e2e and gpu_2' || '(post_merge or pre_merge) and e2e and gpu_2' }}
        framework: ${{ matrix.framework }}
        test_type: e2e-multi-gpu
        platform_arch: ${{ matrix.arch.arch }}
        # dry_run is 'true' for every matrix entry of this job.
        dry_run: 'true'
############################## FAULT TOLERANCE TESTS ##############################
# Kubernetes fault-tolerance tests; only run when pipeline_type is 'nightly'.
# This block covers the job header and its first four steps; the remaining
# steps of the job follow unchanged.
fault-tolerance-tests:
  name: ${{ matrix.framework.name }}-ft-k8s
  # The trtllm amd64 image is produced only by build-cuda13-amd64 (build-amd64
  # builds vllm and sglang), so that job must have finished before the build
  # check below can find a conclusion for trtllm.
  needs: [build-amd64, build-cuda13-amd64]
  if: always() && inputs.pipeline_type == 'nightly'
  runs-on: prod-builder-amd-v1
  timeout-minutes: 60
  strategy:
    fail-fast: false
    matrix:
      # Each entry pairs a framework with its test scenario name.
      framework:
        - name: vllm
          test_scenario: "vllm-agg"
        - name: trtllm
          test_scenario: "trtllm-agg"
        - name: sglang
          test_scenario: "sglang-agg"
  env:
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
    NIGHTLY_IMAGE_PREFIX: ${{ inputs.image_prefix }}
    # Namespace name is unique per framework, run and attempt.
    NAMESPACE: ft-${{ matrix.framework.name }}-${{ github.run_id }}-${{ github.run_attempt }}
    DYNAMO_INGRESS_SUFFIX: ${{ secrets.DYNAMO_INGRESS_SUFFIX }}
  steps:
    - uses: actions/checkout@v4

    # Aborts unless the amd64 build job for this framework concluded with
    # "success". The pattern is built the same way as in the other test jobs:
    # trtllm's build job is named "Build trtllm CUDA13 (amd64)", so the
    # "CUDA13 " infix is inserted for it. Without it no job name matches and
    # the trtllm entry always fails here.
    - name: Check if build succeeded
      id: check_build
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        set +x
        BUILD_JOB_PATTERN="Build ${{ matrix.framework.name }} ${{ matrix.framework.name == 'trtllm' && 'CUDA13 ' || '' }}(amd64)"
        JOBS_RESPONSE=$(curl -s -S -L -w "\n%{http_code}" \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100")
        HTTP_CODE=$(echo "$JOBS_RESPONSE" | tail -n1)
        JOBS=$(echo "$JOBS_RESPONSE" | sed '$d')
        if [ "$HTTP_CODE" != "200" ]; then
          echo "Error: GitHub API returned non-200 status code"
          exit 1
        fi
        BUILD_STATUS=$(echo "$JOBS" | jq -r --arg pattern "$BUILD_JOB_PATTERN" '.jobs[] | select(.name | contains($pattern)) | .conclusion' | head -1)
        # An empty or "null" status is also "not success", so one test suffices.
        if [ "$BUILD_STATUS" != "success" ]; then
          echo "ERROR: Build did not succeed (status: $BUILD_STATUS)"
          exit 1
        fi
        echo "✅ Build succeeded. Proceeding with fault tolerance tests."

    - name: Login to Container Registries
      uses: ./.github/actions/docker-login
      with:
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    # Decodes the base64 kubeconfig secret into ./.kubeconfig (mode 600) and
    # verifies cluster access. The KUBECONFIG export only lasts for this
    # step, which is why the next step exports it again.
    - name: Setup Kubernetes
      run: |
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl cluster-info
- name: Deploy Operator
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
# Create a namespace for this job
echo "Creating an ephemeral namespace..."
kubectl delete namespace $NAMESPACE || true
kubectl create namespace $NAMESPACE || true
echo "Attaching the labels for secrets and cleanup"
kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
# Set the namespace as default
kubectl config set-context --current --namespace=$NAMESPACE
# Check if Istio is installed
kubectl get pods -n istio-system
# Check if default storage class exists
kubectl get storageclass
# Install Helm chart
export VIRTUAL_ENV=/opt/dynamo/venv
export KUBE_NS=$NAMESPACE
export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
# Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
# Create docker pull secret for operator image
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
# Pull operator image (using nightly tag for operator too)
export ECR_HOSTNAME=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-operator-amd64 || echo "Operator image not found, will use SHA-based tag"
# Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/helm/charts/platform/
helm dep build .
# Install platform with namespace restriction
helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
--set dynamo-operator.namespaceRestriction.enabled=true \
--set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
--set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
--set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret \
--timeout 10m --wait
# Wait for all deployments to be ready
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
cd -
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE
- name: Run Fault Tolerance Tests
id: run-ft-tests
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
export NAMESPACE=$NAMESPACE
export FRAMEWORK=${{ matrix.framework.name }}
export ECR_HOSTNAME=${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
export IMAGE="${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${FRAMEWORK}-amd64"
echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
echo "Using namespace: $NAMESPACE"
echo "Using image tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${FRAMEWORK}-amd64"
# Install python3-venv package if not already installed
sudo apt-get update && sudo apt-get install -y python3-venv
# Set up Python virtual environment and install test dependencies
python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip
pip install -r container/deps/requirements.test.txt
pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic
# Create test-results directory
mkdir -p test-results
# Run the pytest command with JUnit XML output
set +e # Don't exit on test failures
pytest tests/fault_tolerance/deploy/test_deployment.py \
-m 'k8s and fault_tolerance' \
-k '${{ matrix.framework.test_scenario }}' \
-s -v \
--namespace ${NAMESPACE} \
--image ${IMAGE} \
--client-type legacy \
--junitxml=test-results/pytest_ft_report.xml \
--tb=short
TEST_EXIT_CODE=$?
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE}
continue-on-error: true
- name: Process Fault Tolerance Test Results
if: always()
run: |
set -x
# Rename JUnit XML with unique naming if it exists
if [ -f "test-results/pytest_ft_report.xml" ]; then
mv "test-results/pytest_ft_report.xml" "test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml"
echo "✅ JUnit XML report renamed with unique identifier"
else
echo "⚠️ JUnit XML report not found"
fi
- name: Upload Fault Tolerance Test Results
uses: actions/upload-artifact@v4
if: always()
with:
name: test-results-${{ matrix.framework.name }}-fault_tolerance-amd64-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days: 7
- name: Cleanup
if: always()
timeout-minutes: 5
run: |
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
# For debugging purposes, list all the resources before we uninstall
kubectl get all
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
kubectl delete dynamographdeployments --all -n $NAMESPACE || true
# Uninstall the helm chart
helm ls
helm uninstall dynamo-platform || true
echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
kubectl delete namespace $NAMESPACE || true
echo "Namespace $NAMESPACE completed."
############################## RESULTS SUMMARY ##############################
# Builds a Markdown table (status, runner, duration, link) for the jobs of this
# workflow run and publishes it to the job log, the step summary and an artifact.
results-summary:
  name: Results Summary
  runs-on: ubuntu-latest
  # always(): produce the table even when upstream jobs failed or were skipped.
  if: always()
  needs: [build-amd64, build-arm64, build-cuda13-amd64, build-cuda13-arm64, unit-tests, integration-tests, e2e-single-gpu-tests, e2e-multi-gpu-tests, fault-tolerance-tests]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Gather job metadata
      id: gather
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        PIPELINE_TYPE: ${{ inputs.pipeline_type }}
      run: |
        set +x
        # pipefail: without it only jq's exit status is checked, so a curl
        # network failure would silently yield an empty results table.
        set -eo pipefail
        # ${PIPELINE_TYPE^} upper-cases the first character for the heading.
        echo "# ${PIPELINE_TYPE^} CI Results Summary" > results.md
        echo "" >> results.md
        echo "| Stage | Status | Runner | Duration (min) | Artifacts |" >> results.md
        echo "|-------|--------|--------|----------------|-----------|" >> results.md
        # NOTE: a single page of at most 100 jobs is requested; no pagination.
        # curl's stderr is left visible (-S) so API/network errors reach the log.
        curl -s -S -L --fail-with-body \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github.v3+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
          | jq -c '.jobs[] | {id, name, runner_name, conclusion, started_at, completed_at}' > jobs.jsonl
        # IFS= / -r keep each JSON line byte-exact (no backslash stripping).
        while IFS= read -r job_entry; do
          job_id=$(echo "$job_entry" | jq -r '.id')
          name=$(echo "$job_entry" | jq -r '.name')
          runner=$(echo "$job_entry" | jq -r '.runner_name')
          status=$(echo "$job_entry" | jq -r '.conclusion')
          started=$(echo "$job_entry" | jq -r '.started_at')
          completed=$(echo "$job_entry" | jq -r '.completed_at')
          # Jobs that have not started or finished have no duration yet.
          minutes="N/A"
          if [[ "$started" != "null" && "$completed" != "null" ]]; then
            start_epoch=$(date -d "$started" +%s)
            end_epoch=$(date -d "$completed" +%s)
            minutes=$(( (end_epoch - start_epoch)/60 ))
          fi
          artifact_link="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#job-$job_id"
          printf "| %s | %s | %s | %s | [Log & Artifacts](%s) |\n" "$name" "$status" "$runner" "$minutes" "$artifact_link" >> results.md
        done < jobs.jsonl
        echo "" >> results.md
        echo "---" >> results.md
    - name: Display workflow summary
      run: cat results.md
    - name: Upload results summary as job summary
      run: cat results.md >> "$GITHUB_STEP_SUMMARY"
    - name: Upload results as artifact
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: ${{ inputs.pipeline_type }}-results-summary
        path: results.md
        retention-days: 7
############################## SLACK NOTIFICATION ##############################
# Posts a pass/fail digest of this workflow run to a Slack incoming webhook.
# Runs only when the caller enabled notifications and the repository is not a fork.
notify-slack:
  name: Notify Slack
  runs-on: prod-builder-amd-v1
  if: always() && inputs.enable_slack_notification && !github.event.repository.fork
  needs: results-summary
  permissions:
    contents: read
  env:
    # Secrets cannot be referenced in a step-level `if`, so expose a boolean flag instead.
    HAS_SLACK_WEBHOOK: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL != '' }}
  steps:
    - name: Send Slack notification
      if: env.HAS_SLACK_WEBHOOK == 'true'
      # A failed notification must not turn the whole workflow red.
      continue-on-error: true
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        SLACK_OPS_GROUP_ID: ${{ secrets.SLACK_OPS_SUPPORT_GROUP_ID }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        PIPELINE_TYPE: ${{ inputs.pipeline_type }}
      run: |
        set -euo pipefail
        JOBS_JSON=$(mktemp)
        trap 'rm -f "$JOBS_JSON"' EXIT
        # --fail: treat HTTP 4xx/5xx as an error here, instead of saving the
        # error body and crashing later in jq with a misleading message.
        if ! curl -sSL --fail \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
          > "$JOBS_JSON"; then
          echo "Error: Failed to fetch job data from GitHub API"
          exit 1
        fi
        if [ ! -s "$JOBS_JSON" ]; then
          echo "Error: No job data received"
          exit 1
        fi
        # NOTE(review): TOTAL_JOBS counts every job returned by the API, and only
        # conclusion == "failure" counts as failed (not cancelled/timed_out) -
        # confirm this is the intended definition of "passed".
        TOTAL_JOBS=$(jq '[.jobs[]] | length' "$JOBS_JSON")
        SUCCESS_COUNT=$(jq '[.jobs[] | select(.conclusion == "success")] | length' "$JOBS_JSON")
        FAILED_COUNT=$(jq '[.jobs[] | select(.conclusion == "failure")] | length' "$JOBS_JSON")
        if [ "$FAILED_COUNT" -eq 0 ]; then
          STATUS="Success ✅"
          STATUS_EMOJI=":white_check_mark:"
        else
          STATUS="Failed ❌"
          STATUS_EMOJI=":x:"
        fi
        # Capitalize pipeline type for display
        DISPLAY_TYPE="${PIPELINE_TYPE^}"
        # Main message with summary
        SUMMARY_TEXT="*${DISPLAY_TYPE} CI Pipeline - ${STATUS}*"$'\n'"Summary: ${SUCCESS_COUNT}/${TOTAL_JOBS} jobs passed"$'\n'"<${RUN_URL}|View Workflow Summary>"
        if [ "$FAILED_COUNT" -eq 0 ]; then
          # Success - simple message
          PAYLOAD=$(jq -n \
            --arg text "$SUMMARY_TEXT" \
            '{text: $text}')
        else
          # Failed - message with blocks
          FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | "• " + .name' "$JOBS_JSON")
          FAILED_JOBS_TEXT="*Failed Jobs (${FAILED_COUNT}):*"$'\n'"${FAILED_JOBS}"
          # Build ops-support mention (use group ID if available, otherwise plain text)
          if [ -n "${SLACK_OPS_GROUP_ID:-}" ]; then
            OPS_MENTION="<!subteam^${SLACK_OPS_GROUP_ID}|@ops-support>"
          else
            OPS_MENTION="@ops-support"
          fi
          ACTION_TEXT=":rotating_light: cc ${OPS_MENTION} - Please investigate the failures above."
          # jq -n builds the JSON so message text is escaped correctly.
          PAYLOAD=$(jq -n \
            --arg summary "$SUMMARY_TEXT" \
            --arg failed "$FAILED_JOBS_TEXT" \
            --arg action "$ACTION_TEXT" \
            '{
              text: $summary,
              blocks: [
                {
                  type: "section",
                  text: {
                    type: "mrkdwn",
                    text: $summary
                  }
                },
                {
                  type: "section",
                  text: {
                    type: "mrkdwn",
                    text: $failed
                  }
                },
                {
                  type: "divider"
                },
                {
                  type: "context",
                  elements: [
                    {
                      type: "mrkdwn",
                      text: $action
                    }
                  ]
                }
              ]
            }')
        fi
        if curl -sSf -X POST -H "Content-Type: application/json" -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"; then
          echo "Slack notification sent successfully"
        else
          echo "Warning: Failed to send Slack notification"
          exit 1
        fi
...@@ -20,8 +20,15 @@ on: ...@@ -20,8 +20,15 @@ on:
permissions: permissions:
contents: write contents: write
env:
REGISTRY_IMAGE: ai-dynamo/dynamo
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs: jobs:
# Gate job for manual triggers - requires automated-release approval # ============================================================================
# GATE: Approval + Version Extraction
# ============================================================================
manual-approval: manual-approval:
name: Approve Manual Run name: Approve Manual Run
if: github.event_name == 'workflow_dispatch' if: github.event_name == 'workflow_dispatch'
...@@ -29,9 +36,8 @@ jobs: ...@@ -29,9 +36,8 @@ jobs:
environment: automated-release environment: automated-release
steps: steps:
- name: Manual run approved - name: Manual run approved
run: echo "Manual workflow run approved via automated-release environment" run: echo "Manual workflow run approved via automated-release environment"
# Extract version from branch name for downstream jobs
prepare-release: prepare-release:
name: Prepare Release name: Prepare Release
runs-on: ubuntu-latest runs-on: ubuntu-latest
...@@ -42,11 +48,9 @@ jobs: ...@@ -42,11 +48,9 @@ jobs:
- name: Extract version from branch - name: Extract version from branch
id: extract id: extract
run: | run: |
# Extract version from branch name (e.g., release/0.7.0 -> 0.7.0)
BRANCH_NAME="${GITHUB_REF#refs/heads/}" BRANCH_NAME="${GITHUB_REF#refs/heads/}"
VERSION="${BRANCH_NAME#release/}" VERSION="${BRANCH_NAME#release/}"
# Enforce workflow_dispatch only runs on release/* branches
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
if [[ ! "$BRANCH_NAME" =~ ^release/[0-9]+\.[0-9]+\.[0-9]+$ ]]; then if [[ ! "$BRANCH_NAME" =~ ^release/[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "Error: workflow_dispatch can only be triggered from release/* branches" echo "Error: workflow_dispatch can only be triggered from release/* branches"
...@@ -66,37 +70,153 @@ jobs: ...@@ -66,37 +70,153 @@ jobs:
echo "image_prefix=release-${VERSION}" >> $GITHUB_OUTPUT echo "image_prefix=release-${VERSION}" >> $GITHUB_OUTPUT
echo "Detected version: ${VERSION}" echo "Detected version: ${VERSION}"
# Run the CI test suite (builds + tests) # ============================================================================
ci-pipeline: # FRAMEWORK PIPELINES (Build + Test + Distribute)
name: Release CI # Builds amd64+arm64 images, runs tests, copies amd64 to ACR.
# release-publish then copies both architectures from ECR to NGC.
#
# NOTE: Each job directly depends on [prepare-release, manual-approval] with
# always() instead of going through an intermediate gate job. This avoids a
# GitHub Actions quirk where a skipped ancestor (manual-approval on push
# events) taints the entire dependency chain, causing downstream jobs to skip
# even when the intermediate gate succeeds.
# ============================================================================
vllm-pipeline:
name: vllm builds
needs: [prepare-release, manual-approval] needs: [prepare-release, manual-approval]
# Run if: prepare-release succeeded AND (push event OR manual-approval succeeded)
if: | if: |
always() && always() &&
needs.prepare-release.result == 'success' && needs.prepare-release.result == 'success' &&
(github.event_name == 'push' || needs.manual-approval.result == 'success') (github.event_name == 'push' || needs.manual-approval.result == 'success')
uses: ./.github/workflows/ci-test-suite.yml uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
with: with:
pipeline_type: release framework: vllm
include_nightly_marks: false target: runtime
image_prefix: ${{ needs.prepare-release.outputs.image_prefix }} platforms: '["amd64", "arm64"]'
enable_slack_notification: false cuda_versions: '["12.9", "13.0"]'
secrets: extra_tags: |
AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} ${{ needs.prepare-release.outputs.image_prefix }}-vllm
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} build_timeout_minutes: 120
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
NGC_CI_ACCESS_TOKEN: ${{ secrets.NGC_CI_ACCESS_TOKEN }} cpu_only_test_timeout_minutes: 60
CI_TOKEN: ${{ secrets.CI_TOKEN }} single_gpu_test_markers: '(pre_merge or post_merge) and vllm and gpu_1'
SCCACHE_S3_BUCKET: ${{ secrets.SCCACHE_S3_BUCKET }} single_gpu_test_timeout_minutes: 60
AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }} multi_gpu_test_markers: '(pre_merge or post_merge) and vllm and (gpu_2 or gpu_4)'
AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }} multi_gpu_test_timeout_minutes: 60
AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }} secrets: inherit
AZURE_AKS_CI_KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
HF_TOKEN: ${{ secrets.HF_TOKEN }} sglang-pipeline:
DYNAMO_INGRESS_SUFFIX: ${{ secrets.DYNAMO_INGRESS_SUFFIX }} name: sglang builds
needs: [prepare-release, manual-approval]
# Build frontend images (needed for NGC publish) if: |
always() &&
needs.prepare-release.result == 'success' &&
(github.event_name == 'push' || needs.manual-approval.result == 'success')
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
with:
framework: sglang
target: runtime
platforms: '["amd64", "arm64"]'
cuda_versions: '["12.9", "13.0"]'
extra_tags: |
${{ needs.prepare-release.outputs.image_prefix }}-sglang
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
build_timeout_minutes: 120
cpu_only_test_markers: '(pre_merge or post_merge) and sglang and gpu_0'
cpu_only_test_timeout_minutes: 60
single_gpu_test_markers: '(pre_merge or post_merge) and sglang and gpu_1'
single_gpu_test_timeout_minutes: 60
multi_gpu_test_markers: '(pre_merge or post_merge) and sglang and (gpu_2 or gpu_4)'
multi_gpu_test_timeout_minutes: 60
secrets: inherit
# Calls the shared flavor-matrix workflow for the trtllm framework
# (amd64 + arm64, CUDA 13.1) with the release image tag added via extra_tags.
trtllm-pipeline:
  name: trtllm builds
  needs:
    - prepare-release
    - manual-approval
  # always() keeps this job eligible when manual-approval was skipped; it then
  # requires prepare-release to have succeeded and either a push event or an
  # approved manual run.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # What to build
    framework: trtllm
    target: runtime
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    extra_tags: |
      ${{ needs.prepare-release.outputs.image_prefix }}-trtllm
    # How to build
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    build_timeout_minutes: 120
    # Test selection, split by the GPU count each marker group names
    cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: '(pre_merge or post_merge) and trtllm and gpu_1'
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: '(pre_merge or post_merge) and trtllm and (gpu_2 or gpu_4)'
    multi_gpu_test_timeout_minutes: 60
# ============================================================================
# RELEASE-SPECIFIC BUILDS
# ============================================================================
# Lints, tests and builds the operator image from deploy/operator, then pushes a
# multi-arch (amd64 + arm64) image to both ECR and ACR under a SHA tag and a
# release-prefix tag.
operator-build:
  name: Build Operator Image
  runs-on: prod-default-v2
  needs:
    - prepare-release
    - manual-approval
  # Same gate as the framework pipelines: prepare-release succeeded and the run
  # is either a push or an approved manual dispatch.
  if: >-
    always() &&
    needs.prepare-release.result == 'success' &&
    (github.event_name == 'push' || needs.manual-approval.result == 'success')
  env:
    ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
    IMAGE_REGISTRY: ai-dynamo
    IMAGE_REPOSITORY: dynamo
  outputs:
    # Release-prefix tag (without registry) written by the build-and-push step.
    operator_tag: ${{ steps.build-and-push.outputs.operator_tag }}
  steps:
    - uses: actions/checkout@v4
    - name: Initialize Dynamo Builder
      uses: ./.github/actions/init-dynamo-builder
      with:
        builder_name: ${{ env.BUILDER_NAME }}
        flavor: general
        all_arch: 'true'
    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
    # The linter and tester are Dockerfile targets; building them runs the checks.
    - name: Linter
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
    - name: Tester
      working-directory: ./deploy/operator
      run: |
        docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
    - name: Build and push Container
      id: build-and-push
      working-directory: ./deploy/operator
      run: |
        ecr_repo="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        acr_repo="${{ secrets.AZURE_ACR_HOSTNAME }}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
        sha_tag="${{ github.sha }}-operator"
        release_tag="${{ needs.prepare-release.outputs.image_prefix }}-operator"
        echo "operator_tag=${release_tag}" >> "$GITHUB_OUTPUT"
        # One "-t <repo>:<tag>" pair per registry/tag combination, in the order
        # ECR sha, ECR release, ACR sha, ACR release.
        tag_args=()
        for repo in "$ecr_repo" "$acr_repo"; do
          for tag in "$sha_tag" "$release_tag"; do
            tag_args+=(-t "${repo}:${tag}")
          done
        done
        docker buildx build --push --platform linux/amd64,linux/arm64 \
          --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
          "${tag_args[@]}" -f Dockerfile .
frontend-build: frontend-build:
name: Build Frontend Images name: Build Frontend Images
needs: [prepare-release, manual-approval] needs: [prepare-release, manual-approval]
...@@ -105,30 +225,193 @@ jobs: ...@@ -105,30 +225,193 @@ jobs:
needs.prepare-release.result == 'success' && needs.prepare-release.result == 'success' &&
(github.event_name == 'push' || needs.manual-approval.result == 'success') (github.event_name == 'push' || needs.manual-approval.result == 'success')
uses: ./.github/workflows/build-frontend-image.yaml uses: ./.github/workflows/build-frontend-image.yaml
secrets: with:
AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} skip_change_detection: true
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} image_prefix: ${{ needs.prepare-release.outputs.image_prefix }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} secrets: inherit
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }} # ============================================================================
AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }} # BUILDER CLEANUP
AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }} # ============================================================================
CI_TOKEN: ${{ secrets.CI_TOKEN }}
SCCACHE_S3_BUCKET: ${{ secrets.SCCACHE_S3_BUCKET }} clean-k8s-builder:
name: Clean K8s builder if exists
# Tag the commit as release candidate and publish to NGC runs-on: prod-default-small-v2
# This job uses the automated-release environment for sensitive secrets if: always()
# Runs after ci-pipeline and frontend-build complete - requires builds to succeed needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline]
# Note: Tests may fail but builds must succeed for publishing steps:
- uses: actions/checkout@v4
- name: Create K8s builders (skip bootstrap)
uses: ./.github/actions/bootstrap-buildkit
continue-on-error: true
with:
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
buildkit_worker_addresses: ''
skip_bootstrap: true
- name: Builder Cleanup
run: docker buildx rm b-${{ github.run_id }}-${{ github.run_attempt }} || true
# ============================================================================
# DEPLOYMENT TESTS
# ============================================================================
# Creates a per-run namespace on the cluster described by the CI kubeconfig and
# installs the dynamo-platform Helm chart there with the operator image tagged
# "<image_prefix>-operator". The namespace name is exported for later jobs.
deploy-operator:
  name: Deploy Operator
  runs-on: prod-default-small-v2
  needs: [prepare-release, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator-build]
  # Only the operator build result is checked; the framework pipelines are
  # listed in needs but their results do not gate this job.
  if: |
    always() &&
    needs.operator-build.result == 'success'
  outputs:
    NAMESPACE: ${{ steps.deploy.outputs.namespace }}
  steps:
    - uses: actions/checkout@v4
    - name: Deploy Operator
      id: deploy
      # Secrets and the branch name are passed through the environment rather
      # than interpolated into the script text, so shell metacharacters in their
      # values cannot alter or break the commands below.
      env:
        BRANCH: ${{ github.ref_name }}
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        ACR_USER: ${{ secrets.AZURE_ACR_USER }}
        ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
        OPERATOR_TAG: ${{ needs.prepare-release.outputs.image_prefix }}-operator
      run: |
        set -x
        # Namespace: gh-ci-<run id>-<branch with "/" and "." replaced by "-",
        # truncated to 10 characters>-dt
        BRANCH_SANITIZED="${BRANCH//\//-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
        NAMESPACE="gh-ci-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
        echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl create namespace "$NAMESPACE"
        kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
        kubectl config set-context --current --namespace="$NAMESPACE"
        # Best effort: a failure to create the HF token secret does not abort the deploy.
        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "$NAMESPACE" || true
        kubectl create secret docker-registry docker-imagepullsecret \
          --docker-server="${ACR_HOSTNAME}" \
          --docker-username="${ACR_USER}" \
          --docker-password="${ACR_PASSWORD}" \
          --namespace="${NAMESPACE}"
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/helm/charts/platform/
        helm dep build .
        # --set arguments are quoted so "[0]" is never treated as a shell glob.
        helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
          --set "dynamo-operator.controllerManager.manager.image.repository=${ACR_HOSTNAME}/ai-dynamo/dynamo" \
          --set "dynamo-operator.controllerManager.manager.image.tag=${OPERATOR_TAG}" \
          --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
          --set dynamo-operator.gpuDiscovery.enabled=false \
          --set dynamo-operator.upgradeCRD=false \
          --debug
        # Block for up to 5 minutes until every deployment in the namespace has rolled out.
        timeout 300s kubectl rollout status deployment -n "$NAMESPACE" --watch
# Runs the dynamo-deploy-test action once per vllm profile against the
# namespace exported by deploy-operator.
deploy-test-vllm:
  name: deploy-test-vllm (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - vllm-pipeline
  # Gated on deploy-operator only; the vllm-pipeline result is not checked here.
  if: always() && needs.deploy-operator.result == 'success'
  strategy:
    fail-fast: false
    # Profiles run one at a time (presumably because they share one namespace - confirm).
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  steps:
    - uses: actions/checkout@v4
    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: vllm
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda12-amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# Runs the dynamo-deploy-test action once per sglang profile against the
# namespace exported by deploy-operator.
deploy-test-sglang:
  name: deploy-test-sglang (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - sglang-pipeline
  # Gated on deploy-operator only; the sglang-pipeline result is not checked here.
  if: always() && needs.deploy-operator.result == 'success'
  strategy:
    fail-fast: false
    # Profiles run one at a time (presumably because they share one namespace - confirm).
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
  steps:
    - uses: actions/checkout@v4
    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: sglang
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda12-amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# Runs the dynamo-deploy-test action once per trtllm profile against the
# namespace exported by deploy-operator.
deploy-test-trtllm:
  name: deploy-test-trtllm (${{ matrix.profile }})
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - trtllm-pipeline
  # Gated on deploy-operator only; the trtllm-pipeline result is not checked here.
  if: always() && needs.deploy-operator.result == 'success'
  strategy:
    fail-fast: false
    # Profiles run one at a time (presumably because they share one namespace - confirm).
    max-parallel: 1
    matrix:
      profile:
        - agg
        - agg_router
        - disagg
        - disagg_router
  steps:
    - uses: actions/checkout@v4
    - name: Run Dynamo Deploy Test
      uses: ./.github/actions/dynamo-deploy-test
      with:
        framework: trtllm
        profile: ${{ matrix.profile }}
        platform_arch: amd64
        image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-cuda13-amd64
        namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
        kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# Tears down what deploy-operator created: DynamoGraphDeployments, the
# dynamo-platform Helm release and finally the namespace itself.
deploy-cleanup:
  name: Cleanup AKS resources
  runs-on: prod-default-small-v2
  needs:
    - deploy-operator
    - deploy-test-sglang
    - deploy-test-trtllm
    - deploy-test-vllm
  # always(): clean up regardless of how the deploy tests ended.
  if: always()
  steps:
    - uses: actions/checkout@v4
    - name: Cleanup
      timeout-minutes: 5
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        # deploy-operator exports no namespace when it did not get that far.
        [ -n "$NAMESPACE" ] || { echo "No namespace to clean up"; exit 0; }
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        # Listing for the job log; failures here are ignored.
        kubectl get dynamographdeployments || true
        kubectl get all || true
        # Every teardown command is best effort so later ones still run.
        kubectl delete dynamographdeployments --all -n $NAMESPACE || true
        helm uninstall dynamo-platform --namespace $NAMESPACE --timeout 10m || true
        kubectl delete namespace $NAMESPACE || true
# ============================================================================
# NGC PUBLISH: RC tag, crane copy to NGC, Helm chart push
# Runs after framework builds + operator + frontend complete.
# Tests may fail but builds must have produced images for publishing.
# ============================================================================
release-publish: release-publish:
name: Tag RC & Publish to NGC name: Tag RC & Publish to NGC
needs: [prepare-release, ci-pipeline, frontend-build] needs: [prepare-release, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator-build, frontend-build]
if: | if: |
always() && !cancelled() && always() && !cancelled() &&
needs.prepare-release.result == 'success' && needs.prepare-release.result == 'success' &&
(needs.ci-pipeline.result == 'success' || needs.ci-pipeline.result == 'failure') && (needs.vllm-pipeline.result == 'success' || needs.sglang-pipeline.result == 'success' || needs.trtllm-pipeline.result == 'success')
(needs.frontend-build.result == 'success' || needs.frontend-build.result == 'failure') runs-on: cpu-amd-m5-4xlarge
runs-on: cpu-amd-m5-4xlarge # Self-hosted runner with IAM instance role for ECR access
environment: automated-release environment: automated-release
env: env:
VERSION: ${{ needs.prepare-release.outputs.version }} VERSION: ${{ needs.prepare-release.outputs.version }}
...@@ -149,9 +432,7 @@ jobs: ...@@ -149,9 +432,7 @@ jobs:
run: | run: |
set -euo pipefail set -euo pipefail
# Check if RC number was provided as input
if [ -n "${INPUT_RC_NUMBER}" ]; then if [ -n "${INPUT_RC_NUMBER}" ]; then
# Validate input is a non-negative integer
if ! [[ "${INPUT_RC_NUMBER}" =~ ^[0-9]+$ ]]; then if ! [[ "${INPUT_RC_NUMBER}" =~ ^[0-9]+$ ]]; then
echo "Error: rc_number must be a non-negative integer (got: ${INPUT_RC_NUMBER})" echo "Error: rc_number must be a non-negative integer (got: ${INPUT_RC_NUMBER})"
exit 1 exit 1
...@@ -159,21 +440,14 @@ jobs: ...@@ -159,21 +440,14 @@ jobs:
NEXT_RC="${INPUT_RC_NUMBER}" NEXT_RC="${INPUT_RC_NUMBER}"
echo "Using provided RC number: ${NEXT_RC}" echo "Using provided RC number: ${NEXT_RC}"
else else
# Auto-increment: Find existing RC tags for this version
echo "No RC number provided. Auto-incrementing..." echo "No RC number provided. Auto-incrementing..."
echo "Looking for existing RC tags for version ${VERSION}..."
# Pattern: vX.Y.Z-rcN
RC_PATTERN="v${VERSION}-rc" RC_PATTERN="v${VERSION}-rc"
# Get all matching tags sorted by RC number
EXISTING_RCS=$(git tag -l "${RC_PATTERN}*" | grep -E "^v${VERSION}-rc[0-9]+$" | sort -V || true) EXISTING_RCS=$(git tag -l "${RC_PATTERN}*" | grep -E "^v${VERSION}-rc[0-9]+$" | sort -V || true)
if [ -z "$EXISTING_RCS" ]; then if [ -z "$EXISTING_RCS" ]; then
NEXT_RC=0 NEXT_RC=0
echo "No existing RC tags found. Starting with rc0." echo "No existing RC tags found. Starting with rc0."
else else
# Get the highest RC number
LAST_RC=$(echo "$EXISTING_RCS" | tail -1) LAST_RC=$(echo "$EXISTING_RCS" | tail -1)
LAST_RC_NUM=${LAST_RC#v${VERSION}-rc} LAST_RC_NUM=${LAST_RC#v${VERSION}-rc}
NEXT_RC=$((LAST_RC_NUM + 1)) NEXT_RC=$((LAST_RC_NUM + 1))
...@@ -187,6 +461,7 @@ jobs: ...@@ -187,6 +461,7 @@ jobs:
echo "rc_tag=${RC_TAG}" >> $GITHUB_OUTPUT echo "rc_tag=${RC_TAG}" >> $GITHUB_OUTPUT
echo "rc_number=${NEXT_RC}" >> $GITHUB_OUTPUT echo "rc_number=${NEXT_RC}" >> $GITHUB_OUTPUT
echo "ngc_version_tag=${VERSION}rc${NEXT_RC}" >> $GITHUB_OUTPUT echo "ngc_version_tag=${VERSION}rc${NEXT_RC}" >> $GITHUB_OUTPUT
echo "helm_chart_version=${VERSION}-rc${NEXT_RC}" >> $GITHUB_OUTPUT
echo "Will create tag: ${RC_TAG}" echo "Will create tag: ${RC_TAG}"
- name: Create RC tag - name: Create RC tag
...@@ -194,23 +469,16 @@ jobs: ...@@ -194,23 +469,16 @@ jobs:
RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }} RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
run: | run: |
set -euo pipefail set -euo pipefail
git config user.name "github-actions[bot]" git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com" git config user.email "github-actions[bot]@users.noreply.github.com"
# Create annotated tag
git tag -a "${RC_TAG}" -m "Release candidate ${RC_TAG}" git tag -a "${RC_TAG}" -m "Release candidate ${RC_TAG}"
# Push the tag
git push origin "${RC_TAG}" git push origin "${RC_TAG}"
echo "Created and pushed tag: ${RC_TAG}"
echo "✅ Created and pushed tag: ${RC_TAG}"
- name: Setup crane - name: Setup crane
env: env:
CRANE_VERSION: v0.20.2 CRANE_VERSION: v0.20.2
run: | run: |
# Download crane from official Google releases
curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \ curl -sL "https://github.com/google/go-containerregistry/releases/download/${CRANE_VERSION}/go-containerregistry_Linux_x86_64.tar.gz" \
| tar -xzf - crane | tar -xzf - crane
sudo mv crane /usr/local/bin/ sudo mv crane /usr/local/bin/
...@@ -220,9 +488,7 @@ jobs: ...@@ -220,9 +488,7 @@ jobs:
run: | run: |
ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
echo "Logging into ECR..."
aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | docker login --username AWS --password-stdin "${ECR_HOSTNAME}" aws ecr get-login-password --region ${AWS_DEFAULT_REGION} | docker login --username AWS --password-stdin "${ECR_HOSTNAME}"
echo "✅ ECR login successful"
- name: Login to NGC - name: Login to NGC
env: env:
...@@ -236,143 +502,127 @@ jobs: ...@@ -236,143 +502,127 @@ jobs:
env: env:
NGC_REGISTRY: nvcr.io NGC_REGISTRY: nvcr.io
NGC_ORG: ${{ secrets.NGC_PUBLISH_ORG }} NGC_ORG: ${{ secrets.NGC_PUBLISH_ORG }}
RC_NUMBER: ${{ steps.rc_tag.outputs.rc_number }}
NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }} NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
CI_PIPELINE_RESULT: ${{ needs.ci-pipeline.result }}
FRONTEND_BUILD_RESULT: ${{ needs.frontend-build.result }}
run: | run: |
set -euo pipefail set -euo pipefail
# Track success/failure for summary
SUCCESSFUL_COPIES=() SUCCESSFUL_COPIES=()
FAILED_COPIES=() FAILED_COPIES=()
# Get ECR hostname from instance role
ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com" ECR_HOSTNAME="${ACCOUNT_ID}.dkr.ecr.${AWS_DEFAULT_REGION}.amazonaws.com"
FRAMEWORKS=("vllm" "trtllm" "sglang")
ARCHITECTURES=("amd64" "arm64") ARCHITECTURES=("amd64" "arm64")
echo "========================================" echo "========================================"
echo "Build Status:"
echo " CI Pipeline: ${CI_PIPELINE_RESULT}"
echo " Frontend Build: ${FRONTEND_BUILD_RESULT}"
echo "========================================"
echo ""
echo "Copying images from ECR to NGC (registry-to-registry)" echo "Copying images from ECR to NGC (registry-to-registry)"
echo "NGC Version Tag: ${NGC_VERSION_TAG}" echo "NGC Version Tag: ${NGC_VERSION_TAG}"
echo "========================================"
copy_image() {
local SRC="$1" DST="$2" LABEL="$3"
echo "----------------------------------------"
echo "Copying: ${LABEL}"
if crane copy "${SRC}" "${DST}"; then
echo " Copied: ${LABEL}"
SUCCESSFUL_COPIES+=("${LABEL}")
return 0
else
echo " Warning: Failed to copy ${LABEL}, skipping..."
FAILED_COPIES+=("${LABEL}")
return 1
fi
}
create_manifest() {
local MANIFEST="$1" AMD64_IMG="$2" ARM64_IMG="$3" LABEL="$4"
echo "Creating manifest: ${MANIFEST}"
docker manifest create "${MANIFEST}" "${AMD64_IMG}" "${ARM64_IMG}" || true
if docker manifest push "${MANIFEST}"; then
echo " Created multi-arch: ${LABEL}"
SUCCESSFUL_COPIES+=("${LABEL} (multi-arch)")
else
echo " Failed to create multi-arch: ${LABEL}"
FAILED_COPIES+=("${LABEL} (multi-arch)")
fi
}
# Copy runtime images (from ci-test-suite.yml) # ---- CUDA 12 runtime images (vllm and sglang) ----
for FRAMEWORK in "${FRAMEWORKS[@]}"; do echo ""
echo "=== CUDA 12 Runtime Images (vllm, sglang) ==="
CUDA12_FRAMEWORKS=("vllm" "sglang")
for FRAMEWORK in "${CUDA12_FRAMEWORKS[@]}"; do
NGC_NAME="${FRAMEWORK}-runtime"
for ARCH in "${ARCHITECTURES[@]}"; do for ARCH in "${ARCHITECTURES[@]}"; do
SOURCE_TAG="${IMAGE_PREFIX}-${FRAMEWORK}-${ARCH}" SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-${FRAMEWORK}-cuda12-${ARCH}"
SOURCE_IMAGE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${SOURCE_TAG}" TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-${ARCH}"
NGC_TAG="${NGC_VERSION_TAG}-${ARCH}" copy_image "${SOURCE}" "${TARGET}" "${NGC_NAME}:${NGC_VERSION_TAG}-${ARCH}"
NGC_IMAGE="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${FRAMEWORK}-runtime:${NGC_TAG}"
echo "----------------------------------------"
echo "Copying: ${FRAMEWORK}-runtime:${NGC_TAG}"
if crane copy "${SOURCE_IMAGE}" "${NGC_IMAGE}"; then
echo "✅ Copied: ${FRAMEWORK}-runtime:${NGC_TAG}"
SUCCESSFUL_COPIES+=("${FRAMEWORK}-runtime:${NGC_TAG}")
else
echo "⚠️ Warning: Failed to copy ${FRAMEWORK} (${ARCH}), skipping..."
FAILED_COPIES+=("${FRAMEWORK}-runtime:${NGC_TAG}")
fi
done done
create_manifest \
"${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}" \
"${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-amd64" \
"${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-arm64" \
"${NGC_NAME}:${NGC_VERSION_TAG}"
done done
# Copy CUDA 13 images - both architectures # ---- CUDA 13 runtime images (vllm, sglang, trtllm) ----
echo "" echo ""
echo "Copying CUDA 13 images from ECR..." echo "=== CUDA 13 Runtime Images (vllm, sglang, trtllm) ==="
CUDA13_FRAMEWORKS=("vllm" "sglang") CUDA13_FRAMEWORKS=("vllm" "sglang" "trtllm")
for FRAMEWORK in "${CUDA13_FRAMEWORKS[@]}"; do for FRAMEWORK in "${CUDA13_FRAMEWORKS[@]}"; do
if [ "${FRAMEWORK}" = "trtllm" ]; then
NGC_NAME="tensorrtllm-runtime"
else
NGC_NAME="${FRAMEWORK}-runtime"
fi
for ARCH in "${ARCHITECTURES[@]}"; do for ARCH in "${ARCHITECTURES[@]}"; do
SOURCE_TAG="${IMAGE_PREFIX}-${FRAMEWORK}-cuda13-${ARCH}" SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-${FRAMEWORK}-cuda13-${ARCH}"
SOURCE_IMAGE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${SOURCE_TAG}" TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-${ARCH}"
NGC_TAG="${NGC_VERSION_TAG}-cuda13-${ARCH}" copy_image "${SOURCE}" "${TARGET}" "${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-${ARCH}"
NGC_IMAGE="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${FRAMEWORK}-runtime:${NGC_TAG}"
echo "----------------------------------------"
echo "Copying: ${FRAMEWORK}-runtime:${NGC_TAG}"
if crane copy "${SOURCE_IMAGE}" "${NGC_IMAGE}"; then
echo "✅ Copied: ${FRAMEWORK}-runtime:${NGC_TAG}"
SUCCESSFUL_COPIES+=("${FRAMEWORK}-runtime:${NGC_TAG}")
else
echo "⚠️ Warning: Failed to copy ${FRAMEWORK} CUDA13 (${ARCH}), skipping..."
FAILED_COPIES+=("${FRAMEWORK}-runtime:${NGC_TAG}")
fi
done done
# Create multi-arch manifest create_manifest \
MULTIARCH="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${FRAMEWORK}-runtime:${NGC_VERSION_TAG}-cuda13" "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13" \
echo "Creating manifest: ${MULTIARCH}" "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-amd64" \
docker manifest create "${MULTIARCH}" \ "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${NGC_NAME}:${NGC_VERSION_TAG}-cuda13-arm64" \
"${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${FRAMEWORK}-runtime:${NGC_VERSION_TAG}-cuda13-amd64" \ "${NGC_NAME}:${NGC_VERSION_TAG}-cuda13"
"${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/${FRAMEWORK}-runtime:${NGC_VERSION_TAG}-cuda13-arm64" || true
if docker manifest push "${MULTIARCH}"; then
echo "✅ Created multi-arch: ${FRAMEWORK}-runtime:${NGC_VERSION_TAG}-cuda13"
SUCCESSFUL_COPIES+=("${FRAMEWORK}-runtime:${NGC_VERSION_TAG}-cuda13 (multi-arch)")
else
echo "⚠️ Failed to create ${FRAMEWORK} CUDA13 multi-arch"
FAILED_COPIES+=("${FRAMEWORK}-runtime:${NGC_VERSION_TAG}-cuda13 (multi-arch)")
fi
done done
# Copy frontend images from ECR (built by build-frontend-image.yaml) # ---- Frontend images ----
echo "" echo ""
echo "Copying frontend images from ECR..." echo "=== Frontend Images ==="
FRONTEND_IMAGES=() FRONTEND_IMAGES=()
for ARCH in "${ARCHITECTURES[@]}"; do for ARCH in "${ARCHITECTURES[@]}"; do
SOURCE_TAG="${{ github.sha }}-frontend-${ARCH}" SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-frontend-${ARCH}"
SOURCE_IMAGE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${SOURCE_TAG}" TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/dynamo-frontend:${NGC_VERSION_TAG}-${ARCH}"
NGC_TAG="${NGC_VERSION_TAG}-${ARCH}" if copy_image "${SOURCE}" "${TARGET}" "dynamo-frontend:${NGC_VERSION_TAG}-${ARCH}"; then
NGC_IMAGE="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/dynamo-frontend:${NGC_TAG}" FRONTEND_IMAGES+=("${TARGET}")
echo "----------------------------------------"
echo "Copying: dynamo-frontend:${NGC_TAG}"
if crane copy "${SOURCE_IMAGE}" "${NGC_IMAGE}"; then
echo "✅ Copied: dynamo-frontend:${NGC_TAG}"
SUCCESSFUL_COPIES+=("dynamo-frontend:${NGC_TAG}")
FRONTEND_IMAGES+=("${NGC_IMAGE}")
else
echo "⚠️ Warning: Failed to copy dynamo-frontend (${ARCH}), skipping..."
FAILED_COPIES+=("dynamo-frontend:${NGC_TAG}")
fi fi
done done
# Create multi-arch manifest for frontend
echo ""
echo "Creating multi-arch manifest for dynamo-frontend..."
FRONTEND_MULTIARCH="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/dynamo-frontend:${NGC_VERSION_TAG}"
if [ ${#FRONTEND_IMAGES[@]} -eq 2 ]; then if [ ${#FRONTEND_IMAGES[@]} -eq 2 ]; then
echo "Creating manifest index: ${FRONTEND_MULTIARCH}" create_manifest \
docker manifest create "${FRONTEND_MULTIARCH}" \ "${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/dynamo-frontend:${NGC_VERSION_TAG}" \
"${FRONTEND_IMAGES[0]}" \ "${FRONTEND_IMAGES[0]}" "${FRONTEND_IMAGES[1]}" \
"${FRONTEND_IMAGES[1]}" || true "dynamo-frontend:${NGC_VERSION_TAG}"
if docker manifest push "${FRONTEND_MULTIARCH}"; then
echo "✅ Created multi-arch manifest: dynamo-frontend:${NGC_VERSION_TAG}"
SUCCESSFUL_COPIES+=("dynamo-frontend:${NGC_VERSION_TAG} (multi-arch)")
else
echo "⚠️ Warning: Failed to create multi-arch manifest"
FAILED_COPIES+=("dynamo-frontend:${NGC_VERSION_TAG} (multi-arch)")
fi
else else
echo "⚠️ Warning: Not all architectures available, skipping multi-arch manifest" echo "Warning: Not all frontend architectures available, skipping multi-arch manifest"
FAILED_COPIES+=("dynamo-frontend:${NGC_VERSION_TAG} (multi-arch - missing archs)") FAILED_COPIES+=("dynamo-frontend:${NGC_VERSION_TAG} (multi-arch - missing archs)")
fi fi
# Output counts for summary # ---- Operator image (multi-arch manifest already built by operator-build) ----
echo ""
echo "=== Operator Image ==="
OPERATOR_SOURCE="${ECR_HOSTNAME}/${REGISTRY_IMAGE}:${IMAGE_PREFIX}-operator"
OPERATOR_TARGET="${NGC_REGISTRY}/${NGC_ORG}/ai-dynamo/kubernetes-operator:${NGC_VERSION_TAG}"
copy_image "${OPERATOR_SOURCE}" "${OPERATOR_TARGET}" "kubernetes-operator:${NGC_VERSION_TAG}"
# ---- Summary ----
echo "successful_count=${#SUCCESSFUL_COPIES[@]}" >> $GITHUB_OUTPUT echo "successful_count=${#SUCCESSFUL_COPIES[@]}" >> $GITHUB_OUTPUT
echo "failed_count=${#FAILED_COPIES[@]}" >> $GITHUB_OUTPUT echo "failed_count=${#FAILED_COPIES[@]}" >> $GITHUB_OUTPUT
# Save lists for summary (newline-separated)
printf '%s\n' "${SUCCESSFUL_COPIES[@]}" > /tmp/successful_copies.txt printf '%s\n' "${SUCCESSFUL_COPIES[@]}" > /tmp/successful_copies.txt
printf '%s\n' "${FAILED_COPIES[@]}" > /tmp/failed_copies.txt 2>/dev/null || true printf '%s\n' "${FAILED_COPIES[@]}" > /tmp/failed_copies.txt 2>/dev/null || true
...@@ -382,21 +632,60 @@ jobs: ...@@ -382,21 +632,60 @@ jobs:
echo " Failed: ${#FAILED_COPIES[@]}" echo " Failed: ${#FAILED_COPIES[@]}"
echo "========================================" echo "========================================"
# Fail the step if all copies failed
if [ ${#SUCCESSFUL_COPIES[@]} -eq 0 ]; then if [ ${#SUCCESSFUL_COPIES[@]} -eq 0 ]; then
echo "ERROR: No images were successfully copied to NGC!" echo "ERROR: No images were successfully copied to NGC!"
exit 1 exit 1
fi fi
- name: Package and push Helm charts to NGC
env:
NGC_HELM_REPO: https://helm.ngc.nvidia.com/${{ secrets.NGC_PUBLISH_ORG }}/ai-dynamo
NGC_TOKEN: ${{ secrets.NGC_PUBLISH_TOKEN }}
HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
run: |
set -euo pipefail
REPO_ALIAS="ngc-staging-dynamo"
helm plugin install https://github.com/chartmuseum/helm-push || true
helm repo add "${REPO_ALIAS}" \
--username='$oauthtoken' \
--password="${NGC_TOKEN}" \
"${NGC_HELM_REPO}" > /dev/null 2>&1
helm repo add nats https://nats-io.github.io/k8s/helm/charts/ || true
helm repo add bitnami https://charts.bitnami.com/bitnami || true
echo "" >> $GITHUB_STEP_SUMMARY
echo "### Helm Charts" >> $GITHUB_STEP_SUMMARY
PLATFORM_CHART_DIR="deploy/helm/charts/platform"
CHART_NAME=$(awk '/^name:/ {print $2}' "${PLATFORM_CHART_DIR}/Chart.yaml")
pushd "${PLATFORM_CHART_DIR}"
helm dep build .
popd
echo "Packaging ${CHART_NAME} with version ${HELM_CHART_VERSION}..."
helm package \
--version "${HELM_CHART_VERSION}" \
--app-version "${HELM_CHART_VERSION}" \
"${PLATFORM_CHART_DIR}"
CHART_FILE="${CHART_NAME}-${HELM_CHART_VERSION}.tgz"
echo "Pushing ${CHART_FILE} to NGC Helm registry..."
helm cm-push "${CHART_FILE}" "${REPO_ALIAS}"
echo "- \`${CHART_NAME}:${HELM_CHART_VERSION}\` pushed to NGC Helm registry" >> $GITHUB_STEP_SUMMARY
helm repo remove "${REPO_ALIAS}"
- name: Create release summary - name: Create release summary
env: env:
RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }} RC_TAG: ${{ steps.rc_tag.outputs.rc_tag }}
RC_NUMBER: ${{ steps.rc_tag.outputs.rc_number }}
NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }} NGC_VERSION_TAG: ${{ steps.rc_tag.outputs.ngc_version_tag }}
HELM_CHART_VERSION: ${{ steps.rc_tag.outputs.helm_chart_version }}
SUCCESSFUL_COUNT: ${{ steps.copy_images.outputs.successful_count }} SUCCESSFUL_COUNT: ${{ steps.copy_images.outputs.successful_count }}
FAILED_COUNT: ${{ steps.copy_images.outputs.failed_count }} FAILED_COUNT: ${{ steps.copy_images.outputs.failed_count }}
CI_PIPELINE_RESULT: ${{ needs.ci-pipeline.result }}
FRONTEND_BUILD_RESULT: ${{ needs.frontend-build.result }}
run: | run: |
echo "## Release Summary" >> $GITHUB_STEP_SUMMARY echo "## Release Summary" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
...@@ -408,35 +697,28 @@ jobs: ...@@ -408,35 +697,28 @@ jobs:
echo "| Commit | ${{ github.sha }} |" >> $GITHUB_STEP_SUMMARY echo "| Commit | ${{ github.sha }} |" >> $GITHUB_STEP_SUMMARY
echo "| Branch | ${{ github.ref_name }} |" >> $GITHUB_STEP_SUMMARY echo "| Branch | ${{ github.ref_name }} |" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "### Build Status" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "| Pipeline | Result |" >> $GITHUB_STEP_SUMMARY
echo "|----------|--------|" >> $GITHUB_STEP_SUMMARY
echo "| CI Pipeline | ${CI_PIPELINE_RESULT} |" >> $GITHUB_STEP_SUMMARY
echo "| Frontend Build | ${FRONTEND_BUILD_RESULT} |" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### NGC Publishing Results" >> $GITHUB_STEP_SUMMARY echo "### NGC Publishing Results" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "- **Successful copies**: ${SUCCESSFUL_COUNT}" >> $GITHUB_STEP_SUMMARY echo "- **Successful copies**: ${SUCCESSFUL_COUNT}" >> $GITHUB_STEP_SUMMARY
echo "- ⚠️ **Failed copies**: ${FAILED_COUNT}" >> $GITHUB_STEP_SUMMARY echo "- **Failed copies**: ${FAILED_COUNT}" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "### Expected Images" >> $GITHUB_STEP_SUMMARY echo "### Expected Images" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "Runtime images (CUDA 12):" >> $GITHUB_STEP_SUMMARY echo "Runtime images (CUDA 12 - default):" >> $GITHUB_STEP_SUMMARY
echo "- \`vllm-runtime:${NGC_VERSION_TAG}-{amd64,arm64}\`" >> $GITHUB_STEP_SUMMARY echo "- \`vllm-runtime:${NGC_VERSION_TAG}\` (multi-arch)" >> $GITHUB_STEP_SUMMARY
echo "- \`trtllm-runtime:${NGC_VERSION_TAG}-{amd64,arm64}\`" >> $GITHUB_STEP_SUMMARY echo "- \`sglang-runtime:${NGC_VERSION_TAG}\` (multi-arch)" >> $GITHUB_STEP_SUMMARY
echo "- \`sglang-runtime:${NGC_VERSION_TAG}-{amd64,arm64}\`" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "Runtime images (CUDA 13):" >> $GITHUB_STEP_SUMMARY echo "Runtime images (CUDA 13):" >> $GITHUB_STEP_SUMMARY
echo "- \`vllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch: amd64, arm64)" >> $GITHUB_STEP_SUMMARY echo "- \`vllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)" >> $GITHUB_STEP_SUMMARY
echo "- \`vllm-runtime:${NGC_VERSION_TAG}-cuda13-amd64\`" >> $GITHUB_STEP_SUMMARY echo "- \`sglang-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)" >> $GITHUB_STEP_SUMMARY
echo "- \`vllm-runtime:${NGC_VERSION_TAG}-cuda13-arm64\`" >> $GITHUB_STEP_SUMMARY echo "- \`tensorrtllm-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch)" >> $GITHUB_STEP_SUMMARY
echo "- \`sglang-runtime:${NGC_VERSION_TAG}-cuda13\` (multi-arch: amd64, arm64)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "- \`sglang-runtime:${NGC_VERSION_TAG}-cuda13-amd64\`" >> $GITHUB_STEP_SUMMARY echo "Operator image:" >> $GITHUB_STEP_SUMMARY
echo "- \`sglang-runtime:${NGC_VERSION_TAG}-cuda13-arm64\`" >> $GITHUB_STEP_SUMMARY echo "- \`kubernetes-operator:${NGC_VERSION_TAG}\`" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "Frontend images:" >> $GITHUB_STEP_SUMMARY echo "Frontend images:" >> $GITHUB_STEP_SUMMARY
echo "- \`dynamo-frontend:${NGC_VERSION_TAG}\` (multi-arch: amd64, arm64)" >> $GITHUB_STEP_SUMMARY echo "- \`dynamo-frontend:${NGC_VERSION_TAG}\` (multi-arch)" >> $GITHUB_STEP_SUMMARY
echo "- \`dynamo-frontend:${NGC_VERSION_TAG}-amd64\`" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY
echo "- \`dynamo-frontend:${NGC_VERSION_TAG}-arm64\`" >> $GITHUB_STEP_SUMMARY echo "Helm chart:" >> $GITHUB_STEP_SUMMARY
echo "- \`dynamo-platform:${HELM_CHART_VERSION}\` (pushed to NGC Helm registry)" >> $GITHUB_STEP_SUMMARY
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment