# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Nightly pipeline:
#   1. build-*        build and push framework + runtime images per framework/arch
#   2. *-tests        pull the pushed nightly image and run pytest suites against it
#   3. results-summary / notify-slack   publish a per-job table and a Slack message
#
# Test jobs use `if: always()` and then query the Actions API for the conclusion
# of *their own* build job, so one framework's build failure does not skip the
# tests of the other frameworks.
name: Nightly CI pipeline

on:
  schedule:
    - cron: '0 8 * * *'  # Every day at 12:00 AM PST (08:00 UTC)
  workflow_dispatch:  # Allow manual triggering for testing

permissions:
  contents: read
  # Several jobs list this run's jobs through the Actions REST API.
  actions: read

defaults:
  run:
    shell: bash --noprofile --norc -eo pipefail {0}

env:
  REGISTRY_IMAGE: ai-dynamo/dynamo
  NIGHTLY_IMAGE_PREFIX: nightly

jobs:
  ############################## BUILD JOBS ##############################

  # NOTE: the `name:` of the build jobs is matched verbatim by the
  # "Check if build succeeded" steps below; keep both in sync.
  build-amd64:
    name: Build ${{ matrix.framework }} (amd64)
    runs-on: cpu-amd-m5-4xlarge
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        framework: [vllm, trtllm, sglang]
    env:
      ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
    steps:
      - uses: actions/checkout@v4

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      # Best effort: a missing image only means a colder build cache.
      - name: Pull existing images for cache
        shell: bash
        continue-on-error: true
        run: |
          echo "Attempting to pull existing images for layer caching..."
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64" || echo "Framework image not found in cache"
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64" || echo "Runtime image not found in cache"
          echo "Cache pull completed"

      - name: Build Framework Image
        id: build_framework
        uses: ./.github/actions/docker-build
        with:
          framework: ${{ matrix.framework }}
          target: framework
          platform: linux/amd64
          base_image_tag: ''
          runtime_image_tag: ''
          cuda_version: ''
          torch_backend: ''
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          image_tag: framework-${{ matrix.framework }}-amd64:${{ github.run_id }}

      # Framework images go to AWS only (azure_push is 'false').
      - name: Tag and Push Framework Images
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: framework-${{ matrix.framework }}-amd64:${{ github.run_id }}
          push_tags: |
            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64
            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-amd64-run-${{ github.run_id }}
          aws_push: 'true'
          azure_push: 'false'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}

      - name: Build Runtime Image
        id: build_runtime
        uses: ./.github/actions/docker-build
        with:
          framework: ${{ matrix.framework }}
          target: runtime
          platform: linux/amd64
          base_image_tag: ''
          runtime_image_tag: ''
          cuda_version: ''
          torch_backend: ''
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          image_tag: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}

      # Runtime images are the ones the test jobs pull; pushed to AWS and Azure.
      - name: Tag and Push Runtime Images
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: runtime-${{ matrix.framework }}-amd64:${{ github.run_id }}
          push_tags: |
            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64
            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-amd64-run-${{ github.run_id }}
          aws_push: 'true'
          azure_push: 'true'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

  build-arm64:
    name: Build ${{ matrix.framework }} (arm64)
    runs-on: cpu-arm-r8g-4xlarge
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        # Unlike amd64, arm64 passes explicit base image / CUDA settings per framework.
        include:
          - framework: vllm
            base_image_tag: '25.06-cuda12.9-devel-ubuntu24.04'
            runtime_image_tag: '12.9.0-runtime-ubuntu24.04'
            cuda_version: '12.9'
            torch_backend: 'cu129'
          - framework: trtllm
            base_image_tag: '25.06-py3'
            runtime_image_tag: ''
            cuda_version: '12.9'
            torch_backend: 'cu129'
          - framework: sglang
            base_image_tag: ''
            runtime_image_tag: ''
            cuda_version: ''
            torch_backend: ''
    env:
      ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
    steps:
      - uses: actions/checkout@v4

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

      # Best effort: a missing image only means a colder build cache.
      - name: Pull existing images for cache
        shell: bash
        continue-on-error: true
        run: |
          echo "Attempting to pull existing images for layer caching..."
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64" || echo "Framework image not found in cache"
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64" || echo "Runtime image not found in cache"
          echo "Cache pull completed"

      - name: Build Framework Image
        id: build_framework
        uses: ./.github/actions/docker-build
        with:
          framework: ${{ matrix.framework }}
          target: framework
          platform: linux/arm64
          base_image_tag: ${{ matrix.base_image_tag }}
          runtime_image_tag: ${{ matrix.runtime_image_tag }}
          cuda_version: ${{ matrix.cuda_version }}
          torch_backend: ${{ matrix.torch_backend }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          image_tag: framework-${{ matrix.framework }}-arm64:${{ github.run_id }}

      # Framework images go to AWS only (azure_push is 'false').
      - name: Tag and Push Framework Images
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: framework-${{ matrix.framework }}-arm64:${{ github.run_id }}
          push_tags: |
            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64
            ${{ env.REGISTRY_IMAGE }}:main-${{ matrix.framework }}-framework-arm64-run-${{ github.run_id }}
          aws_push: 'true'
          azure_push: 'false'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}

      - name: Build Runtime Image
        id: build_runtime
        uses: ./.github/actions/docker-build
        with:
          framework: ${{ matrix.framework }}
          target: runtime
          platform: linux/arm64
          base_image_tag: ${{ matrix.base_image_tag }}
          runtime_image_tag: ${{ matrix.runtime_image_tag }}
          cuda_version: ${{ matrix.cuda_version }}
          torch_backend: ${{ matrix.torch_backend }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          image_tag: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}

      # Runtime images are the ones the test jobs pull; pushed to AWS and Azure.
      - name: Tag and Push Runtime Images
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: runtime-${{ matrix.framework }}-arm64:${{ github.run_id }}
          push_tags: |
            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64
            ${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-arm64-run-${{ github.run_id }}
          aws_push: 'true'
          azure_push: 'true'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

  ############################## TEST JOBS ##############################

  unit-tests:
    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-unit
    needs: [build-amd64, build-arm64]
    if: always()
    runs-on: ${{ matrix.arch.runner }}
    timeout-minutes: 45
    strategy:
      fail-fast: false
      matrix:
        framework: [vllm, trtllm, sglang]
        arch:
          - arch: amd64
            runner: gpu-l40-amd64
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
    steps:
      - uses: actions/checkout@v4

      - name: Check if build succeeded
        id: check_build
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set +x
          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Determine which build job to check (must match the build job's `name:`)
          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Query GitHub API for job status using curl (token from env to avoid log exposure).
          # The call is wrapped in `if !` because the shell runs with `-e`: a bare
          # failing assignment would abort the step before the error is reported.
          if ! JOBS=$(curl -s -S -L --fail-with-body \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
            echo "Error: Failed to query GitHub API"
            exit 1
          fi

          # Find the specific build job and check its conclusion
          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"

          if [ "$BUILD_STATUS" != "success" ]; then
            echo "Build failed or did not complete successfully. Failing tests."
            exit 1
          fi
          echo "Build succeeded. Proceeding with tests."

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

      # Re-tag to the short local name that the pytest action receives as image_tag.
      - name: Pull nightly image
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        run: |
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}"
          docker tag "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}" "${IMAGE_TAG}"

      - name: Run Unit Tests
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
          pytest_marks: "unit and (nightly or post_merge or pre_merge)"
          framework: ${{ matrix.framework }}
          test_type: unit
          platform_arch: ${{ matrix.arch.arch }}
          cpu_limit: '8'
          # arm64 is passed dry_run 'true'; amd64 'false'.
          dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}

  integration-tests:
    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-integ
    needs: [build-amd64, build-arm64]
    if: always()
    runs-on: ${{ matrix.arch.runner }}
    timeout-minutes: ${{ matrix.arch.timeout }}
    strategy:
      fail-fast: false
      matrix:
        framework: [vllm, trtllm, sglang]
        arch:
          - arch: amd64
            runner: gpu-l40-amd64
            timeout: 90
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
            timeout: 90
    steps:
      - uses: actions/checkout@v4

      - name: Check if build succeeded
        id: check_build
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set +x
          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Must match the build job's `name:`
          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Wrapped in `if !` because the shell runs with `-e`: a bare failing
          # assignment would abort the step before the error is reported.
          if ! JOBS=$(curl -s -S -L --fail-with-body \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
            echo "Error: Failed to query GitHub API"
            exit 1
          fi

          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"

          if [ "$BUILD_STATUS" != "success" ]; then
            echo "Build failed or did not complete successfully. Marking tests as failed."
            exit 1
          fi
          echo "Build succeeded. Proceeding with tests."

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

      # Re-tag to the short local name that the pytest action receives as image_tag.
      - name: Pull nightly image
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        run: |
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}"
          docker tag "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}" "${IMAGE_TAG}"

      - name: Run Integration Tests
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
          pytest_marks: "integration and (nightly or post_merge or pre_merge)"
          framework: ${{ matrix.framework }}
          test_type: integration
          platform_arch: ${{ matrix.arch.arch }}
          # arm64 is passed dry_run 'true'; amd64 'false'.
          dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}

  e2e-single-gpu-tests:
    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-1gpu-e2e
    needs: [build-amd64, build-arm64]
    if: always()
    runs-on: ${{ matrix.arch.runner }}
    timeout-minutes: ${{ matrix.arch.timeout }}
    strategy:
      fail-fast: false
      matrix:
        framework: [vllm, trtllm, sglang]
        arch:
          - arch: amd64
            runner: gpu-l40-amd64
            timeout: 120
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
            timeout: 120
    steps:
      - uses: actions/checkout@v4

      - name: Check if build succeeded
        id: check_build
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set +x
          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Must match the build job's `name:`
          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Wrapped in `if !` because the shell runs with `-e`: a bare failing
          # assignment would abort the step before the error is reported.
          # An unknown build status fails the job rather than running tests blind.
          if ! JOBS=$(curl -s -S -L --fail-with-body \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
            echo "Error: Failed to query GitHub API"
            exit 1
          fi

          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"

          if [ "$BUILD_STATUS" != "success" ]; then
            echo "Build failed or did not complete successfully. Failing tests."
            exit 1
          fi
          echo "Build succeeded. Proceeding with tests."

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

      # Re-tag to the short local name that the pytest action receives as image_tag.
      - name: Pull nightly image
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        run: |
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}"
          docker tag "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}" "${IMAGE_TAG}"

      - name: Run E2E Tests (gpu_1)
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
          pytest_marks: "${{ matrix.framework }} and e2e and gpu_1"
          framework: ${{ matrix.framework }}
          test_type: e2e-single-gpu
          platform_arch: ${{ matrix.arch.arch }}
          # arm64 is passed dry_run 'true'; amd64 'false'.
          dry_run: ${{ matrix.arch.arch == 'arm64' && 'true' || 'false' }}

  e2e-multi-gpu-tests:
    name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-2gpu-e2e
    needs: [build-amd64, build-arm64]
    if: always()
    runs-on: ${{ matrix.arch.runner }}
    timeout-minutes: ${{ matrix.arch.timeout }}
    strategy:
      fail-fast: false
      matrix:
        framework: [vllm, trtllm, sglang]
        arch:
          - arch: amd64
            runner: gpu-l40-amd64
            timeout: 150
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
            timeout: 150
    steps:
      - uses: actions/checkout@v4

      - name: Check if build succeeded
        id: check_build
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set +x
          echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Must match the build job's `name:`
          BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"

          # Wrapped in `if !` because the shell runs with `-e`: a bare failing
          # assignment would abort the step before the error is reported.
          # An unknown build status fails the job rather than running tests blind.
          if ! JOBS=$(curl -s -S -L --fail-with-body \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
            echo "Error: Failed to query GitHub API"
            exit 1
          fi

          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"

          if [ "$BUILD_STATUS" != "success" ]; then
            echo "Build failed or did not complete successfully. Marking tests as failed."
            exit 1
          fi
          echo "Build succeeded. Proceeding with tests."

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

      # Re-tag to the short local name that the pytest action receives as image_tag.
      - name: Pull nightly image
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
        run: |
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}"
          docker tag "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}" "${IMAGE_TAG}"

      - name: Run E2E Tests (gpu_2)
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
          pytest_marks: "(nightly or post_merge or pre_merge) and e2e and gpu_2"
          framework: ${{ matrix.framework }}
          test_type: e2e-multi-gpu
          platform_arch: ${{ matrix.arch.arch }}
          # Passed dry_run 'true' for every architecture.
          dry_run: 'true'

  # ---------------------------------------------------------------------------
  # Disabled job, kept for reference. The disabled matrix previously listed each
  # of the six arch/component entries below three times; the repeats are
  # collapsed here because they would only have produced duplicate jobs.
  # ---------------------------------------------------------------------------
  # component-tests:
  #   name: ${{ matrix.framework }}-${{ matrix.arch.arch }}-${{ matrix.component }}
  #   needs: [build-amd64, build-arm64]
  #   if: always()
  #   runs-on: ${{ matrix.arch.runner }}
  #   timeout-minutes: ${{ matrix.arch.timeout }}
  #   strategy:
  #     fail-fast: false
  #     matrix:
  #       framework: [vllm, trtllm, sglang]
  #       arch:
  #         - arch: amd64
  #           runner: gpu-l40-amd64
  #           timeout: 90
  #           component: router
  #           marks: "nightly and router"
  #         - arch: amd64
  #           runner: gpu-l40-amd64
  #           timeout: 90
  #           component: planner
  #           marks: "nightly and planner"
  #         - arch: amd64
  #           runner: gpu-l40-amd64
  #           timeout: 150
  #           component: kvbm
  #           marks: "nightly and (kvbm or kvbm_v2)"
  #         - arch: arm64
  #           runner: cpu-arm-r8g-4xlarge
  #           timeout: 60
  #           component: router
  #           marks: "nightly and router"
  #         - arch: arm64
  #           runner: cpu-arm-r8g-4xlarge
  #           timeout: 60
  #           component: planner
  #           marks: "nightly and planner"
  #         - arch: arm64
  #           runner: cpu-arm-r8g-4xlarge
  #           timeout: 150
  #           component: kvbm
  #           marks: "nightly and (kvbm or kvbm_v2)"
  #   steps:
  #     - uses: actions/checkout@v4
  #     - name: Check if build succeeded
  #       id: check_build
  #       env:
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  #       run: |
  #         set +x
  #         echo "Checking build status for ${{ matrix.framework }} (${{ matrix.arch.arch }})"
  #         BUILD_JOB_NAME="Build ${{ matrix.framework }} (${{ matrix.arch.arch }})"
  #         if ! JOBS=$(curl -s -S -L --fail-with-body \
  #           -H "Authorization: Bearer ${GITHUB_TOKEN}" \
  #           -H "Accept: application/vnd.github.v3+json" \
  #           "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
  #           echo "Error: Failed to query GitHub API"
  #           exit 1
  #         fi
  #         BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
  #         echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"
  #         if [ "$BUILD_STATUS" != "success" ]; then
  #           echo "Build failed or did not complete successfully. Marking tests as failed."
  #           exit 1
  #         fi
  #         echo "Build succeeded. Proceeding with tests."
  #     - name: Login to Container Registries
  #       uses: ./.github/actions/docker-login
  #       with:
  #         aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
  #         aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
  #     - name: Pull nightly image
  #       shell: bash
  #       env:
  #         ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
  #         IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
  #       run: |
  #         docker pull ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}
  #         docker tag ${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG} ${IMAGE_TAG}
  #     - name: Run Component Tests (${{ matrix.component }})
  #       uses: ./.github/actions/pytest
  #       with:
  #         image_tag: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework }}-${{ matrix.arch.arch }}
  #         pytest_marks: "${{ matrix.marks }}"
  #         framework: ${{ matrix.framework }}
  #         test_type: component-${{ matrix.component }}
  #         platform_arch: ${{ matrix.arch.arch }}

  fault-tolerance-tests:
    name: ${{ matrix.framework.name }}-amd64-ft
    needs: [build-amd64]
    if: always()
    runs-on: cpu-amd-m5-2xlarge
    timeout-minutes: 180
    permissions:
      contents: read
      # Needed by the "Check if build succeeded" API call.
      actions: read
    strategy:
      fail-fast: false
      # Run matrix jobs sequentially to prevent a Helm race condition
      # Parallel jobs conflict on ClusterRole ownership when installing the chart.
      # Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
      max-parallel: 1
      matrix:
        framework:
          - { name: vllm, test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
          - { name: trtllm, test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
          - { name: sglang, test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
    env:
      DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
    steps:
      - uses: actions/checkout@v4

      - name: Check if build succeeded
        id: check_build
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set +x
          echo "Checking build status for ${{ matrix.framework.name }} (amd64)"

          # Must match the build job's `name:`
          BUILD_JOB_NAME="Build ${{ matrix.framework.name }} (amd64)"

          # Wrapped in `if !` because the shell runs with `-e`: a bare failing
          # assignment would abort the step before the error is reported.
          if ! JOBS=$(curl -s -S -L --fail-with-body \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100"); then
            echo "Error: Failed to query GitHub API"
            exit 1
          fi

          BUILD_STATUS=$(echo "$JOBS" | jq -r --arg job_name "$BUILD_JOB_NAME" '.jobs[] | select(.name == $job_name) | .conclusion')
          echo "Build status for '$BUILD_JOB_NAME': $BUILD_STATUS"

          if [ "$BUILD_STATUS" != "success" ]; then
            echo "Build failed or did not complete successfully. Failing tests."
            exit 1
          fi
          echo "Build succeeded. Proceeding with tests."

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}

      - name: Pull nightly image
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          IMAGE_TAG: ${{ env.NIGHTLY_IMAGE_PREFIX }}-${{ matrix.framework.name }}-amd64
        run: |
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}"
          docker tag "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${IMAGE_TAG}" "${IMAGE_TAG}"

      - name: Set namespace
        env:
          FRAMEWORK: ${{ matrix.framework.name }}
          # Secrets are passed through env instead of being pasted into the script text.
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        run: |
          # Values appended to $GITHUB_ENV only become visible in LATER steps, so
          # keep a shell-local copy for the kubectl call in this step.
          NAMESPACE="gh-nightly-${{ github.run_id }}-ft-${FRAMEWORK}"
          echo "NAMESPACE=${NAMESPACE}" >> "$GITHUB_ENV"

          # Setup kubeconfig (written before tracing is enabled)
          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig

          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
          kubectl config current-context

      - name: Deploy Operator
        env:
          # Secrets are passed through env instead of being pasted into the script text.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
          AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }}
          AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig

          # Create a namespace for this job
          echo "Creating an ephemeral namespace..."
          kubectl delete namespace "$NAMESPACE" || true
          kubectl create namespace "$NAMESPACE" || true
          echo "Attaching the labels for secrets and cleanup"
          kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

          # Set the namespace as default
          kubectl config set-context --current --namespace="$NAMESPACE"

          # Check if Istio is installed
          kubectl get pods -n istio-system
          # Check if default storage class exists
          kubectl get storageclass

          # Install Helm chart
          export VIRTUAL_ENV=/opt/dynamo/venv
          export KUBE_NS=$NAMESPACE
          export ISTIO_ENABLED=true
          export ISTIO_GATEWAY=istio-system/ingress-alb
          export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
          export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}

          # Install dynamo env secrets
          kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "$KUBE_NS" || true

          # Create docker pull secret for operator image
          kubectl create secret docker-registry docker-imagepullsecret \
            --docker-server="${AZURE_ACR_HOSTNAME}" \
            --docker-username="${AZURE_ACR_USER}" \
            --docker-password="${AZURE_ACR_PASSWORD}" \
            --namespace="${NAMESPACE}"

          # Pull operator image (using nightly tag for operator too)
          docker pull "${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-operator-amd64" || echo "Operator image not found, will use SHA-based tag"

          # Install helm dependencies
          helm repo add bitnami https://charts.bitnami.com/bitnami
          cd deploy/cloud/helm/platform/
          helm dep build .

          # Install platform with namespace restriction.
          # --set values containing [0] are quoted so the shell never globs them.
          helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
            --set dynamo-operator.namespaceRestriction.enabled=true \
            --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
            --set dynamo-operator.controllerManager.manager.image.repository="${AZURE_ACR_HOSTNAME}/ai-dynamo/dynamo" \
            --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
            --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
            --timeout 10m --wait

          # Wait for all deployments to be ready
          timeout 300s kubectl rollout status deployment -n "$NAMESPACE" --watch
          cd -

          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace="$NAMESPACE"

      # NOTE(review): continue-on-error means a failing test run does not fail this
      # job, and TEST_EXIT_CODE is exported but not read by any later step in this
      # file. Confirm this is intentional; otherwise add a final gating step.
      - name: Run Fault Tolerance Tests
        id: run-ft-tests
        continue-on-error: true
        env:
          FRAMEWORK: ${{ matrix.framework.name }}
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig
          export NAMESPACE=$NAMESPACE
          export IMAGE="${ECR_HOSTNAME}/${{ env.REGISTRY_IMAGE }}:${{ env.NIGHTLY_IMAGE_PREFIX }}-${FRAMEWORK}-amd64"

          echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
          echo "Using namespace: $NAMESPACE"
          echo "Using image: $IMAGE"

          # Install python3-venv package if not already installed
          sudo apt-get update && sudo apt-get install -y python3-venv

          # Set up Python virtual environment and install test dependencies
          python3 -m venv venv
          source venv/bin/activate
          pip install --upgrade pip
          pip install -r container/deps/requirements.test.txt
          pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic

          # Create test-results directory
          mkdir -p test-results

          # Run the pytest command with JUnit XML output
          set +e  # Don't exit on test failures
          pytest tests/fault_tolerance/deploy/test_deployment.py \
            -m 'k8s and fault_tolerance' \
            -k '${{ matrix.framework.test_scenario }}' \
            -s -v \
            --namespace "${NAMESPACE}" \
            --image "${IMAGE}" \
            --client-type legacy \
            --junitxml=test-results/pytest_ft_report.xml \
            --tb=short
          TEST_EXIT_CODE=$?

          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> "$GITHUB_ENV"
          echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"
          exit ${TEST_EXIT_CODE}

      - name: Process Fault Tolerance Test Results
        if: always()
        run: |
          set -x
          # Rename JUnit XML with unique naming if it exists
          if [ -f "test-results/pytest_ft_report.xml" ]; then
            mv "test-results/pytest_ft_report.xml" "test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml"
            echo "✅ JUnit XML report renamed with unique identifier"
          else
            echo "⚠️ JUnit XML report not found"
          fi

      - name: Upload Fault Tolerance Test Results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: test-results-${{ matrix.framework.name }}-fault_tolerance-amd64-${{ github.run_id }}-${{ job.check_run_id }}
          path: test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml
          retention-days: 7

      - name: Cleanup
        if: always()
        timeout-minutes: 5
        env:
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        run: |
          # Re-create the kubeconfig: earlier steps may not have run.
          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace="$NAMESPACE" --kubeconfig "${KUBECONFIG}"

          # For debugging purposes, list all the resources before we uninstall.
          # Diagnostics are best effort so they can never block the actual cleanup.
          kubectl get all || true

          echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
          kubectl delete dynamographdeployments --all -n "$NAMESPACE" || true

          # Uninstall the helm chart
          helm ls || true
          helm uninstall dynamo-platform || true

          echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
          kubectl delete namespace "$NAMESPACE" || true
          echo "Namespace $NAMESPACE completed."

  ############################## RESULTS SUMMARY ##############################

  results-summary:
    name: Results Summary
    runs-on: ubuntu-latest
    if: always()
    needs: [build-amd64, build-arm64, unit-tests, integration-tests, e2e-single-gpu-tests, e2e-multi-gpu-tests, fault-tolerance-tests]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Builds results.md: one markdown table row per job of this run.
      - name: Gather job metadata
        id: gather
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set +x -e
          echo "# Nightly CI Results Summary" > results.md
          echo "" >> results.md
          echo "| Stage | Status | Runner | Duration (min) | Artifacts |" >> results.md
          echo "|-------|--------|--------|----------------|-----------|" >> results.md

          curl -s -S -L --fail-with-body \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github.v3+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
            2>/dev/null | jq -c '.jobs[] | {id, name, runner_name, conclusion, started_at, completed_at}' > jobs.jsonl

          # IFS= / -r keep each JSON line intact (no backslash or whitespace mangling).
          while IFS= read -r job_entry; do
            job_id=$(echo "$job_entry" | jq -r '.id')
            name=$(echo "$job_entry" | jq -r '.name')
            runner=$(echo "$job_entry" | jq -r '.runner_name')
            status=$(echo "$job_entry" | jq -r '.conclusion')
            started=$(echo "$job_entry" | jq -r '.started_at')
            completed=$(echo "$job_entry" | jq -r '.completed_at')

            # Jobs without both timestamps (e.g. still running) show N/A.
            minutes="N/A"
            if [[ "$started" != "null" && "$completed" != "null" ]]; then
              start_epoch=$(date -d "$started" +%s)
              end_epoch=$(date -d "$completed" +%s)
              minutes=$(( (end_epoch - start_epoch)/60 ))
            fi

            artifact_link="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}#job-$job_id"
            printf "| %s | %s | %s | %s | [Log & Artifacts](%s) |\n" "$name" "$status" "$runner" "$minutes" "$artifact_link" >> results.md
          done < jobs.jsonl

          echo "" >> results.md
          echo "---" >> results.md

      - name: Display workflow summary
        run: cat results.md

      - name: Upload results summary as job summary
        run: cat results.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload results as artifact for Slack
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: nightly-results-summary
          path: results.md
          retention-days: 7

  ############################## SLACK NOTIFICATION ##############################

  notify-slack:
    name: Notify Slack
    runs-on: cpu-amd-m5-4xlarge
    # Scheduled runs only, and never from forks.
    if: always() && github.event_name == 'schedule' && !github.event.repository.fork
    needs: results-summary
    permissions:
      contents: read
      # Needed to list this run's jobs through the Actions REST API.
      actions: read
    env:
      # Secrets cannot be referenced in `if:` directly, so expose a boolean flag.
      HAS_SLACK_WEBHOOK: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL != '' }}
    steps:
      - name: Send Slack notification
        if: env.HAS_SLACK_WEBHOOK == 'true'
        continue-on-error: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        run: |
          set -euo pipefail

          JOBS_JSON=$(mktemp)
          trap 'rm -f "$JOBS_JSON"' EXIT

          # -f: turn HTTP errors into a curl failure instead of feeding an error body to jq.
          if ! curl -sSfL \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
            > "$JOBS_JSON"; then
            echo "Error: Failed to fetch job data from GitHub API"
            exit 1
          fi

          if [ ! -s "$JOBS_JSON" ]; then
            echo "Error: No job data received"
            exit 1
          fi

          # Only completed jobs are counted: this notification job is itself still
          # running and would otherwise inflate the total by one.
          # Cancelled and timed-out jobs are reported as failures, not as success.
          IS_FAILED='(.conclusion == "failure" or .conclusion == "cancelled" or .conclusion == "timed_out")'

          TOTAL_JOBS=$(jq '[.jobs[] | select(.status == "completed")] | length' "$JOBS_JSON")
          SUCCESS_COUNT=$(jq '[.jobs[] | select(.conclusion == "success")] | length' "$JOBS_JSON")
          FAILED_COUNT=$(jq "[.jobs[] | select(${IS_FAILED})] | length" "$JOBS_JSON")

          if [ "$FAILED_COUNT" -eq 0 ]; then
            STATUS="Success ✅"
          else
            STATUS="Failed ❌"
          fi

          # Main message with summary
          SUMMARY_TEXT="*Nightly CI Pipeline - ${STATUS}*"$'\n'"Summary: ${SUCCESS_COUNT}/${TOTAL_JOBS} jobs passed"$'\n'"<${RUN_URL}|View Workflow Summary>"

          if [ "$FAILED_COUNT" -eq 0 ]; then
            # Success - simple message
            PAYLOAD=$(jq -n \
              --arg text "$SUMMARY_TEXT" \
              '{text: $text}')
          else
            # Failed - message with blocks
            FAILED_JOBS=$(jq -r ".jobs[] | select(${IS_FAILED}) | \"• \" + .name" "$JOBS_JSON")
            FAILED_JOBS_TEXT="*Failed Jobs (${FAILED_COUNT}):*"$'\n'"${FAILED_JOBS}"

            PAYLOAD=$(jq -n \
              --arg summary "$SUMMARY_TEXT" \
              --arg failed "$FAILED_JOBS_TEXT" \
              '{
                text: $summary,
                blocks: [
                  {
                    type: "section",
                    text: {
                      type: "mrkdwn",
                      text: $summary
                    }
                  },
                  {
                    type: "section",
                    text: {
                      type: "mrkdwn",
                      text: $failed
                    }
                  }
                ]
              }')
          fi

          if curl -sSf -X POST -H "Content-Type: application/json" -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"; then
            echo "Slack notification sent successfully"
          else
            echo "Warning: Failed to send Slack notification"
            exit 1
          fi