# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Post-merge pipeline: runs on every push to main and to release/*.*.* branches.
# It builds and tests the framework images, builds the dev / EFA / frontend /
# operator images, deploys the operator into a vCluster for end-to-end tests,
# then cleans up and reports failures to Slack.
name: Post-Merge CI Pipeline

on:
  push:
    branches:
      - main
      - 'release/*.*.*'

# Least-privilege default; individual jobs widen this only where required.
permissions:
  contents: read

jobs:
  # ============================================================================
  # FRAMEWORK PIPELINES (Build → Test → Copy)
  # ============================================================================

  # ============================================================================
  # VLLM PIPELINE
  # ============================================================================
  vllm-pipeline:
    name: vllm
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: vllm
      target: runtime
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["12.9", "13.0"]'
      # Extra tags are only produced on main; on release branches both
      # expressions evaluate to the empty string.
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
        ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 120
      copy_timeout_minutes: 20
      cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
      single_gpu_test_markers: '(pre_merge or post_merge) and vllm and gpu_1'
      multi_gpu_test_markers: '(pre_merge or post_merge) and vllm and (gpu_2 or gpu_4)'
      cpu_only_test_timeout_minutes: 60
      single_gpu_test_timeout_minutes: 60
      multi_gpu_test_timeout_minutes: 60
    secrets: inherit

  # ============================================================================
  # SGLANG PIPELINE
  # ============================================================================
  sglang-pipeline:
    name: sglang
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: sglang
      target: runtime
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["12.9", "13.0"]'
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
        ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 120
      copy_timeout_minutes: 20
      cpu_only_test_markers: '(pre_merge or post_merge) and sglang and gpu_0'
      single_gpu_test_markers: '(pre_merge or post_merge) and sglang and gpu_1'
      multi_gpu_test_markers: '(pre_merge or post_merge) and sglang and (gpu_2 or gpu_4)'
      cpu_only_test_timeout_minutes: 60
      single_gpu_test_timeout_minutes: 60
      multi_gpu_test_timeout_minutes: 60
    secrets: inherit

  # ============================================================================
  # TRTLLM PIPELINE
  # ============================================================================
  trtllm-pipeline:
    name: trtllm
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: trtllm
      target: runtime
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["13.1"]'
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
        ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 120
      copy_timeout_minutes: 20
      cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
      single_gpu_test_markers: '(pre_merge or post_merge) and trtllm and gpu_1'
      multi_gpu_test_markers: '(pre_merge or post_merge) and trtllm and (gpu_2 or gpu_4)'
      cpu_only_test_timeout_minutes: 60
      single_gpu_test_timeout_minutes: 60
      multi_gpu_test_timeout_minutes: 60
    secrets: inherit

  # ============================================================================
  # DEV PIPELINES
  # ============================================================================
  vllm-dev-pipeline:
    name: vllm-dev
    uses: ./.github/workflows/build-flavor.yml
    with:
      framework: vllm
      target: dev
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["12.9", "13.0"]'
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 60
      push_image: true
      run_compliance_scan: true
      copy_to_acr: true
    secrets: inherit

  sglang-dev-pipeline:
    name: sglang-dev
    uses: ./.github/workflows/build-flavor.yml
    with:
      framework: sglang
      target: dev
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["12.9", "13.0"]'
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 60
      push_image: true
      run_compliance_scan: true
      copy_to_acr: true
    secrets: inherit

  trtllm-dev-pipeline:
    name: trtllm-dev
    uses: ./.github/workflows/build-flavor.yml
    with:
      framework: trtllm
      target: dev
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["13.1"]'
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 60
      push_image: true
      run_compliance_scan: true
      copy_to_acr: true
    secrets: inherit

  # ============================================================================
  # EFA PIPELINES (Build only, amd64)
  # ============================================================================

  # ============================================================================
  # VLLM EFA PIPELINE
  # ============================================================================
  vllm-efa-pipeline:
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: vllm
      target: runtime
      platform: 'linux/amd64'
      cuda_versions: '["12.9"]'
      make_efa: true
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-vllm-efa' || '' }}
        ${{ github.ref_name == 'main' && format('main-vllm-efa-{0}', github.sha) || '' }}
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 120
      copy_timeout_minutes: 20
      cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
      cpu_only_test_timeout_minutes: 60
      run_single_gpu_tests: false
      run_multi_gpu_tests: false
      copy_to_acr: false
    secrets: inherit

  # ============================================================================
  # TRTLLM EFA PIPELINE
  # ============================================================================
  trtllm-efa-pipeline:
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: trtllm
      target: runtime
      platform: 'linux/amd64'
      cuda_versions: '["13.1"]'
      make_efa: true
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-trtllm-efa' || '' }}
        ${{ github.ref_name == 'main' && format('main-trtllm-efa-{0}', github.sha) || '' }}
      builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
      build_timeout_minutes: 120
      copy_timeout_minutes: 20
      cpu_only_test_markers: '(pre_merge or post_merge) and trtllm and gpu_0'
      cpu_only_test_timeout_minutes: 60
      run_single_gpu_tests: false
      run_multi_gpu_tests: false
      copy_to_acr: false
    secrets: inherit

  # ============================================================================
  # FRONTEND IMAGE BUILD
  # ============================================================================
  frontend-image:
    name: Frontend Image
    uses: ./.github/workflows/build-frontend-image.yaml
    with:
      skip_change_detection: true
    secrets: inherit

  # ============================================================================
  # Operator
  # ============================================================================
  # Lints, tests, and builds the operator image, then pushes it to ECR and ACR.
  # Exposes the pushed tag as `operator_default_tag` for the deploy jobs.
  operator:
    name: Operator
    runs-on: prod-default-v2
    env:
      IMAGE_REGISTRY: ai-dynamo
      IMAGE_REPOSITORY: dynamo
      ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
    outputs:
      operator_default_tag: ${{ steps.build-and-push-image.outputs.operator_default_tag }}
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Initialize Dynamo Builder
        uses: ./.github/actions/init-dynamo-builder
        with:
          builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
          flavor: general
          arch: 'linux/amd64,linux/arm64'
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Linter
        shell: bash
        working-directory: ./deploy/operator
        run: |
          docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
      - name: Tester
        shell: bash
        working-directory: ./deploy/operator
        run: |
          docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ .
      - name: Set up Go
        uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00  # v6.0.0
        with:
          go-version: '1.25'
      - name: Set up Python
        # TODO: pin to a commit SHA like the other third-party actions in this file.
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies for operator codegen
        shell: bash
        working-directory: ./deploy/operator
        run: |
          python -m pip install --upgrade pip
          python -m pip install "pydantic>=2,<3" "black==23.1.0" "pyyaml>=6.0"
      - name: Check for uncommitted changes
        shell: bash
        working-directory: ./deploy/operator
        run: |
          make check
      - name: Build and push Container
        id: build-and-push-image
        shell: bash
        working-directory: ./deploy/operator
        env:
          NO_CACHE_FLAG: ''  # placeholder for future logic to add no cache flag if needed
          # Passed through the environment rather than interpolated into the
          # script so a crafted branch name cannot inject shell code.
          REF_NAME: ${{ github.ref_name }}
        run: |
          ECR_DEFAULT_IMAGE_BASE="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
          DEFAULT_TAG="${{ github.sha }}-operator"
          ACR_IMAGE_BASE="${{ secrets.AZURE_ACR_HOSTNAME }}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
          IMAGE_URIS=(
            "${ECR_DEFAULT_IMAGE_BASE}:${DEFAULT_TAG}"
            "${ACR_IMAGE_BASE}:${DEFAULT_TAG}"
          )
          # The floating main-operator tag is only published from main.
          if [[ "${REF_NAME}" == "main" ]]; then
            IMAGE_URIS+=(
              "${ECR_DEFAULT_IMAGE_BASE}:main-operator"
              "${ACR_IMAGE_BASE}:main-operator"
            )
          fi
          echo "operator_default_tag=${DEFAULT_TAG}" >> "$GITHUB_OUTPUT"
          TAGGING_FLAGS=$(printf -- '-t %s ' "${IMAGE_URIS[@]}")
          echo "flags for docker buildx: ${TAGGING_FLAGS}"
          if [[ "$NO_CACHE_FLAG" == "true" ]]; then
            NO_CACHE_FLAG="--no-cache"
          fi
          # NO_CACHE_FLAG and TAGGING_FLAGS are intentionally unquoted so they
          # word-split into separate arguments (or vanish when empty).
          docker buildx build --push ${NO_CACHE_FLAG} \
            --platform linux/amd64,linux/arm64 \
            --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
            ${TAGGING_FLAGS} -f Dockerfile .
          echo "### 🐳 Operator Container Images" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "| Image URI |" >> "$GITHUB_STEP_SUMMARY"
          echo "|-----|" >> "$GITHUB_STEP_SUMMARY"
          for image_uri in "${IMAGE_URIS[@]}"; do
            echo "| \`${image_uri}\` |" >> "$GITHUB_STEP_SUMMARY"
          done

  # ============================================================================
  # DEPLOYMENT JOBS
  # Deploy operator and run end-to-end tests on Kubernetes cluster
  # ============================================================================
  deploy-operator:
    needs: [operator]
    runs-on: prod-default-small-v2
    outputs:
      namespace: ${{ steps.setup.outputs.namespace }}
      vcluster_name: ${{ steps.setup.outputs.vcluster_name }}
      operator_tag: ${{ steps.setup.outputs.operator_tag }}
    steps:
      - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Setup vCluster and operator
        id: setup
        uses: ./.github/actions/setup-dynamo-operator
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          operator_tag: ${{ needs.operator.outputs.operator_default_tag }}
          hf_token: ${{ secrets.HF_TOKEN }}
          dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
          dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}

  deploy-test-vllm:
    needs: [deploy-operator, vllm-pipeline]
    uses: ./.github/workflows/shared-deploy-test-framework.yml
    with:
      framework: vllm
      profiles: '["agg", "agg_router", "disagg", "disagg_router"]'
      image_suffix: vllm-runtime-cuda12
      namespace: ${{ needs.deploy-operator.outputs.namespace }}
      vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
      operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
    secrets: inherit

  deploy-test-sglang:
    needs: [deploy-operator, sglang-pipeline]
    uses: ./.github/workflows/shared-deploy-test-framework.yml
    with:
      framework: sglang
      profiles: '["agg", "agg_router"]'
      image_suffix: sglang-runtime-cuda12
      namespace: ${{ needs.deploy-operator.outputs.namespace }}
      vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
      operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
    secrets: inherit

  deploy-test-trtllm:
    needs: [deploy-operator, trtllm-pipeline]
    uses: ./.github/workflows/shared-deploy-test-framework.yml
    with:
      framework: trtllm
      profiles: '["agg", "agg_router"]'
      image_suffix: trtllm-runtime-cuda13
      namespace: ${{ needs.deploy-operator.outputs.namespace }}
      vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
      operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
    secrets: inherit

  # Runs regardless of upstream results; the teardown step is skipped when
  # deploy-operator never reported a namespace / vCluster name.
  deploy-cleanup:
    if: always()
    needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie]
    runs-on: prod-default-small-v2
    steps:
      - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Teardown vCluster
        if: needs.deploy-operator.outputs.namespace != '' && needs.deploy-operator.outputs.vcluster_name != ''
        uses: ./.github/actions/teardown-dynamo-operator
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
          vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}

  # ============================================================================
  # GAIE DEPLOY TEST
  # ============================================================================
  deploy-test-gaie:
    name: GAIE Deploy Test
    runs-on: prod-default-small-v2
    needs: [deploy-operator, frontend-image, vllm-pipeline]
    timeout-minutes: 30
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Check if vCluster exists
        id: vcluster-check
        uses: ./.github/actions/check-vcluster-exists
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
          vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
      # Only runs when the existence check did not report the vCluster.
      - name: Self-bootstrap vCluster (rerun)
        if: steps.vcluster-check.outputs.exists != 'true'
        uses: ./.github/actions/setup-dynamo-operator
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
          vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
          registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
          hf_token: ${{ secrets.HF_TOKEN }}
          dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
          dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
      - name: Connect to vCluster
        id: connect-vcluster
        uses: ./.github/actions/connect-vcluster
        with:
          host_kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
          vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
      - name: Install GAIE prerequisites (Gateway API, kgateway, Inference Extension CRDs)
        shell: bash
        env:
          KUBECONFIG_B64: ${{ steps.connect-vcluster.outputs.kubeconfig_base64 }}
        run: |
          KUBECONFIG_FILE="${{ github.workspace }}/.kubeconfig_gaie"
          # Remove the decoded credentials even when the install script fails;
          # a trailing `rm` would be skipped because the step aborts on error.
          trap 'rm -f "${KUBECONFIG_FILE}"' EXIT
          echo "${KUBECONFIG_B64}" | base64 -d > "${KUBECONFIG_FILE}"
          export KUBECONFIG="${KUBECONFIG_FILE}"
          export NAMESPACE=default
          bash deploy/inference-gateway/scripts/install_gaie_crd_kgateway.sh
      - name: Run GAIE Deploy Test
        id: deploy-test
        uses: ./.github/actions/dynamo-deploy-test
        with:
          kubeconfig_base64: ${{ steps.connect-vcluster.outputs.kubeconfig_base64 }}
          namespace: default
          registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
          hf_token: ${{ secrets.HF_TOKEN }}
          test_name: gaie
          extra_pytest_args: >-
            -m framework_with_gaie
            --frontend-image=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-frontend
            --image=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-runtime-cuda12-amd64

  # Aggregated gate: passes only when every needed job's result is one of
  # success, skipped, or cancelled.
  deploy-status-check:
    runs-on: ubuntu-latest
    needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie]
    if: always()
    steps:
      - name: "Check all deploy test jobs"
        env:
          # Passed via the environment so quotes inside job outputs cannot
          # break out of the shell string.
          NEEDS_JSON: ${{ toJson(needs) }}
        run: |
          printf '%s' "${NEEDS_JSON}" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped", "cancelled"] | any($result == .))'

  # ============================================================================
  # CLEANUP JOBS
  # ============================================================================
  clean-k8s-builder:
    name: Clean K8s builder if exists
    runs-on: prod-default-small-v2
    if: always()
    needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline, vllm-efa-pipeline, trtllm-efa-pipeline, operator, frontend-image]
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Create K8s builders (skip bootstrap)
        uses: ./.github/actions/bootstrap-buildkit
        continue-on-error: true
        with:
          builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
          buildkit_worker_addresses: ''  # k8s builder
          skip_bootstrap: true
      - name: Builder Cleanup in case of k8s builder
        shell: bash
        run: |
          docker buildx rm b-${{ github.run_id }}-${{ github.run_attempt }} || true

  ############################## SLACK NOTIFICATION ##############################
  notify-slack:
    name: Notify Slack
    runs-on: prod-builder-amd-v1
    if: always() && failure()
    needs: [
      vllm-pipeline, sglang-pipeline, trtllm-pipeline,
      vllm-dev-pipeline, sglang-dev-pipeline, trtllm-dev-pipeline,
      vllm-efa-pipeline, trtllm-efa-pipeline,
      operator, frontend-image,
      deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm, deploy-test-gaie
    ]
    permissions:
      contents: read
      # Lets GITHUB_TOKEN list this run's jobs through the Actions REST API.
      actions: read
    steps:
      - name: Get Failed jobs
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          JOBS_JSON=$(mktemp)
          curl -sSL \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
            >$JOBS_JSON
          # One ":failed: <name>" entry per failed job, each ending in a literal
          # backslash-n that the double-quoted payload string below turns into
          # a line break.
          FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | .name | split(" / ") | if length > 2 then ":failed: " + .[0] + " > " + .[-1] else ":failed: " + .[-1] end | . + "\\n"' "$JOBS_JSON")
          echo $FAILED_JOBS
          # Multiline values must be exported with the NAME<<DELIMITER syntax.
          {
            echo "FAILED_JOBS<<EOF"
            echo "$FAILED_JOBS"
            echo "EOF"
          } >> "$GITHUB_ENV"
      - name: Notify Slack
        uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a  # v2.1.1
        with:
          webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
          webhook-type: incoming-webhook
          payload: |
            blocks:
              - type: "section"
                text:
                  type: mrkdwn
                  text: ":alert: *Github Post-merge Pipeline Failure*"
              - type: "section"
                text:
                  type: mrkdwn
                  text: "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View workflow run>"
              - type: "section"
                text:
                  type: mrkdwn
                  text: "${{ env.FAILED_JOBS }}"
              - type: "section"
                text:
                  type: mrkdwn
                  text: "@ops-support Please investigate the failures above."