# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# PR workflow: detects which areas changed, builds and tests the operator and
# the per-framework containers, deploys the operator to an ephemeral Kubernetes
# namespace, runs deployment tests against it, and finally cleans up.

name: PR

on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      # Note: release/* branches are handled by release.yml workflow
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is declared but not referenced by any job in
      # this file - confirm whether deploy-operator should be gated on it.
      run_deploy_operator:
        description: 'Run deploy operator and deployment tests'
        required: false
        type: boolean
        default: false

concurrency:
  # The group name is a ternary operation. If the ref_name is 'main',
  # then the group name uses the run_id to ensure a unique group for
  # 'main' pushes. Otherwise, the group name is the ref_name, so that
  # workflows on the same PR/branch have the same group name for cancelling.
  group: docker-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

env:
  BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}

jobs:
  # ============================================================================
  # SETUP & DETECTION JOBS
  # ============================================================================

  # Computes the per-area "changed" flags consumed by the `if:` conditions of
  # the downstream jobs, and re-exports the builder name as a job output.
  changed-files:
    runs-on: ubuntu-latest
    environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }}
    outputs:
      core: ${{ steps.changes.outputs.core }}
      operator: ${{ steps.changes.outputs.operator }}
      deploy: ${{ steps.changes.outputs.deploy }}
      vllm: ${{ steps.changes.outputs.vllm }}
      sglang: ${{ steps.changes.outputs.sglang }}
      trtllm: ${{ steps.changes.outputs.trtllm }}
      builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
        with:
          fetch-depth: 0
      - name: Check for changes
        id: changes
        uses: ./.github/actions/changed-files
        with:
          gh_token: ${{ github.token }}
      - name: Export builder name
        id: export-builder-name
        run: |
          # BUILDER_NAME comes from the workflow-level `env` block.
          echo "builder_name=${BUILDER_NAME}" >> "$GITHUB_OUTPUT"

  # Aggregate gate: succeeds only if every job in `needs` ended as
  # "success" or "skipped".
  backend-status-check:
    runs-on: ubuntu-latest
    needs: [changed-files, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator] # THIS list determines blocking jobs
    if: always()
    steps:
      - name: "Check all dependent jobs"
        env:
          # Passed through the environment instead of being pasted into the
          # script, so quotes inside job outputs cannot break the shell command.
          NEEDS_JSON: ${{ toJson(needs) }}
        run: |
          echo "${NEEDS_JSON}" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'

  # Same aggregate gate as backend-status-check, for the deployment jobs.
  deploy-status-check:
    runs-on: ubuntu-latest
    needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
    if: always()
    steps:
      - name: "Check all deploy test jobs"
        env:
          NEEDS_JSON: ${{ toJson(needs) }}
        run: |
          echo "${NEEDS_JSON}" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'

  # ============================================================================
  # Operator
  # ============================================================================

  # Lints, tests and builds the operator image from ./deploy/operator and
  # pushes it to the ECR and ACR registries. Runs only when operator files
  # changed.
  operator:
    needs: changed-files
    if: needs.changed-files.outputs.operator == 'true'
    name: Operator
    runs-on: prod-default-v2
    env:
      IMAGE_REGISTRY: ai-dynamo
      IMAGE_REPOSITORY: dynamo
      ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
    outputs:
      operator_default_tag: ${{ steps.build-and-push-image.outputs.operator_default_tag }}
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      - name: Initialize Dynamo Builder
        uses: ./.github/actions/init-dynamo-builder
        with:
          builder_name: ${{ needs.changed-files.outputs.builder_name }}
          flavor: general
          all_arch: 'true'
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Linter
        shell: bash
        working-directory: ./deploy/operator
        run: |
          docker buildx build --platform linux/arm64 --target linter --progress=plain --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" .
      - name: Tester
        shell: bash
        working-directory: ./deploy/operator
        run: |
          docker buildx build --platform linux/arm64 --target tester --progress=plain --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" .
      - name: Set up Go
        uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
        with:
          go-version: '1.25'
      # NOTE(review): unlike the other third-party actions in this file, this
      # one is referenced by a mutable tag - consider pinning to a commit SHA.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies for operator codegen
        shell: bash
        working-directory: ./deploy/operator
        run: |
          python -m pip install --upgrade pip
          python -m pip install "pydantic>=2,<3" "black==23.1.0" "pyyaml>=6.0"
      - name: Check for uncommitted changes
        shell: bash
        working-directory: ./deploy/operator
        run: |
          make check
      - name: Build and push Container
        id: build-and-push-image
        shell: bash
        working-directory: ./deploy/operator
        env:
          NO_CACHE_FLAG: '' # placeholder for future logic to add no cache flag if needed
          # Context values are handed over as environment variables rather
          # than interpolated into the script text.
          ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
          REF_NAME: ${{ github.ref_name }}
          COMMIT_SHA: ${{ github.sha }}
        run: |
          ECR_DEFAULT_IMAGE_BASE="${ECR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
          DEFAULT_TAG="${COMMIT_SHA}-operator"
          ACR_IMAGE_BASE="${ACR_HOSTNAME}/${IMAGE_REGISTRY}/${IMAGE_REPOSITORY}"
          IMAGE_URIS=(
            "${ECR_DEFAULT_IMAGE_BASE}:${DEFAULT_TAG}"
            "${ACR_IMAGE_BASE}:${DEFAULT_TAG}"
          )
          # Pushes to main additionally move the floating `main-operator` tag.
          if [[ "${REF_NAME}" == "main" ]]; then
            IMAGE_URIS+=(
              "${ECR_DEFAULT_IMAGE_BASE}:main-operator"
              "${ACR_IMAGE_BASE}:main-operator"
            )
          fi
          echo "operator_default_tag=${DEFAULT_TAG}" >> "$GITHUB_OUTPUT"

          TAGGING_FLAGS=$(printf -- '-t %s ' "${IMAGE_URIS[@]}")
          echo "flags for docker buildx: ${TAGGING_FLAGS}"
          if [[ "$NO_CACHE_FLAG" == "true" ]]; then
            NO_CACHE_FLAG="--no-cache"
          fi
          # NO_CACHE_FLAG and TAGGING_FLAGS are intentionally unquoted: they
          # must expand to zero or more separate arguments.
          docker buildx build --push ${NO_CACHE_FLAG} \
            --platform linux/amd64,linux/arm64 \
            --build-arg "DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/" \
            ${TAGGING_FLAGS} -f Dockerfile .

          echo "### 🐳 Operator Container Images" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "| Image URI |" >> "$GITHUB_STEP_SUMMARY"
          echo "|-----|" >> "$GITHUB_STEP_SUMMARY"
          for image_uri in "${IMAGE_URIS[@]}"; do
            echo "| \`${image_uri}\` |" >> "$GITHUB_STEP_SUMMARY"
          done

  # ============================================================================
  # FRAMEWORK PIPELINES (Build → Test → Copy)
  # ============================================================================

  # ============================================================================
  # VLLM PIPELINE
  # ============================================================================
  vllm-pipeline:
    needs: [changed-files]
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true'
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: vllm
      target: runtime
      platforms: '["amd64", "arm64"]'
      cuda_versions: '["12.9", "13.0"]'
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
        ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
      builder_name: ${{ needs.changed-files.outputs.builder_name }}
      build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
      copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }}
      cpu_only_test_markers: 'pre_merge and vllm and gpu_0'
      single_gpu_test_markers: 'pre_merge and vllm and gpu_1'
      single_gpu_test_timeout_minutes: 35
      run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
    secrets: inherit

  # ============================================================================
  # SGLANG PIPELINE
  # ============================================================================
  sglang-pipeline:
    needs: [changed-files]
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true'
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: sglang
      target: runtime
      platforms: '["amd64", "arm64"]'
      cuda_versions: '["12.9", "13.0"]'
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
        ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
      builder_name: ${{ needs.changed-files.outputs.builder_name }}
      build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
      copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }}
      cpu_only_test_markers: 'pre_merge and sglang and gpu_0'
      single_gpu_test_markers: 'pre_merge and sglang and gpu_1'
      run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
    secrets: inherit

  # ============================================================================
  # TRTLLM PIPELINE
  # ============================================================================
  trtllm-pipeline:
    needs: [changed-files]
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true'
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    with:
      framework: trtllm
      target: runtime
      platforms: '["amd64", "arm64"]'
      cuda_versions: '["13.1"]'
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
        ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
      builder_name: ${{ needs.changed-files.outputs.builder_name }}
      build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
      copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }}
      cpu_only_test_markers: 'pre_merge and trtllm and gpu_0'
      single_gpu_test_markers: 'pre_merge and trtllm and gpu_1'
      run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
    secrets: inherit

  # ============================================================================
  # DEPLOYMENT JOBS
  # Deploy operator and run end-to-end tests on Kubernetes cluster
  # ============================================================================

  # Creates an ephemeral namespace, installs the dynamo-platform Helm chart
  # into it and exposes the namespace name to the deploy-test and cleanup jobs.
  deploy-operator:
    runs-on: prod-default-small-v2
    # Run when any deploy test will run: if core, any framework, or deploy files changed
    if: |
      always() &&
      (needs.changed-files.outputs.core == 'true' ||
       needs.changed-files.outputs.vllm == 'true' ||
       needs.changed-files.outputs.sglang == 'true' ||
       needs.changed-files.outputs.trtllm == 'true' ||
       needs.changed-files.outputs.deploy == 'true') &&
      (needs.operator.result == 'success' || needs.operator.result == 'skipped')
    needs: [changed-files, operator]
    outputs:
      NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
    steps:
      # Pinned to the same commit as every other checkout in this workflow.
      - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      - name: Determine operator image tag
        id: operator-tag
        run: |
          # Use the image built by this run when the operator job ran;
          # otherwise fall back to the floating main-operator tag.
          if [ "${{ needs.operator.result }}" == "success" ]; then
            echo "tag=${{ needs.operator.outputs.operator_default_tag }}" >> "$GITHUB_OUTPUT"
            echo "Using newly built operator image: ${{ needs.operator.outputs.operator_default_tag }}"
          else
            echo "tag=main-operator" >> "$GITHUB_OUTPUT"
            echo "Using stable operator image: main-operator"
          fi
      - name: Deploy Operator
        id: deploy-operator-step
        env:
          BRANCH: ${{ github.ref_name }}
          # Secrets and step outputs are passed via the environment instead of
          # being interpolated into the script text.
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
          OPERATOR_TAG: ${{ steps.operator-tag.outputs.tag }}
        run: |
          set -x

          # Set namespace
          # Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
          BRANCH_SANITIZED="${BRANCH//\//-}"
          BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
          BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
          # Cap at 10 chars
          BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
          NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
          echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"

          # Setup kubeconfig
          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig
          export KUBECONFIG="$(pwd)/.kubeconfig"
          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
          kubectl config current-context

          # Create a namespace for this job
          echo "Creating an ephemeral namespace..."
          kubectl create namespace "${NAMESPACE}"
          echo "Attaching the labels for secrets and cleanup"
          kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

          # Set the namespace as default
          kubectl config set-context --current --namespace="${NAMESPACE}"

          # Check if Istio is installed
          kubectl get pods -n istio-system
          # Check if default storage class exists
          kubectl get storageclass

          # Install Helm chart
          export VIRTUAL_ENV=/opt/dynamo/venv
          export KUBE_NS="${NAMESPACE}"
          export ISTIO_ENABLED=true
          export ISTIO_GATEWAY=istio-system/ingress-alb
          export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true

          # Install dynamo env secrets
          kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "${KUBE_NS}" || true

          # Install helm dependencies
          helm repo add bitnami https://charts.bitnami.com/bitnami
          cd deploy/helm/charts/platform/
          helm dep build .

          # Install platform with namespace restriction for single profile testing
          # we manage crds via Velonix so we skip the crds installation
          # The allowedNamespaces[0] argument is quoted so the shell does not
          # treat "[0]" as a glob pattern.
          helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
            --skip-crds \
            --set dynamo-operator.namespaceRestriction.enabled=true \
            --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
            --set dynamo-operator.controllerManager.manager.image.repository="${ACR_HOSTNAME}/ai-dynamo/dynamo" \
            --set dynamo-operator.controllerManager.manager.image.tag="${OPERATOR_TAG}" \
            --set dynamo-operator.gpuDiscovery.enabled=false \
            --set dynamo-operator.upgradeCRD=false \
            --debug

          # Wait for all deployments to be ready
          timeout 300s kubectl rollout status deployment -n "${NAMESPACE}" --watch
      - name: 🔍 Report Unhealthy Pods
        if: failure()
        env:
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          # Shell variables do not survive across steps, so the namespace is
          # re-read from the previous step's output.
          NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
        run: |
          # Setup kubeconfig
          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig
          export KUBECONFIG="$(pwd)/.kubeconfig"
          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
          kubectl config current-context

          # Descriptive header for the summary
          echo "### ⚠️ OPERATOR DEPLOYMENT FAILED: Unhealthy Pods Report" >> "$GITHUB_STEP_SUMMARY"
          echo "Unhealthy pods:" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"

          # Get pods, exclude healthy ones, and format output
          # If the namespace is empty or all pods are healthy, the grep/awk won't output anything, which is handled gracefully.
          kubectl get pods -n "${NAMESPACE}" --no-headers \
            | grep -v -E '(Running|Completed)' \
            | awk '{print "- 🔴 **" $1 "** | Status: `" $3 "`"}' >> "$GITHUB_STEP_SUMMARY" || true

  # ============================================================================
  # DEPLOY TESTS
  # End-to-end tests for each framework with various deployment profiles
  # ============================================================================
  deploy-test-vllm:
    # Run if core, vllm, or deploy is changed
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
    runs-on: prod-default-small-v2
    needs: [changed-files, deploy-operator, vllm-pipeline]
    timeout-minutes: 25
    permissions:
      contents: read
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        profile:
          - agg
          - agg_router
          - disagg
          - disagg_router
    name: deploy-test-vllm (${{ matrix.profile }})
    env:
      FRAMEWORK: vllm
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      - name: Run Dynamo Deploy Test
        id: deploy-test
        uses: ./.github/actions/dynamo-deploy-test
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
          deployment_file: "deploy/${{ matrix.profile }}.yaml"
          framework: ${{ env.FRAMEWORK }}
          profile: ${{ matrix.profile }}
          image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda12-amd64
          platform_arch: amd64

  deploy-test-sglang:
    runs-on: prod-default-small-v2
    # Run if core, sglang, or deploy is changed
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true'
    needs: [changed-files, deploy-operator, sglang-pipeline]
    timeout-minutes: 25
    permissions:
      contents: read
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        profile:
          - agg
          - agg_router
    name: deploy-test-sglang (${{ matrix.profile }})
    env:
      FRAMEWORK: sglang
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      - name: Run Dynamo Deploy Test
        id: deploy-test
        uses: ./.github/actions/dynamo-deploy-test
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
          deployment_file: "deploy/${{ matrix.profile }}.yaml"
          framework: ${{ env.FRAMEWORK }}
          profile: ${{ matrix.profile }}
          image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda12-amd64
          platform_arch: amd64

  deploy-test-trtllm:
    runs-on: prod-default-small-v2
    # Run if core, trtllm, or deploy is changed
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true'
    needs: [changed-files, deploy-operator, trtllm-pipeline]
    timeout-minutes: 25
    permissions:
      contents: read
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        profile:
          - agg
          - agg_router
          # Disabled: trtllm disagg profiles consistently timeout (~32 min) with 0% success rate.
          # Re-enable once the underlying disagg deployment issue is resolved.
          # - disagg
          # - disagg_router
    name: deploy-test-trtllm (${{ matrix.profile }})
    env:
      FRAMEWORK: trtllm
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      - name: Run Dynamo Deploy Test
        id: deploy-test
        uses: ./.github/actions/dynamo-deploy-test
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
          deployment_file: "deploy/${{ matrix.profile }}.yaml"
          framework: ${{ env.FRAMEWORK }}
          profile: ${{ matrix.profile }}
          image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-cuda13-amd64
          platform_arch: amd64

  # ============================================================================
  # CLEANUP JOBS
  # Clean up ephemeral Kubernetes namespace and resources
  # ============================================================================

  # Removes the buildx builder named by changed-files once all build jobs are done.
  clean-k8s-builder:
    name: Clean K8s builder if exists
    runs-on: prod-default-small-v2
    if: always()
    needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator, changed-files]
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      - name: Create K8s builders (skip bootstrap)
        uses: ./.github/actions/bootstrap-buildkit
        continue-on-error: true
        with:
          builder_name: ${{ needs.changed-files.outputs.builder_name }}
          buildkit_worker_addresses: '' # k8s builder
          skip_bootstrap: true
      - name: Builder Cleanup in case of k8s builder
        shell: bash
        run: |
          docker buildx rm "${{ needs.changed-files.outputs.builder_name }}" || true

  # Deletes the DynamoGraphDeployments, the Helm release and the ephemeral
  # namespace created by deploy-operator. Runs regardless of earlier results.
  cleanup:
    name: Cleanup AKS resources
    runs-on: prod-default-small-v2
    if: always()
    needs: [deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm]
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
      - name: Setup Kubeconfig
        env:
          NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        run: |
          set -x
          # Setup kubeconfig
          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig
          export KUBECONFIG="$(pwd)/.kubeconfig"
          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
          kubectl config current-context
      - name: Cleanup
        # NOTE(review): this step limit is shorter than the 10m passed to
        # `helm uninstall --timeout` below - confirm which one is intended.
        timeout-minutes: 5
        env:
          NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        run: |
          set -x

          # deploy-operator may have been skipped or may have failed before it
          # published a namespace. With an empty value the commands below would
          # mis-parse their arguments, so stop here.
          if [[ -z "${NAMESPACE}" ]]; then
            echo "No namespace was published by deploy-operator; nothing to clean up."
            exit 0
          fi

          # Setup kubeconfig
          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig
          export KUBECONFIG="$(pwd)/.kubeconfig"
          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"

          # For debugging purposes, list all the resources before we uninstall.
          # Best-effort: a failed listing must not prevent the deletion below.
          kubectl get dynamographdeployments || true
          kubectl get all || true

          echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
          kubectl delete dynamographdeployments --all -n "${NAMESPACE}" || true

          # Uninstall the helm chart
          helm ls || true
          helm uninstall dynamo-platform --namespace "${NAMESPACE}" --timeout 10m || true

          echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
          kubectl delete namespace "${NAMESPACE}" || true
          echo "Namespace $NAMESPACE completed."