# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

name: Docker Build and Test

# Triggers: pushes to main, pull-request/<number> branches and release
# branches, plus manual dispatch with an opt-in switch for the deploy jobs.
on:
  workflow_dispatch:
    inputs:
      run_deploy_operator:
        type: boolean
        default: false
        required: false
        description: "Run deploy operator and deployment tests"
  push:
    branches:
      - "main"
      - 'pull-request/[0-9]+'
      - "release/*.*.*"

concurrency:
  # Pushes to 'main' get a per-run group (keyed by run_id), so they never
  # share a group with one another. Every other ref uses its ref_name as the
  # group, so runs on the same PR/branch land in the same group and the
  # in-progress one is cancelled.
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
  group: docker-build-test-${{ github.ref_name == 'main' && github.run_id || github.ref_name }}

jobs:
  # Evaluates the path filters in .github/filters.yaml and exposes the
  # has_code_changes output that the build jobs check in their `if:`.
  changed-files:
    environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }}
    runs-on: ubuntu-latest
    outputs:
      has_code_changes: ${{ steps.filter.outputs.has_code_changes }}
    steps:
      - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
        name: Checkout code
      - id: filter
        name: Check for changes
        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # v3.0.2
        with:
          filters: .github/filters.yaml

  # Aggregate gate: passes only when every job listed in `needs` finished
  # with result "success" or "skipped".
  backend-status-check:
    runs-on: ubuntu-latest
    needs: [vllm, sglang, trtllm, operator]
    # always() so the gate is still evaluated when a dependency failed or was skipped.
    if: always()
    steps:
      - name: "Check all dependent jobs"
        env:
          # Hand the JSON to the script through the environment instead of
          # splicing it into the shell source, so quotes or other shell
          # metacharacters inside it cannot change the command being run.
          NEEDS_JSON: ${{ toJson(needs) }}
        run: |
          printf '%s' "$NEEDS_JSON" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'

  # Lints, tests and builds the operator image once per architecture, then
  # hands the built image to the docker-tag-push action.
  operator:
    name: operator (${{ matrix.platform.arch }})
    needs: changed-files
    if: needs.changed-files.outputs.has_code_changes == 'true'
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      fail-fast: false
      matrix:
        platform:
          - arch: amd64
            runner: cpu-amd-m5-2xlarge
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
    steps:
      - name: Output Node Name
        shell: bash
        run: echo ${K8S_NODE_NAME}
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          driver: docker
      - name: Login to ECR
        uses: ./.github/actions/docker-login
        with:
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      # "linter" and "tester" are build targets of the operator Dockerfile.
      - name: Linter
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          cd deploy/cloud/operator
          docker build \
            --target linter \
            --progress=plain \
            --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
            .
      - name: Tester
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          cd deploy/cloud/operator
          docker build \
            --target tester \
            --progress=plain \
            --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
            .

      - name: Set up Go
        uses: actions/setup-go@44694675825211faa026b3c33043df3e48a5fa00 # v6.0.0
        with:
          go-version: "1.24"
      # Runs the operator's `make check` target after installing make.
      - name: Check for uncommitted changes
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          sudo apt-get update && sudo apt-get install -y make
          cd deploy/cloud/operator
          make check
      # Builds the image for this matrix architecture and loads it into the
      # local Docker image store under the tag dynamo-operator:latest.
      - name: Build Container
        id: build-image
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          cd deploy/cloud/operator
          docker buildx build --load \
            --platform linux/${{ matrix.platform.arch }} \
            --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \
            -f Dockerfile \
            -t dynamo-operator:latest \
            .
      - name: Docker Tag and Push
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: dynamo-operator:latest
          push_tags: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }}
          azure_push: "true"
          aws_push: "false"
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}

  # Builds the vllm runtime image per architecture, pushes it tagged with the
  # commit SHA, and runs the pre-merge pytest suite on every arch except arm64.
  vllm:
    name: vllm (${{ matrix.platform.arch }})
    needs: changed-files
    if: needs.changed-files.outputs.has_code_changes == 'true'
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      fail-fast: false
      matrix:
        platform:
          - arch: amd64
            runner: gpu-l40-amd64
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
    steps:
      - name: Output Node Name
        shell: bash
        run: echo ${K8S_NODE_NAME}
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Build Container
        id: build-image
        uses: ./.github/actions/docker-build
        with:
          framework: vllm
          target: runtime
          platform: linux/${{ matrix.platform.arch }}
          # The four overrides below are only set for arm64; on other
          # architectures they are passed as empty strings.
          base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
          runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
          cuda_version: ${{ matrix.platform.arch == 'arm64' && '129' || '' }}
          torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
          ci_token: ${{ secrets.CI_TOKEN }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Docker Tag and Push
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
          push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
          azure_push: "true"
          # OPS-1145: Switch aws_push to true
          aws_push: "false"
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}

      - name: Run tests
        if: matrix.platform.arch != 'arm64'
        uses: ./.github/actions/pytest
        with:
          framework: vllm
          test_type: pre_merge
          pytest_marks: 'pre_merge and vllm'
          platform_arch: ${{ matrix.platform.arch }}
          image_tag: ${{ steps.build-image.outputs.image_tag }}

  # Builds the sglang runtime image per architecture, pushes it tagged with the
  # commit SHA, and runs the pre-merge pytest suite on every arch except arm64.
  sglang:
    name: sglang (${{ matrix.platform.arch }})
    needs: changed-files
    if: needs.changed-files.outputs.has_code_changes == 'true'
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      fail-fast: false
      matrix:
        platform:
          - arch: amd64
            runner: gpu-l40-amd64
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
    steps:
      - name: Output Node Name
        shell: bash
        run: echo ${K8S_NODE_NAME}
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

      - name: Build Container
        id: build-image
        uses: ./.github/actions/docker-build
        with:
          framework: sglang
          target: runtime
          platform: linux/${{ matrix.platform.arch }}
          ci_token: ${{ secrets.CI_TOKEN }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Docker Tag and Push
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
          push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
          azure_push: "true"
          # OPS-1145: Switch aws_push to true
          aws_push: "false"
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}

      - name: Run tests
        if: matrix.platform.arch != 'arm64'
        uses: ./.github/actions/pytest
        with:
          framework: sglang
          test_type: pre_merge
          pytest_marks: 'pre_merge and sglang'
          platform_arch: ${{ matrix.platform.arch }}
          image_tag: ${{ steps.build-image.outputs.image_tag }}

  # Builds the trtllm runtime image per architecture, pushes it tagged with the
  # commit SHA, and runs the pre-merge pytest suite on every arch except arm64.
  trtllm:
    name: trtllm (${{ matrix.platform.arch }})
    needs: changed-files
    if: needs.changed-files.outputs.has_code_changes == 'true'
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      fail-fast: false
      matrix:
        platform:
          - arch: amd64
            runner: gpu-l40-amd64
          - arch: arm64
            runner: cpu-arm-r8g-4xlarge
    steps:
      - name: Output Node Name
        shell: bash
        run: echo ${K8S_NODE_NAME}
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

      - name: Build Container
        id: build-image
        uses: ./.github/actions/docker-build
        with:
          framework: trtllm
          target: runtime
          platform: linux/${{ matrix.platform.arch }}
          ci_token: ${{ secrets.CI_TOKEN }}
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

      - name: Login to Container Registries
        uses: ./.github/actions/docker-login
        with:
          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Docker Tag and Push
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
          push_tags: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
          azure_push: "true"
          # OPS-1145: Switch aws_push to true
          aws_push: "false"
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}

      - name: Run tests
        if: matrix.platform.arch != 'arm64'
        uses: ./.github/actions/pytest
        with:
          framework: trtllm
          test_type: pre_merge
          pytest_marks: 'pre_merge and trtllm'
          platform_arch: ${{ matrix.platform.arch }}
          image_tag: ${{ steps.build-image.outputs.image_tag }}

  # Installs the platform Helm chart into a per-run namespace and runs the
  # Kubernetes fault-tolerance pytest scenario for each framework, one
  # framework at a time.
  deploy-test-fault-tolerance:
    runs-on: cpu-amd-m5-2xlarge
    if: needs.changed-files.outputs.has_code_changes == 'true'
    needs: [changed-files, operator, vllm, trtllm, sglang]
    permissions:
      contents: read
    strategy:
      fail-fast: false
      # Run matrix jobs sequentially to prevent a Helm race condition
      # Parallel jobs conflict on ClusterRole ownership when installing the chart.
      # Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
      max-parallel: 1
      matrix:
        framework:
          - { name: vllm, test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
          - { name: trtllm, test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
          - { name: sglang, test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
    name: deploy-test-fault-tolerance (${{ matrix.framework.name }})
    env:
      DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
    steps:
      - name: Output Node Name
        shell: bash
        run: |
          echo ${K8S_NODE_NAME}
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Set namespace
        env:
          FRAMEWORK: ${{ matrix.framework.name }}
          # Secrets reach the script through the environment rather than being
          # spliced into the shell source.
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        run: |
          # Set namespace using the framework name.
          # Values appended to $GITHUB_ENV only become visible in LATER steps,
          # so a shell variable is kept for the commands in this step as well.
          NAMESPACE="gh-id-${{ github.run_id }}-ft-${FRAMEWORK}"
          echo "NAMESPACE=${NAMESPACE}" >> "$GITHUB_ENV"
          set -x

          # Setup kubeconfig
          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
          kubectl config current-context
      - name: Deploy Operator
        env:
          # Passed via env and quoted at the point of use so that special
          # characters in a secret cannot be interpreted by the shell.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
          AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }}
          AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig

          # Create a namespace for this job
          echo "Creating an ephemeral namespace..."
          kubectl delete namespace "$NAMESPACE" || true
          kubectl create namespace "$NAMESPACE" || true
          echo "Attaching the labels for secrets and cleanup"
          kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

          # Set the namespace as default
          kubectl config set-context --current --namespace="$NAMESPACE"

          # Check if Istio is installed
          kubectl get pods -n istio-system
          # Check if default storage class exists
          kubectl get storageclass

          # Install Helm chart
          export VIRTUAL_ENV=/opt/dynamo/venv
          export KUBE_NS=$NAMESPACE
          export ISTIO_ENABLED=true
          export ISTIO_GATEWAY=istio-system/ingress-alb
          export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
          export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}

          # Install dynamo env secrets
          kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "$KUBE_NS" || true
          # Create docker pull secret for operator image
          kubectl create secret docker-registry docker-imagepullsecret --docker-server="${AZURE_ACR_HOSTNAME}" --docker-username="${AZURE_ACR_USER}" --docker-password="${AZURE_ACR_PASSWORD}" --namespace="${NAMESPACE}"
          # Install helm dependencies
          helm repo add bitnami https://charts.bitnami.com/bitnami
          cd deploy/cloud/helm/platform/
          helm dep build .
          # Install platform with namespace restriction for single profile testing
          # (the --set values containing [0] are quoted so the shell never
          # treats the brackets as a glob pattern)
          helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
            --set dynamo-operator.namespaceRestriction.enabled=true \
            --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
            --set "dynamo-operator.controllerManager.manager.image.repository=${AZURE_ACR_HOSTNAME}/ai-dynamo/dynamo" \
            --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
            --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
            --timeout 10m --wait
          # Wait for all deployments to be ready
          timeout 300s kubectl rollout status deployment -n "$NAMESPACE" --watch
          cd -

          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace="$NAMESPACE"
      - name: Run Fault Tolerance Tests
        id: run-ft-tests
        env:
          FRAMEWORK: ${{ matrix.framework.name }}
          AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        run: |
          set -x
          export KUBECONFIG=$(pwd)/.kubeconfig
          export NAMESPACE=$NAMESPACE
          export IMAGE="${AZURE_ACR_HOSTNAME}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"

          echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
          echo "Using namespace: $NAMESPACE"
          echo "Using image: $IMAGE"

          # Install python3-venv package if not already installed
          sudo apt-get update && sudo apt-get install -y python3-venv

          # Set up Python virtual environment and install test dependencies
          python3 -m venv venv
          source venv/bin/activate
          pip install --upgrade pip
          pip install -r container/deps/requirements.test.txt
          pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic

          # Create test-results directory
          mkdir -p test-results

          # Run the pytest command with JUnit XML output
          set +e  # Don't exit on test failures
          pytest tests/fault_tolerance/deploy/test_deployment.py \
            -m 'k8s and fault_tolerance' \
            -k '${{ matrix.framework.test_scenario }}' \
            -s -v \
            --namespace "${NAMESPACE}" \
            --image "${IMAGE}" \
            --client-type legacy \
            --junitxml=test-results/pytest_ft_report.xml \
            --tb=short

          TEST_EXIT_CODE=$?
          echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> "$GITHUB_ENV"
          echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"

          exit ${TEST_EXIT_CODE}
        # NOTE(review): with continue-on-error the job stays green when the tests
        # fail, and no later step in this job reads TEST_EXIT_CODE. Presumably
        # the tests are informational for now; confirm this is intended.
        continue-on-error: true

      - name: Process Fault Tolerance Test Results
        if: always()
        run: |
          set -x

          # Rename JUnit XML with unique naming if it exists
          if [ -f "test-results/pytest_ft_report.xml" ]; then
            mv "test-results/pytest_ft_report.xml" "test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml"
            echo "✅ JUnit XML report renamed with unique identifier"
          else
            echo "⚠️  JUnit XML report not found"
          fi

      - name: Upload Fault Tolerance Test Results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: test-results-${{ matrix.framework.name }}-fault_tolerance-amd64-${{ github.run_id }}-${{ job.check_run_id }}
          path: test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml
          retention-days: 7

      - name: Cleanup
        if: always()
        timeout-minutes: 5
        env:
          KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        run: |
          # NAMESPACE is exported by the "Set namespace" step. If that step never
          # ran there is nothing to clean up, and the commands below must not be
          # run against whatever namespace the context happens to default to.
          if [ -z "${NAMESPACE:-}" ]; then
            echo "NAMESPACE is not set; skipping cleanup."
            exit 0
          fi

          echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
          chmod 600 .kubeconfig
          export KUBECONFIG=$(pwd)/.kubeconfig
          kubectl config set-context --current --namespace="$NAMESPACE" --kubeconfig "${KUBECONFIG}"

          # For debugging purposes, list all the resources before we uninstall.
          # Diagnostics are best-effort: the step runs under `bash -e`, so a
          # failure here must not stop the deletions below.
          kubectl get all || true

          echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
          kubectl delete dynamographdeployments --all -n "$NAMESPACE" || true

          # Uninstall the helm chart
          helm ls || true
          helm uninstall dynamo-platform || true

          echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
          kubectl delete namespace "$NAMESPACE" || true
          echo "Namespace $NAMESPACE completed."

  # Collects the build-metrics-* and test-results-* artifacts and runs the
  # metrics upload script for this workflow.
  upload-workflow-metrics:
    name: Upload Workflow Metrics
    # Wait for the status check, which in turn waits for all build jobs.
    needs: backend-status-check
    # Always run, even if other jobs fail.
    if: always()
    runs-on: gitlab

    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.x"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      # Both downloads tolerate missing artifacts (continue-on-error).
      - name: Download build metrics
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          path: build-metrics/
          pattern: "build-metrics-*"
          merge-multiple: true

      - name: Download test results
        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
          path: test-results/
          pattern: "test-results-*"
          merge-multiple: true

      - name: Upload Complete Workflow Metrics
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Workflow, job and step index configuration
          WORKFLOW_INDEX: ${{ secrets.WORKFLOW_INDEX }}
          JOB_INDEX: ${{ secrets.JOB_INDEX }}
          STEPS_INDEX: ${{ secrets.STEPS_INDEX }}
          # Container and test index configuration
          CONTAINER_INDEX: ${{ secrets.CONTAINER_INDEX }}
          TEST_INDEX: ${{ secrets.TEST_INDEX }}
        # Upload complete workflow metrics including container metrics
        run: python3 .github/workflows/upload_complete_workflow_metrics.py

  # Manually triggered job: installs the platform Helm chart, using the
  # operator image tagged for this commit, into a fresh namespace and
  # publishes that namespace name as the NAMESPACE job output.
  deploy-operator:
    runs-on: cpu-amd-m5-2xlarge
    # TODO: Uncomment this when we have a way to test the deploy-operator job in CI.
    #if: needs.changed-files.outputs.has_code_changes == 'true'
    # github.event.inputs values are strings, and the non-empty string 'false'
    # is truthy in an `if:` expression, so compare against 'true' explicitly.
    # On push events the input is absent and the comparison is false.
    if: github.event.inputs.run_deploy_operator == 'true'
    needs: [changed-files, operator, vllm, sglang, trtllm]
    env:
      DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
    outputs:
      NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
    steps:
    - name: Output Node Name
      shell: bash
      run: |
        echo ${K8S_NODE_NAME}
    - uses: actions/checkout@v4
    - name: Deploy Operator
      id: deploy-operator-step
      env:
        BRANCH: ${{ github.ref_name }}
        # Secrets reach the script through the environment and are quoted at
        # the point of use, so special characters in a secret cannot be
        # interpreted by the shell.
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }}
        AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
      run: |
        set -x

        # Set namespace
        # Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
        BRANCH_SANITIZED="${BRANCH//\//-}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
        # A manual dispatch can run from any branch, so also fold the name to
        # lowercase and replace every remaining character outside [a-z0-9-].
        BRANCH_SANITIZED="${BRANCH_SANITIZED,,}"
        BRANCH_SANITIZED="${BRANCH_SANITIZED//[^a-z0-9-]/-}"
        NAMESPACE_PREFIX="gh-id-${{ github.run_id }}-"
        # Namespace names are limited to 63 characters: trim the branch part
        # so that prefix + branch + "-dt" always fits.
        BRANCH_SANITIZED="${BRANCH_SANITIZED:0:$((63 - ${#NAMESPACE_PREFIX} - 3))}"
        NAMESPACE="${NAMESPACE_PREFIX}${BRANCH_SANITIZED}-dt"
        echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"

        # Setup kubeconfig
        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace="$NAMESPACE" --kubeconfig "${KUBECONFIG}"
        kubectl config current-context

        # Create a namespace for this job
        echo "Creating an ephemeral namespace..."
        kubectl create namespace "$NAMESPACE"
        echo "Attaching the labels for secrets and cleanup"
        kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true

        # Set the namespace as default
        kubectl config set-context --current --namespace="$NAMESPACE"

        # Check if Istio is installed
        kubectl get pods -n istio-system
        # Check if default storage class exists
        kubectl get storageclass

        # Install Helm chart
        export VIRTUAL_ENV=/opt/dynamo/venv
        export KUBE_NS=$NAMESPACE
        export ISTIO_ENABLED=true
        export ISTIO_GATEWAY=istio-system/ingress-alb
        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
        export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}

        # Install dynamo env secrets
        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "$KUBE_NS" || true
        # Create docker pull secret for operator image
        kubectl create secret docker-registry docker-imagepullsecret --docker-server="${AZURE_ACR_HOSTNAME}" --docker-username="${AZURE_ACR_USER}" --docker-password="${AZURE_ACR_PASSWORD}" --namespace="${NAMESPACE}"
        # Install helm dependencies
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/cloud/helm/platform/
        helm dep build .
        # Install platform with namespace restriction for single profile testing
        # (the --set values containing [0] are quoted so the shell never
        # treats the brackets as a glob pattern)
        helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
          --set "dynamo-operator.controllerManager.manager.image.repository=${AZURE_ACR_HOSTNAME}/ai-dynamo/dynamo" \
          --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
          --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret"
        # Wait for all deployments to be ready
        timeout 300s kubectl rollout status deployment -n "$NAMESPACE" --watch

  deploy-test-vllm:
    runs-on: cpu-amd-m5-2xlarge
    # TODO: Uncomment this when we have a way to test the deploy-test-vllm job in CI.
    #if: needs.changed-files.outputs.has_code_changes == 'true'
    if: github.event.inputs.run_deploy_operator
    needs: [changed-files, deploy-operator, vllm]
    permissions:
      contents: read
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        profile:
          - agg
          - agg_router
          - disagg
          - disagg_router
    name: deploy-test-vllm (${{ matrix.profile }})
    env:
      FRAMEWORK: vllm
      DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
      DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
      MODEL_NAME: "Qwen/Qwen3-0.6B"
    steps: &deploy-test-steps
    - name: Output Node Name
      shell: bash
      run: |
        echo ${K8S_NODE_NAME}
    - uses: actions/checkout@v4
    - name: Setup Kubeconfig
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        set -x
        # Setup kubeconfig
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        kubectl config get-contexts
    - name: Run Tests
      id: run-tests
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        set -x
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE

        # Redirect all output to a log file while still showing it
        exec > >(tee -a test-output.log) 2>&1

        cd examples/backends/$FRAMEWORK
        export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
        export KUBE_NS=$NAMESPACE
        export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE)
        echo "GRAPH_NAME=${GRAPH_NAME}" >> $GITHUB_ENV
        # Update the deployment file in-place
        yq -i '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' $DEPLOYMENT_FILE

        # Debug: Show updated deployment file
        echo "=== UPDATED DEPLOYMENT FILE ==="
        cat $DEPLOYMENT_FILE

        # Apply the updated file
        kubectl apply -n $KUBE_NS -f $DEPLOYMENT_FILE

        # --- Wait for all pods in the dynamo graph deployment to be ready ---
        sleep 20
        # Get the deployment name from the file
        export GRAPH_NAME=$(yq e '.metadata.name' $DEPLOYMENT_FILE)
        echo "Waiting for all pods with label nvidia.com/dynamo-graph-deployment-name: $GRAPH_NAME"
        # Wait for all pods with the deployment label to be ready
        kubectl wait --for=condition=ready pod -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n ${KUBE_NS} --timeout=1800s

        # Debug: Show final pod statuses for the deployment
        echo "=== FINAL POD STATUSES ==="
        kubectl get pods -l "nvidia.com/dynamo-graph-deployment-name=$GRAPH_NAME" -n $KUBE_NS -o wide
        echo ""

        kubectl get all -n $KUBE_NS
        export FRONTEND_POD=$(kubectl get pods -n ${KUBE_NS} -l nvidia.com/dynamo-component-type=frontend,nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME} | tail -n1 | awk '{print $1}')
        export CONTAINER_PORT=$(kubectl get pod $FRONTEND_POD -n ${KUBE_NS} -o jsonpath='{.spec.containers[0].ports[?(@.name=="http")].containerPort}')
        echo "Container port is ${CONTAINER_PORT}"
        kubectl port-forward pod/$FRONTEND_POD 8000:${CONTAINER_PORT} -n ${KUBE_NS} &
        export LLM_URL="http://localhost:8000"
        sleep 10  # Give port-forward time to establish the connection
        echo "LLM URL: ${LLM_URL}"
        echo "MODEL NAME: ${MODEL_NAME}"
        # Wait until the model is available in the /v1/models response
        MAX_ATTEMPTS=30
        ATTEMPT=1
        while [ $ATTEMPT -le $MAX_ATTEMPTS ]; do
          MODELS_RESPONSE=$(curl -s --retry 5 --retry-delay 2 --retry-connrefused "${LLM_URL}/v1/models")
          if echo "$MODELS_RESPONSE" | jq -e --arg MODEL_NAME "$MODEL_NAME" '.data[]?.id == $MODEL_NAME' >/dev/null 2>&1; then
            echo "Model $MODEL_NAME is available in /v1/models"
            break
          fi
          echo "Waiting for model $MODEL_NAME to be available in /v1/models... (attempt $ATTEMPT/$MAX_ATTEMPTS)"
          sleep 5
          ATTEMPT=$((ATTEMPT + 1))
        done
        if [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then
          echo "Model $MODEL_NAME not found in /v1/models after $MAX_ATTEMPTS attempts"
          echo "Last response: $MODELS_RESPONSE"
          exit 1
        fi
        RESPONSE=$(curl -s -N --no-buffer --retry 10 --retry-delay 5 --retry-connrefused -X POST "${LLM_URL}/v1/chat/completions" \
          -H 'accept: text/event-stream' \
          -H 'Content-Type: application/json' \
          -d '{
            "model": "'"${MODEL_NAME:-Qwen/Qwen3-0.6B}"'",
            "messages": [
            {
                "role": "user",
                "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
            }
            ],
            "stream":false,
            "max_tokens": 30,
            "temperature": 0.0
          }' 2>&1)
        echo "Response: $RESPONSE"
        TEST_RESULT=0
        if ! echo "$RESPONSE" | jq -e . >/dev/null 2>&1; then
          echo "Test failed: Response is not valid JSON"
          echo "Got: $RESPONSE"
          TEST_RESULT=1
        elif ! echo "$RESPONSE" | jq -e '.choices[0].message.role == "assistant"' >/dev/null 2>&1; then
          echo "Test failed: Message role is not 'assistant'"
          echo "Got: $(echo "$RESPONSE" | jq '.choices[0].message.role')"
          TEST_RESULT=1
        elif ! echo "$RESPONSE" | jq -e '.model == "'"${MODEL_NAME}"'"' >/dev/null 2>&1; then
          echo "Test failed: Model name is incorrect"
          echo "Got: $(echo "$RESPONSE" | jq '.model')"
          TEST_RESULT=1
        elif ! echo "$RESPONSE" | jq -e '.choices[0].message.content | length > 100' >/dev/null 2>&1; then
          echo "Test failed: Response content length is not greater than 100 characters"
          echo "Got length: $(echo "$RESPONSE" | jq '.choices[0].message.content | length')"
          TEST_RESULT=1
        else
          echo "Test passed: Response matches expected format and content"
        fi
        exit $TEST_RESULT
      continue-on-error: true

    # Stages the test log under test-results/ with a name that is unique per
    # framework, profile, workflow run, and job.
    - name: Process Deployment Test Results
      if: always()
      run: |
        set -x

        # Make sure the staging directory exists
        mkdir -p test-results

        # Warn (without failing) when no log was produced; otherwise stage a
        # uniquely named copy of it
        if [ ! -f "test-output.log" ]; then
          echo "⚠️  test-output.log not found"
        else
          cp test-output.log "test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log"
          echo "✅ Test output log copied to test-results/"
        fi

    # Publishes the staged deployment test log as a workflow artifact.
    - name: Upload Deployment Test Results
      # Run regardless of earlier step outcomes
      if: always()
      uses: actions/upload-artifact@v4
      with:
        # Artifact name and file path both embed framework, profile, run id,
        # and check-run id
        name: test-results-${{ env.FRAMEWORK }}-deploy-${{ matrix.profile }}-amd64-${{ github.run_id }}-${{ job.check_run_id }}
        path: test-results/deploy_test_output_${{ env.FRAMEWORK }}_${{ matrix.profile }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.log
        retention-days: 7

    # Best-effort removal of this job's DynamoGraphDeployment from the
    # namespace reported by the deploy-operator job.
    - name: Cleanup
      if: always()
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      timeout-minutes: 5
      run: |
        set -x
        # Point kubectl at the kubeconfig file in the working directory and
        # make the target namespace the context default
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE

        # Snapshot of what exists before deletion, for debugging
        kubectl get all

        echo "Deleting DynamoGraphDeployments for this job in namespace $NAMESPACE..."
        # NOTE(review): GRAPH_NAME is not assigned in this step; presumably an
        # earlier step persists it (e.g. via GITHUB_ENV) - confirm.
        # `|| true` keeps a failed delete from failing the step.
        kubectl delete dynamographdeployments $GRAPH_NAME --namespace $NAMESPACE || true

  # Runs the shared deploy-test steps with FRAMEWORK=sglang, one matrix entry
  # per deployment profile.
  deploy-test-sglang:
    runs-on: cpu-amd-m5-2xlarge
    # TODO: Uncomment this when we have a way to test the deploy-test-sglang job in CI.
    #if: needs.changed-files.outputs.has_code_changes == 'true'
    # `github.event.inputs` carries workflow_dispatch booleans as the strings
    # 'true'/'false', and any non-empty string is truthy in an `if:`, so the
    # value must be compared explicitly. On push events the input is absent and
    # the comparison is false.
    if: github.event.inputs.run_deploy_operator == 'true'
    needs: [changed-files, deploy-operator, sglang]
    permissions:
      contents: read
    strategy:
      # Let the remaining profiles run even if one fails
      fail-fast: false
      # Profiles run one at a time
      max-parallel: 1
      matrix:
        profile:
          - agg
          - agg_router
    name: deploy-test-sglang (${{ matrix.profile }})
    env:
      FRAMEWORK: sglang
      DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
      # One deployment manifest per profile
      DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
      MODEL_NAME: "Qwen/Qwen3-0.6B"
    # Step list is reused through a YAML alias; the `deploy-test-steps` anchor
    # is defined elsewhere in this file.
    steps: *deploy-test-steps

  # Runs the shared deploy-test steps with FRAMEWORK=trtllm, one matrix entry
  # per deployment profile.
  deploy-test-trtllm:
    runs-on: cpu-amd-m5-2xlarge
    # TODO: Uncomment this when we have a way to test the deploy-test-trtllm job in CI.
    #if: needs.changed-files.outputs.has_code_changes == 'true'
    # `github.event.inputs` carries workflow_dispatch booleans as the strings
    # 'true'/'false', and any non-empty string is truthy in an `if:`, so the
    # value must be compared explicitly. On push events the input is absent and
    # the comparison is false.
    if: github.event.inputs.run_deploy_operator == 'true'
    needs: [changed-files, deploy-operator, trtllm]
    permissions:
      contents: read
    strategy:
      # Let the remaining profiles run even if one fails
      fail-fast: false
      # Profiles run one at a time
      max-parallel: 1
      matrix:
        profile:
          - agg
          - agg_router
          - disagg
          - disagg_router
    name: deploy-test-trtllm (${{ matrix.profile }})
    env:
      FRAMEWORK: trtllm
      DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
      # One deployment manifest per profile
      DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
      MODEL_NAME: "Qwen/Qwen3-0.6B"
    # Step list is reused through a YAML alias; the `deploy-test-steps` anchor
    # is defined elsewhere in this file.
    steps: *deploy-test-steps

  # Final teardown after the deploy-test jobs: deletes every
  # DynamoGraphDeployment in the namespace reported by deploy-operator,
  # uninstalls the dynamo-platform Helm release, and deletes the namespace.
  cleanup:
    runs-on: cpu-amd-m5-2xlarge
    # TODO: Uncomment the below if statement when we have a way to test the cleanup job in CI.
    # if: always()
    # `github.event.inputs` carries workflow_dispatch booleans as the strings
    # 'true'/'false', and any non-empty string is truthy in an `if:`, so the
    # value must be compared explicitly. On push events the input is absent and
    # the comparison is false.
    # NOTE(review): with no status function in this condition, the job is also
    # skipped whenever a job listed in `needs` fails or is skipped.
    if: github.event.inputs.run_deploy_operator == 'true'
    needs: [changed-files, deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm]
    steps:
    - name: Output Node Name
      shell: bash
      run: |
        echo ${K8S_NODE_NAME}
    # Pinned to the same commit the changed-files job uses for checkout
    - uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Setup Kubeconfig
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        set -x
        # Setup kubeconfig
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        kubectl config current-context
    - name: Cleanup
      timeout-minutes: 5
      env:
        NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
      run: |
        set -x
        # Reuse the kubeconfig written by the "Setup Kubeconfig" step; exported
        # variables do not carry over between steps, so KUBECONFIG is set again.
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE

        # For debugging purposes, list all the resources before we uninstall
        kubectl get all

        # Each teardown command is best-effort: `|| true` lets the remaining
        # ones run even if an earlier one fails.
        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
        kubectl delete dynamographdeployments --all -n $NAMESPACE || true

        # Uninstall the helm chart
        helm ls
        helm uninstall dynamo-platform --namespace $NAMESPACE || true

        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
        kubectl delete namespace $NAMESPACE || true
        echo "Namespace $NAMESPACE completed."