ci: Add Fault Tolerance K8s test (#3801)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

ci: Add Fault Tolerance K8s test (#3801)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
d1cf3c2c · Indrajit Bhosale · GitHub · 3ff5fa13 · d1cf3c2c · d1cf3c2c
Unverified Commit d1cf3c2c authored Nov 07, 2025 by Indrajit Bhosale Committed by GitHub Nov 07, 2025
5 changed files
--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -304,6 +304,145 @@ jobs:
          test_type: "e2e, gpu_1"
          platform_arch: ${{ matrix.platform.arch }}
+  deploy-test-fault-tolerance:
+    runs-on: cpu-amd-m5-2xlarge
+    if: needs.changed-files.outputs.has_code_changes == 'true'  # skipped when changed-files reports no code changes
+    needs: [changed-files, operator, vllm, trtllm, sglang]  # assumes these jobs publish the github.sha-tagged images pulled below - TODO confirm
+    permissions:
+      contents: read
+    strategy:
+      fail-fast: false  # a failing framework does not cancel the remaining matrix entries
+      # Run the matrix entries one at a time to avoid a Helm race condition:
+      # parallel entries conflict over ownership of the operator ClusterRole when each installs the chart.
+      # Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
+      max-parallel: 1
+      matrix:
+        framework:
+          - { name: vllm, test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
+          - { name: trtllm, test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
+          - { name: sglang, test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
+    name: deploy-test-fault-tolerance (${{ matrix.framework.name }})  # one job instance per matrix framework
+    env:
+      DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com  # appended to the namespace to form DYNAMO_CLOUD in the deploy step
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
+      - name: Set namespace
+        run: |
+          # Build a namespace name that is unique per workflow run and per framework
+          export FRAMEWORK=${{ matrix.framework.name }}
+          export NAMESPACE="gh-job-id-${{ github.run_id }}-ft-${FRAMEWORK}"
+          # Entries appended to GITHUB_ENV only become visible in later steps, so
+          # NAMESPACE is also kept as a shell variable for the kubectl call below.
+          echo "NAMESPACE=${NAMESPACE}" >> "$GITHUB_ENV"
+          set -x
+          # Setup kubeconfig
+          echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
+          chmod 600 .kubeconfig
+          export KUBECONFIG=$(pwd)/.kubeconfig
+          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
+          kubectl config current-context
+      - name: Deploy Operator
+        env:
+          # Secrets reach the script as environment variables instead of being pasted
+          # into the script text, so shell metacharacters in a value cannot change the command.
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          ACR_USER: ${{ secrets.AZURE_ACR_USER }}
+          ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
+        run: |
+          set -x
+          export KUBECONFIG=$(pwd)/.kubeconfig
+          # Create a namespace for this job
+          echo "Creating an ephemeral namespace..."
+          kubectl delete namespace "${NAMESPACE}" || true
+          kubectl create namespace "${NAMESPACE}" || true
+          echo "Attaching the labels for secrets and cleanup"
+          kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
+          # Set the namespace as default
+          kubectl config set-context --current --namespace="${NAMESPACE}"
+          # Check if Istio is installed
+          kubectl get pods -n istio-system
+          # Check if default storage class exists
+          kubectl get storageclass
+          # Install Helm chart
+          export VIRTUAL_ENV=/opt/dynamo/venv
+          export KUBE_NS="${NAMESPACE}"
+          export ISTIO_ENABLED=true
+          export ISTIO_GATEWAY=istio-system/ingress-alb
+          export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
+          export DYNAMO_CLOUD="https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}"
+          # Install dynamo env secrets
+          kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN="${HF_TOKEN}" -n "${KUBE_NS}" || true
+          # Create docker pull secret for operator image
+          kubectl create secret docker-registry docker-imagepullsecret --docker-server="${ACR_HOSTNAME}" --docker-username="${ACR_USER}" --docker-password="${ACR_PASSWORD}" --namespace="${NAMESPACE}"
+          # Install helm dependencies
+          helm repo add bitnami https://charts.bitnami.com/bitnami
+          cd deploy/cloud/helm/platform/
+          helm dep build .
+          # Install platform with namespace restriction for single profile testing
+          # (the [0] index arguments are quoted so the shell never treats them as glob patterns)
+          helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
+            --set dynamo-operator.namespaceRestriction.enabled=true \
+            --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
+            --set dynamo-operator.controllerManager.manager.image.repository="${ACR_HOSTNAME}/ai-dynamo/dynamo" \
+            --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
+            --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
+            --timeout 10m --wait
+          # Wait for all deployments to be ready
+          timeout 300s kubectl rollout status deployment -n "${NAMESPACE}" --watch
+          cd -
+          export KUBECONFIG=$(pwd)/.kubeconfig
+          kubectl config set-context --current --namespace="${NAMESPACE}"
+      - name: Run Fault Tolerance Tests
+        env:
+          # Registry hostname is passed through the environment rather than pasted into the script text
+          ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
+        run: |
+          set -x
+          export KUBECONFIG=$(pwd)/.kubeconfig
+          # NAMESPACE is already in the environment (written to GITHUB_ENV by the "Set namespace" step)
+          export FRAMEWORK=${{ matrix.framework.name }}
+          export IMAGE="${ACR_HOSTNAME}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
+          echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
+          echo "Using namespace: $NAMESPACE"
+          echo "Using image: $IMAGE"
+          # Install python3-venv package if not already installed
+          sudo apt-get update && sudo apt-get install -y python3-venv
+          # Set up Python virtual environment and install test dependencies
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install --upgrade pip
+          pip install -r container/deps/requirements.test.txt
+          pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic
+          # Run the pytest command (tests orchestrate K8s, don't need dynamo package)
+          # -m selects tests by marker, -k narrows the run to this matrix entry's scenario
+          pytest tests/fault_tolerance/deploy/test_deployment.py \
+            -m 'k8s and fault_tolerance' \
+            -k '${{ matrix.framework.test_scenario }}' \
+            -s -v \
+            --namespace "${NAMESPACE}" \
+            --image "${IMAGE}" \
+            --client-type legacy
+      - name: Cleanup
+        if: always()
+        timeout-minutes: 5
+        run: |
+          # NAMESPACE is exported by the "Set namespace" step. If it is empty, nothing was
+          # created for this job, and the commands below must not fall through to whatever
+          # namespace the kubeconfig context happens to point at.
+          if [ -z "${NAMESPACE:-}" ]; then
+            echo "NAMESPACE is not set, skipping cleanup."
+            exit 0
+          fi
+          echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
+          chmod 600 .kubeconfig
+          export KUBECONFIG=$(pwd)/.kubeconfig
+          kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
+          # For debugging purposes, list all the resources before we uninstall.
+          # Best-effort: the step shell exits on the first failing command, and a failed
+          # listing must not prevent the namespace deletion further down.
+          kubectl get all || true
+          echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
+          kubectl delete dynamographdeployments --all -n "${NAMESPACE}" || true
+          # Uninstall the helm chart (namespace given explicitly instead of relying on the context default)
+          helm ls --namespace "${NAMESPACE}" || true
+          helm uninstall dynamo-platform --namespace "${NAMESPACE}" || true
+          echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
+          kubectl delete namespace "${NAMESPACE}" || true
+          echo "Namespace $NAMESPACE completed."
  # Upload metrics for this workflow and all its jobs
  upload-workflow-metrics:
    name: Upload Workflow Metrics

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -200,7 +200,9 @@ markers = [
    "h100: marks tests to run on H100",
    "kvbm: marks tests for KV behavior and model determinism",
    "model: model id used by a test or parameter",
-    "custom_build: marks tests that require custom builds or special setup (e.g., MoE models)"
+    "custom_build: marks tests that require custom builds or special setup (e.g., MoE models)",
+    "k8s: marks tests as requiring Kubernetes",
+    "fault_tolerance: marks tests as fault tolerance tests"
 ]
 # Linting/formatting

--- a/tests/fault_tolerance/deploy/legacy_client.py
+++ b/tests/fault_tolerance/deploy/legacy_client.py
@@ -274,7 +274,7 @@ def client(
                )
                # Log result
-                logger.info(
+                logger.debug(
                    f"Request: {i} Pod {pod_name} Local Port {port} "
                    f"Status: {result['results'][-1]['status']} "
                    f"Latency: {result['results'][-1]['request_elapsed_time']}"

--- a/tests/fault_tolerance/deploy/test_deployment.py
+++ b/tests/fault_tolerance/deploy/test_deployment.py
@@ -340,6 +340,8 @@ def results_summary():
        logging.error(f"Failed to parse combined results: {e}")
+@pytest.mark.k8s
+@pytest.mark.fault_tolerance
 @pytest.mark.e2e
 @pytest.mark.slow
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")

--- a/tests/utils/managed_deployment.py
+++ b/tests/utils/managed_deployment.py
@@ -18,7 +18,22 @@ from kr8s.objects import Pod as kr8s_Pod
 from kr8s.objects import Service as kr8s_Service
 from kubernetes_asyncio import client, config
-from dynamo.common.utils.paths import get_workspace_dir
+def _get_workspace_dir() -> str:
+    """Get workspace directory without depending on dynamo.common package.
+
+    This allows tests to run without requiring dynamo package to be installed.
+
+    Returns:
+        Absolute path of the nearest ancestor directory of this file that
+        contains a pyproject.toml. If no ancestor below the filesystem root
+        has one, the directory three dirname() steps above this file's path.
+    """
+    # Start from the directory containing this file and walk up towards the workspace root
+    current = os.path.dirname(os.path.abspath(__file__))
+    while current != os.path.dirname(current):  # Stop on reaching the filesystem root (the root itself is not checked)
+        # The first directory holding a pyproject.toml is taken as the workspace root
+        if os.path.exists(os.path.join(current, "pyproject.toml")):
+            return current
+        current = os.path.dirname(current)
+    # Fallback: three dirname() steps from this file's path, i.e. the parent of
+    # tests/ when this file lives at tests/utils/managed_deployment.py
+    return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 class ServiceSpec:
@@ -877,8 +892,8 @@ async def main():
        datefmt=DATE_FORMAT,  # ISO 8601 UTC format
    )
-    # Get workspace directory using centralized logic
+    # Get workspace directory
-    workspace_dir = get_workspace_dir()
+    workspace_dir = _get_workspace_dir()
    deployment_spec = DeploymentSpec(
        os.path.join(workspace_dir, "examples/backends/vllm/deploy/agg.yaml")