Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
bf9c93c7
Unverified
Commit
bf9c93c7
authored
Dec 19, 2025
by
Anant Sharma
Committed by
GitHub
Dec 19, 2025
Browse files
ci: remove fault tolerance tests from pr checks (#5030)
Signed-off-by:
Anant Sharma
<
anants@nvidia.com
>
parent
4106b90f
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
0 additions
and
177 deletions
+0
-177
.github/workflows/container-validation-backends.yml
.github/workflows/container-validation-backends.yml
+0
-177
No files found.
.github/workflows/container-validation-backends.yml
View file @
bf9c93c7
...
...
@@ -314,183 +314,6 @@ jobs:
test_type
:
"
pre_merge"
platform_arch
:
${{ matrix.platform.arch }}
deploy-test-fault-tolerance
:
runs-on
:
cpu-amd-m5-2xlarge
if
:
needs.changed-files.outputs.has_code_changes == 'true'
needs
:
[
changed-files
,
operator
,
vllm
,
trtllm
,
sglang
]
permissions
:
contents
:
read
strategy
:
fail-fast
:
false
# Run matrix jobs sequentially to prevent a Helm race condition
# Parallel jobs conflict on ClusterRole ownership when installing the chart.
# Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
max-parallel
:
1
matrix
:
framework
:
-
{
name
:
vllm
,
test_scenario
:
vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
}
-
{
name
:
trtllm
,
test_scenario
:
trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
}
-
{
name
:
sglang
,
test_scenario
:
sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
}
name
:
deploy-test-fault-tolerance (${{ matrix.framework.name }})
env
:
DYNAMO_INGRESS_SUFFIX
:
dev.aire.nvidia.com
steps
:
-
name
:
Output Node Name
shell
:
bash
run
:
|
# Print the node this job runs on (for debugging). Quoted so the value is
# printed verbatim instead of being word-split or glob-expanded.
echo "${K8S_NODE_NAME}"
-
name
:
Checkout code
uses
:
actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955
# v4.3.0
-
name
:
Set namespace
run
:
|
# Derive the per-run namespace for this matrix entry, publish it to later
# steps, and point the CI kubeconfig context at it.
# Set namespace using test scenario
export FRAMEWORK=${{ matrix.framework.name }}
# Values appended to GITHUB_ENV only become visible in *subsequent* steps, so
# NAMESPACE is also exported in this shell for the kubectl call below
# (previously that call ran with an empty --namespace= value).
export NAMESPACE="gh-id-${{ github.run_id }}-ft-${FRAMEWORK}"
echo "NAMESPACE=${NAMESPACE}" >> "$GITHUB_ENV"
# Setup kubeconfig
# Tracing is enabled only after the credential has been written so that
# `set -x` never echoes the secret; umask keeps the file private from the
# moment it is created.
( umask 077; echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig )
chmod 600 .kubeconfig
set -x
export KUBECONFIG="$(pwd)/.kubeconfig"
kubectl config set-context --current --namespace="$NAMESPACE" --kubeconfig "${KUBECONFIG}"
kubectl config current-context
-
name
:
Deploy Operator
run
:
|
# Install the dynamo-platform Helm chart (with the operator image built for
# this commit) into a fresh per-run namespace and wait for it to roll out.
# Reads NAMESPACE and DYNAMO_INGRESS_SUFFIX from the environment.
set -x
export KUBECONFIG="$(pwd)/.kubeconfig"
# Create a namespace for this job
echo "Creating an ephemeral namespace..."
# Both calls are best-effort: delete fails when the namespace does not exist
# yet, create fails when it already does.
kubectl delete namespace "$NAMESPACE" || true
kubectl create namespace "$NAMESPACE" || true
echo "Attaching the labels for secrets and cleanup"
kubectl label namespaces "${NAMESPACE}" nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
# Set the namespace as default
kubectl config set-context --current --namespace="$NAMESPACE"
# Check if Istio is installed
kubectl get pods -n istio-system
# Check if default storage class exists
kubectl get storageclass
# Install Helm chart
export VIRTUAL_ENV=/opt/dynamo/venv
export KUBE_NS=$NAMESPACE
export ISTIO_ENABLED=true
export ISTIO_GATEWAY=istio-system/ingress-alb
export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
export DYNAMO_CLOUD="https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}"
# Tracing is suspended while credentials appear on the command line so that
# `set -x` does not echo them into the job log.
set +x
# Install dynamo env secrets
kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n "$KUBE_NS" || true
# Create docker pull secret for operator image
kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace="${NAMESPACE}"
set -x
# Install helm dependencies
helm repo add bitnami https://charts.bitnami.com/bitnami
cd deploy/cloud/helm/platform/
helm dep build .
# Install platform with namespace restriction for single profile testing.
# The `[0]` keys are quoted so the shell passes them to Helm literally instead
# of treating them as glob patterns.
helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
  --set dynamo-operator.namespaceRestriction.enabled=true \
  --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
  --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
  --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
  --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
  --timeout 10m --wait
# Wait for all deployments to be ready
timeout 300s kubectl rollout status deployment -n "$NAMESPACE" --watch
# Return to the directory the step started in before resolving .kubeconfig again.
cd -
export KUBECONFIG="$(pwd)/.kubeconfig"
kubectl config set-context --current --namespace="$NAMESPACE"
-
name
:
Run Fault Tolerance Tests
id
:
run-ft-tests
run
:
|
# Run the fault-tolerance pytest scenario selected by the job matrix against
# the per-run namespace, write a JUnit XML report, and finish the step with
# pytest's own exit status.
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
# Mark the already-set NAMESPACE for export so child processes inherit it.
export NAMESPACE=$NAMESPACE
export FRAMEWORK=${{ matrix.framework.name }}
# Image reference: <registry>/ai-dynamo/dynamo:<commit sha>-<framework>-amd64
export IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
echo "Using namespace: $NAMESPACE"
echo "Using image: $IMAGE"
# Install python3-venv package if not already installed
sudo apt-get update && sudo apt-get install -y python3-venv
# Set up Python virtual environment and install test dependencies
python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip
pip install -r container/deps/requirements.test.txt
pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic
# Create test-results directory
mkdir -p test-results
# Run the pytest command with JUnit XML output
# -m restricts the run to tests carrying both markers; -k narrows it to the
# single scenario named in the matrix entry.
set +e # Don't exit on test failures
pytest tests/fault_tolerance/deploy/test_deployment.py \
-m 'k8s and fault_tolerance' \
-k '${{ matrix.framework.test_scenario }}' \
-s -v \
--namespace ${NAMESPACE} \
--image ${IMAGE} \
--client-type legacy \
--junitxml=test-results/pytest_ft_report.xml \
--tb=short
# Capture pytest's status immediately, before another command overwrites $?.
TEST_EXIT_CODE=$?
# Record the status in GITHUB_ENV as well as using it as this step's result.
echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> $GITHUB_ENV
echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"
# Errexit was disabled above, so the step result is set explicitly here.
exit ${TEST_EXIT_CODE}
-
name
:
Process Fault Tolerance Test Results
if
:
always()
run
:
|
# Give the JUnit report a name unique to this framework / run / check run.
# The new name is the same path the following upload-artifact step reads.
set -x
# Rename JUnit XML with unique naming if it exists
if [ -f "test-results/pytest_ft_report.xml" ]; then
mv "test-results/pytest_ft_report.xml" "test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml"
echo "✅ JUnit XML report renamed with unique identifier"
else
# A missing report is only reported; this branch does not fail the step.
echo "⚠️ JUnit XML report not found"
fi
-
name
:
Upload Fault Tolerance Test Results
uses
:
actions/upload-artifact@v4
if
:
always()
with
:
name
:
test-results-${{ matrix.framework.name }}-fault_tolerance-amd64-${{ github.run_id }}-${{ job.check_run_id }}
path
:
test-results/pytest_ft_report_${{ matrix.framework.name }}_amd64_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days
:
7
-
name
:
Cleanup
if
:
always()
timeout-minutes
:
5
run
:
|
# Tear down what this job created: DynamoGraphDeployments, the Helm release
# and the per-run namespace. Each removal is best-effort so one failure does
# not leave the remaining resources behind.
# Without a namespace there is nothing of ours to remove; stop here rather
# than running the uninstall against whatever namespace the kubeconfig
# context currently points at.
if [ -z "${NAMESPACE:-}" ]; then
  echo "NAMESPACE is not set; skipping cleanup."
  exit 0
fi
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG="$(pwd)/.kubeconfig"
kubectl config set-context --current --namespace="$NAMESPACE" --kubeconfig "${KUBECONFIG}"
# For debugging purposes, list all the resources before we uninstall.
# The listings are informational only: a failure here must not abort the
# cleanup (GitHub Actions runs bash steps with -e by default).
kubectl get all || true
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
kubectl delete dynamographdeployments --all -n "$NAMESPACE" || true
# Uninstall the helm chart
helm ls || true
helm uninstall dynamo-platform || true
# Announce the deletion before issuing it; the final message is printed even
# when a best-effort command above failed.
echo "Deleting namespace $NAMESPACE..."
kubectl delete namespace "$NAMESPACE" || true
echo "Namespace $NAMESPACE cleanup completed."
deploy-operator
:
runs-on
:
cpu-amd-m5-2xlarge
if
:
needs.changed-files.outputs.has_code_changes == 'true'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment