Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d1cf3c2c
Unverified
Commit
d1cf3c2c
authored
Nov 07, 2025
by
Indrajit Bhosale
Committed by
GitHub
Nov 07, 2025
Browse files
ci: Add Fault Tolerance K8s test (#3801)
Signed-off-by:
Indrajit Bhosale
<
iamindrajitb@gmail.com
>
parent
3ff5fa13
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
163 additions
and
5 deletions
+163
-5
.github/workflows/container-validation-backends.yml
.github/workflows/container-validation-backends.yml
+139
-0
pyproject.toml
pyproject.toml
+3
-1
tests/fault_tolerance/deploy/legacy_client.py
tests/fault_tolerance/deploy/legacy_client.py
+1
-1
tests/fault_tolerance/deploy/test_deployment.py
tests/fault_tolerance/deploy/test_deployment.py
+2
-0
tests/utils/managed_deployment.py
tests/utils/managed_deployment.py
+18
-3
No files found.
.github/workflows/container-validation-backends.yml
View file @
d1cf3c2c
...
...
@@ -304,6 +304,145 @@ jobs:
test_type
:
"
e2e,
gpu_1"
platform_arch
:
${{ matrix.platform.arch }}
# Fault tolerance end-to-end job: for each framework in the matrix, install the
# Dynamo platform chart into a per-run namespace, run the Kubernetes
# fault-tolerance pytest scenario against it, then tear everything down.
deploy-test-fault-tolerance:
  runs-on: cpu-amd-m5-2xlarge
  if: needs.changed-files.outputs.has_code_changes == 'true'
  needs: [changed-files, operator, vllm, trtllm, sglang]
  permissions:
    contents: read
  strategy:
    fail-fast: false
    # Run matrix jobs sequentially to prevent a Helm race condition
    # Parallel jobs conflict on ClusterRole ownership when installing the chart.
    # Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
    max-parallel: 1
    matrix:
      framework:
        - { name: vllm, test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
        - { name: trtllm, test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
        - { name: sglang, test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod }
  name: deploy-test-fault-tolerance (${{ matrix.framework.name }})
  env:
    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
    - name: Set namespace
      run: |
        # Derive the per-run namespace from the framework under test
        export FRAMEWORK=${{ matrix.framework.name }}
        # Values appended to $GITHUB_ENV only become visible in later steps,
        # so export NAMESPACE here as well for the kubectl call below.
        export NAMESPACE="gh-job-id-${{ github.run_id }}-ft-${FRAMEWORK}"
        echo "NAMESPACE=${NAMESPACE}" >> $GITHUB_ENV
        set -x
        # Setup kubeconfig
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        kubectl config current-context
    - name: Deploy Operator
      run: |
        set -x
        export KUBECONFIG=$(pwd)/.kubeconfig
        # Create a namespace for this job
        echo "Creating an ephemeral namespace..."
        kubectl delete namespace $NAMESPACE || true
        kubectl create namespace $NAMESPACE || true
        echo "Attaching the labels for secrets and cleanup"
        kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
        # Set the namespace as default
        kubectl config set-context --current --namespace=$NAMESPACE
        # Check if Istio is installed
        kubectl get pods -n istio-system
        # Check if default storage class exists
        kubectl get storageclass
        # Install Helm chart
        export VIRTUAL_ENV=/opt/dynamo/venv
        export KUBE_NS=$NAMESPACE
        export ISTIO_ENABLED=true
        export ISTIO_GATEWAY=istio-system/ingress-alb
        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
        export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
        # Install dynamo env secrets
        kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true
        # Create docker pull secret for operator image
        kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE}
        # Install helm dependencies
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/cloud/helm/platform/
        helm dep build .
        # Install platform with namespace restriction for single profile testing
        helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \
          --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \
          --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
          --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret \
          --timeout 10m --wait
        # Wait for all deployments to be ready
        timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
        cd -
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE
    - name: Run Fault Tolerance Tests
      run: |
        set -x
        export KUBECONFIG=$(pwd)/.kubeconfig
        export NAMESPACE=$NAMESPACE
        export FRAMEWORK=${{ matrix.framework.name }}
        export IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
        echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
        echo "Using namespace: $NAMESPACE"
        echo "Using image: $IMAGE"
        # Install python3-venv package if not already installed
        sudo apt-get update && sudo apt-get install -y python3-venv
        # Set up Python virtual environment and install test dependencies
        python3 -m venv venv
        source venv/bin/activate
        pip install --upgrade pip
        pip install -r container/deps/requirements.test.txt
        pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic
        # Run the pytest command (tests orchestrate K8s, don't need dynamo package)
        pytest tests/fault_tolerance/deploy/test_deployment.py \
          -m 'k8s and fault_tolerance' \
          -k '${{ matrix.framework.test_scenario }}' \
          -s -v \
          --namespace ${NAMESPACE} \
          --image ${IMAGE} \
          --client-type legacy
    - name: Cleanup
      # Always run so the namespace is removed even when earlier steps fail
      if: always()
      timeout-minutes: 5
      run: |
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
        # For debugging purposes, list all the resources before we uninstall
        kubectl get all
        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
        kubectl delete dynamographdeployments --all -n $NAMESPACE || true
        # Uninstall the helm chart
        helm ls
        helm uninstall dynamo-platform || true
        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
        kubectl delete namespace $NAMESPACE || true
        echo "Namespace $NAMESPACE completed."
# Upload metrics for this workflow and all its jobs
upload-workflow-metrics
:
name
:
Upload Workflow Metrics
...
...
pyproject.toml
View file @
d1cf3c2c
...
...
@@ -200,7 +200,9 @@ markers = [
"h100: marks tests to run on H100"
,
"kvbm: marks tests for KV behavior and model determinism"
,
"model: model id used by a test or parameter"
,
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)"
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)"
,
"k8s: marks tests as requiring Kubernetes"
,
"fault_tolerance: marks tests as fault tolerance tests"
]
# Linting/formatting
...
...
tests/fault_tolerance/deploy/legacy_client.py
View file @
d1cf3c2c
...
...
@@ -274,7 +274,7 @@ def client(
)
# Log result
logger
.
info
(
logger
.
debug
(
f
"Request:
{
i
}
Pod
{
pod_name
}
Local Port
{
port
}
"
f
"Status:
{
result
[
'results'
][
-
1
][
'status'
]
}
"
f
"Latency:
{
result
[
'results'
][
-
1
][
'request_elapsed_time'
]
}
"
...
...
tests/fault_tolerance/deploy/test_deployment.py
View file @
d1cf3c2c
...
...
@@ -340,6 +340,8 @@ def results_summary():
logging
.
error
(
f
"Failed to parse combined results:
{
e
}
"
)
@
pytest
.
mark
.
k8s
@
pytest
.
mark
.
fault_tolerance
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
slow
@
pytest
.
mark
.
filterwarnings
(
"ignore::DeprecationWarning"
)
...
...
tests/utils/managed_deployment.py
View file @
d1cf3c2c
...
...
@@ -18,7 +18,22 @@ from kr8s.objects import Pod as kr8s_Pod
from
kr8s.objects
import
Service
as
kr8s_Service
from
kubernetes_asyncio
import
client
,
config
from
dynamo.common.utils.paths
import
get_workspace_dir
def _get_workspace_dir() -> str:
    """Locate the workspace root without importing the dynamo.common package.

    Resolving the path locally lets the tests run when the dynamo package
    is not installed.

    Returns:
        The nearest ancestor directory of this file (starting with the
        file's own directory) that contains a ``pyproject.toml``. When the
        walk reaches the filesystem root without a match, the directory two
        levels above this file's directory is returned instead.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))

    candidate = module_dir
    parent = os.path.dirname(candidate)
    # The filesystem root is its own parent; the walk ends there without
    # probing the root itself.
    while candidate != parent:
        # pyproject.toml marks the workspace root.
        marker = os.path.join(candidate, "pyproject.toml")
        if os.path.exists(marker):
            return candidate
        candidate, parent = parent, os.path.dirname(parent)

    # Fallback: this file is expected to live in tests/utils/, so the
    # workspace is two directories above the module's directory.
    return os.path.dirname(os.path.dirname(module_dir))
class
ServiceSpec
:
...
...
@@ -877,8 +892,8 @@ async def main():
datefmt
=
DATE_FORMAT
,
# ISO 8601 UTC format
)
# Get workspace directory
using centralized logic
workspace_dir
=
get_workspace_dir
()
# Get workspace directory
workspace_dir
=
_
get_workspace_dir
()
deployment_spec
=
DeploymentSpec
(
os
.
path
.
join
(
workspace_dir
,
"examples/backends/vllm/deploy/agg.yaml"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment