Unverified Commit d1cf3c2c authored by Indrajit Bhosale, committed by GitHub
Browse files

ci: Add Fault Tolerance K8s test (#3801)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent 3ff5fa13
...@@ -304,6 +304,145 @@ jobs: ...@@ -304,6 +304,145 @@ jobs:
test_type: "e2e, gpu_1" test_type: "e2e, gpu_1"
platform_arch: ${{ matrix.platform.arch }} platform_arch: ${{ matrix.platform.arch }}
# Fault-tolerance end-to-end job: for each framework in the matrix, install the
# dynamo-platform Helm chart into a per-run namespace, run the matching pytest
# fault-tolerance scenario against it, and always tear the namespace down.
deploy-test-fault-tolerance:
  runs-on: cpu-amd-m5-2xlarge
  if: needs.changed-files.outputs.has_code_changes == 'true'
  needs: [changed-files, operator, vllm, trtllm, sglang]
  permissions:
    contents: read
  strategy:
    fail-fast: false
    # Run matrix jobs sequentially to prevent a Helm race condition.
    # Parallel jobs conflict on ClusterRole ownership when installing the chart.
    # Error: ClusterRole "...-operator" exists... cannot be imported... current value is "...-ft-vllm"
    max-parallel: 1
    matrix:
      framework:
        - name: vllm
          test_scenario: vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
        - name: trtllm
          test_scenario: trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
        - name: sglang
          test_scenario: sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod
  name: deploy-test-fault-tolerance (${{ matrix.framework.name }})
  env:
    DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
  steps:
    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Set namespace
      env:
        # Secrets are handed to the script through the environment instead of
        # being spliced into the shell text, so their content is never parsed
        # as shell syntax.
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
      run: |
        # Set namespace using test scenario
        export FRAMEWORK=${{ matrix.framework.name }}
        # Values appended to $GITHUB_ENV only become visible in later steps,
        # so NAMESPACE is also exported here for the kubectl call below.
        export NAMESPACE="gh-job-id-${{ github.run_id }}-ft-${FRAMEWORK}"
        echo "NAMESPACE=${NAMESPACE}" >> "$GITHUB_ENV"
        set -x
        # Setup kubeconfig
        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
        kubectl config current-context
    - name: Deploy Operator
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
        ACR_USER: ${{ secrets.AZURE_ACR_USER }}
        ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
      run: |
        set -x
        export KUBECONFIG=$(pwd)/.kubeconfig
        # Create a namespace for this job
        echo "Creating an ephemeral namespace..."
        kubectl delete namespace "${NAMESPACE}" || true
        kubectl create namespace "${NAMESPACE}" || true
        echo "Attaching the labels for secrets and cleanup"
        kubectl label namespaces "${NAMESPACE}" \
          nscleanup/enabled=true nscleanup/ttl=7200 \
          gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled \
          --overwrite=true
        # Set the namespace as default
        kubectl config set-context --current --namespace="${NAMESPACE}"
        # Check if Istio is installed
        kubectl get pods -n istio-system
        # Check if default storage class exists
        kubectl get storageclass
        # Install Helm chart
        export VIRTUAL_ENV=/opt/dynamo/venv
        export KUBE_NS=$NAMESPACE
        export ISTIO_ENABLED=true
        export ISTIO_GATEWAY=istio-system/ingress-alb
        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
        export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX}
        # Install dynamo env secrets
        kubectl create secret generic hf-token-secret \
          --from-literal=HF_TOKEN="${HF_TOKEN}" -n "${KUBE_NS}" || true
        # Create docker pull secret for operator image
        kubectl create secret docker-registry docker-imagepullsecret \
          --docker-server="${ACR_HOSTNAME}" \
          --docker-username="${ACR_USER}" \
          --docker-password="${ACR_PASSWORD}" \
          --namespace="${NAMESPACE}"
        # Install helm dependencies
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/cloud/helm/platform/
        helm dep build .
        # Install platform with namespace restriction for single profile testing.
        # The --set arguments are quoted so "[0]" is never treated as a glob.
        helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
          --set "dynamo-operator.controllerManager.manager.image.repository=${ACR_HOSTNAME}/ai-dynamo/dynamo" \
          --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \
          --set "dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret" \
          --timeout 10m --wait
        # Wait for all deployments to be ready
        timeout 300s kubectl rollout status deployment -n "${NAMESPACE}" --watch
        cd -
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace="${NAMESPACE}"
    - name: Run Fault Tolerance Tests
      env:
        ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }}
      run: |
        set -x
        export KUBECONFIG=$(pwd)/.kubeconfig
        export NAMESPACE=$NAMESPACE
        export FRAMEWORK=${{ matrix.framework.name }}
        export IMAGE="${ACR_HOSTNAME}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64"
        echo "Running fault tolerance test: ${{ matrix.framework.test_scenario }}"
        echo "Using namespace: $NAMESPACE"
        echo "Using image: $IMAGE"
        # Install python3-venv package if not already installed
        sudo apt-get update && sudo apt-get install -y python3-venv
        # Set up Python virtual environment and install test dependencies
        python3 -m venv venv
        source venv/bin/activate
        pip install --upgrade pip
        pip install -r container/deps/requirements.test.txt
        pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic
        # Run the pytest command (tests orchestrate K8s, don't need dynamo package)
        pytest tests/fault_tolerance/deploy/test_deployment.py \
          -m 'k8s and fault_tolerance' \
          -k '${{ matrix.framework.test_scenario }}' \
          -s -v \
          --namespace "${NAMESPACE}" \
          --image "${IMAGE}" \
          --client-type legacy
    - name: Cleanup
      if: always()
      timeout-minutes: 5
      env:
        KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
      run: |
        # This step runs even when "Set namespace" failed; without a namespace
        # there is nothing of ours to delete, and an unscoped helm uninstall
        # would act on whatever namespace the kubeconfig defaults to.
        if [ -z "${NAMESPACE:-}" ]; then
          echo "NAMESPACE is not set; nothing to clean up."
          exit 0
        fi
        echo "${KUBECONFIG_B64}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
        # For debugging purposes, list all the resources before we uninstall.
        # Best-effort: a failed listing must not stop the teardown below.
        kubectl get all || true
        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
        kubectl delete dynamographdeployments --all -n "${NAMESPACE}" || true
        # Uninstall the helm chart
        helm ls || true
        helm uninstall dynamo-platform --namespace "${NAMESPACE}" || true
        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
        kubectl delete namespace "${NAMESPACE}" || true
        echo "Namespace $NAMESPACE completed."
# Upload metrics for this workflow and all its jobs # Upload metrics for this workflow and all its jobs
upload-workflow-metrics: upload-workflow-metrics:
name: Upload Workflow Metrics name: Upload Workflow Metrics
......
...@@ -200,7 +200,9 @@ markers = [ ...@@ -200,7 +200,9 @@ markers = [
"h100: marks tests to run on H100", "h100: marks tests to run on H100",
"kvbm: marks tests for KV behavior and model determinism", "kvbm: marks tests for KV behavior and model determinism",
"model: model id used by a test or parameter", "model: model id used by a test or parameter",
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)" "custom_build: marks tests that require custom builds or special setup (e.g., MoE models)",
"k8s: marks tests as requiring Kubernetes",
"fault_tolerance: marks tests as fault tolerance tests"
] ]
# Linting/formatting # Linting/formatting
......
...@@ -274,7 +274,7 @@ def client( ...@@ -274,7 +274,7 @@ def client(
) )
# Log result # Log result
logger.info( logger.debug(
f"Request: {i} Pod {pod_name} Local Port {port} " f"Request: {i} Pod {pod_name} Local Port {port} "
f"Status: {result['results'][-1]['status']} " f"Status: {result['results'][-1]['status']} "
f"Latency: {result['results'][-1]['request_elapsed_time']}" f"Latency: {result['results'][-1]['request_elapsed_time']}"
......
...@@ -340,6 +340,8 @@ def results_summary(): ...@@ -340,6 +340,8 @@ def results_summary():
logging.error(f"Failed to parse combined results: {e}") logging.error(f"Failed to parse combined results: {e}")
@pytest.mark.k8s
@pytest.mark.fault_tolerance
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.slow @pytest.mark.slow
@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.filterwarnings("ignore::DeprecationWarning")
......
...@@ -18,7 +18,22 @@ from kr8s.objects import Pod as kr8s_Pod ...@@ -18,7 +18,22 @@ from kr8s.objects import Pod as kr8s_Pod
from kr8s.objects import Service as kr8s_Service from kr8s.objects import Service as kr8s_Service
from kubernetes_asyncio import client, config from kubernetes_asyncio import client, config
from dynamo.common.utils.paths import get_workspace_dir
def _get_workspace_dir() -> str:
"""Get workspace directory without depending on dynamo.common package.
This allows tests to run without requiring dynamo package to be installed.
"""
# Start from this file's location and walk up to find workspace root
current = os.path.dirname(os.path.abspath(__file__))
while current != os.path.dirname(current): # Stop at filesystem root
# Workspace root has pyproject.toml
if os.path.exists(os.path.join(current, "pyproject.toml")):
return current
current = os.path.dirname(current)
# Fallback: assume workspace is 3 levels up from tests/utils/
return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
class ServiceSpec: class ServiceSpec:
...@@ -877,8 +892,8 @@ async def main(): ...@@ -877,8 +892,8 @@ async def main():
datefmt=DATE_FORMAT, # ISO 8601 UTC format datefmt=DATE_FORMAT, # ISO 8601 UTC format
) )
# Get workspace directory using centralized logic # Get workspace directory
workspace_dir = get_workspace_dir() workspace_dir = _get_workspace_dir()
deployment_spec = DeploymentSpec( deployment_spec = DeploymentSpec(
os.path.join(workspace_dir, "examples/backends/vllm/deploy/agg.yaml") os.path.join(workspace_dir, "examples/backends/vllm/deploy/agg.yaml")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment