Unverified commit aa3d2859, authored by Julien Mancuso and committed by GitHub
Browse files

feat: use vcluster in our CI to isolate dynamo operator deployments (#7478)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 062d3e6c
# Composite action: reports whether a named vCluster exists in a host namespace.
# Sets the `exists` output to "true" or "false"; a missing vCluster is not an error.
name: 'Check vCluster Exists'
description: 'Check whether a vCluster instance exists in the given namespace'
inputs:
  kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for host cluster access'
    required: true
  vcluster_name:
    description: 'Name of the vCluster to look for'
    required: true
  vcluster_namespace:
    description: 'Host namespace to check'
    required: true
outputs:
  exists:
    description: '"true" if the vCluster exists, "false" otherwise'
    value: ${{ steps.check.outputs.exists }}
runs:
  using: "composite"
  steps:
    - name: Install vCluster CLI
      uses: ./.github/actions/install-vcluster-cli
    - name: Check if vCluster exists
      id: check
      shell: bash
      env:
        # Inputs reach the script through the environment rather than being
        # expanded into the script text, so their content is never parsed as shell.
        KUBECONFIG_B64: ${{ inputs.kubeconfig_base64 }}
        VCLUSTER_NAME: ${{ inputs.vcluster_name }}
        NAMESPACE: ${{ inputs.vcluster_namespace }}
      run: |
        # Use a private temp file (mktemp creates it with mode 0600) and remove it
        # when the step exits; KUBECONFIG is only exported for this step.
        HOST_KUBECONFIG="$(mktemp)"
        trap 'rm -f "${HOST_KUBECONFIG}"' EXIT
        echo "${KUBECONFIG_B64}" | base64 -d > "${HOST_KUBECONFIG}"
        chmod 600 "${HOST_KUBECONFIG}"
        export KUBECONFIG="${HOST_KUBECONFIG}"
        RESULT=$(vcluster list --output json -n "${NAMESPACE}")
        # --arg hands the name to jq as data, so quotes or other special
        # characters in it cannot alter the filter. `arrays` drops non-array
        # output, which makes jq -e fail and selects the "not found" branch.
        if echo "$RESULT" | jq -e --arg name "${VCLUSTER_NAME}" 'arrays | map(select(.Name == $name)) | length > 0' &>/dev/null; then
          echo "exists=true" >> "$GITHUB_OUTPUT"
        else
          echo "exists=false" >> "$GITHUB_OUTPUT"
          echo "vCluster not found in namespace ${NAMESPACE}"
        fi
# Composite action: port-forwards the vCluster service to 127.0.0.1:8443, writes a
# kubeconfig for it to <workspace>/.kubeconfig-vcluster and exposes that kubeconfig
# base64-encoded as the `kubeconfig_base64` output. The host kubeconfig is written
# to <workspace>/.kubeconfig-host and exported as KUBECONFIG for later steps.
name: 'Connect to vCluster'
description: 'Establish a port-forward to a vCluster and output its kubeconfig as base64'
inputs:
  host_kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for host cluster access'
    required: true
  vcluster_name:
    description: 'Name of the vCluster to connect to'
    required: true
  vcluster_namespace:
    description: 'Host namespace where the vCluster is running'
    required: true
outputs:
  kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for the vCluster (points to 127.0.0.1:8443 via port-forward)'
    value: ${{ steps.connect.outputs.kubeconfig_base64 }}
runs:
  using: "composite"
  steps:
    - name: Setup host kubeconfig
      shell: bash
      env:
        # Passed via env so the secret is never expanded into the script text.
        HOST_KUBECONFIG_B64: ${{ inputs.host_kubeconfig_base64 }}
        HOST_KUBECONFIG_PATH: ${{ github.workspace }}/.kubeconfig-host
      run: |
        echo "${HOST_KUBECONFIG_B64}" | base64 -d > "${HOST_KUBECONFIG_PATH}"
        chmod 600 "${HOST_KUBECONFIG_PATH}"
        echo "KUBECONFIG=${HOST_KUBECONFIG_PATH}" >> "$GITHUB_ENV"
    - name: Install vCluster CLI
      uses: ./.github/actions/install-vcluster-cli
    - name: Connect to vCluster
      id: connect
      shell: bash
      env:
        VCLUSTER_NAME: ${{ inputs.vcluster_name }}
        NAMESPACE: ${{ inputs.vcluster_namespace }}
        VCLUSTER_KUBECONFIG: ${{ github.workspace }}/.kubeconfig-vcluster
      run: |
        echo "::group::Port-forward and generate vCluster kubeconfig"
        set -x
        # The port-forward is left running in the background so that the
        # generated kubeconfig (server 127.0.0.1:8443) stays usable afterwards.
        kubectl port-forward -n "${NAMESPACE}" "svc/${VCLUSTER_NAME}" 8443:443 &
        sleep 5
        vcluster connect "${VCLUSTER_NAME}" \
          --namespace "${NAMESPACE}" \
          --server=https://127.0.0.1:8443 \
          --print > "${VCLUSTER_KUBECONFIG}"
        # The file holds cluster credentials; restrict it like the host kubeconfig.
        chmod 600 "${VCLUSTER_KUBECONFIG}"
        echo "Verifying vCluster connectivity..."
        # The port-forward may need longer than the initial sleep to accept
        # connections, so retry the check a few times before giving up.
        ATTEMPTS=0
        until kubectl --kubeconfig="${VCLUSTER_KUBECONFIG}" get ns; do
          ATTEMPTS=$((ATTEMPTS + 1))
          if [ "${ATTEMPTS}" -ge 5 ]; then
            echo "vCluster API is not reachable through the port-forward" >&2
            exit 1
          fi
          sleep 2
        done
        # Disable tracing before touching the encoded kubeconfig: with xtrace on,
        # bash would print the full credential blob into the job log.
        set +x
        KUBECONFIG_B64=$(base64 -w 0 < "${VCLUSTER_KUBECONFIG}")
        echo "kubeconfig_base64=${KUBECONFIG_B64}" >> "$GITHUB_OUTPUT"
        echo "::endgroup::"
......@@ -10,10 +10,10 @@ inputs:
description: 'Kubernetes namespace for deployment'
required: true
registry:
description: 'Container registry hostname. Required for rerun self-bootstrap.'
description: 'Container registry hostname'
required: true
operator_tag:
description: 'Operator image tag (default: main-operator). Required for rerun self-bootstrap.'
description: 'Operator image tag (default: main-operator)'
required: false
default: 'main-operator'
hf_token:
......@@ -46,51 +46,11 @@ inputs:
runs:
using: "composite"
steps:
- name: Check if namespace exists
id: ns-check
shell: bash
env:
KUBECONFIG_B64: ${{ inputs.kubeconfig_base64 }}
NAMESPACE: ${{ inputs.namespace }}
FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }}
run: |
echo "::group::Check if namespace exists"
echo "${KUBECONFIG_B64}" | base64 -d > ${{ github.workspace }}/.kubeconfig_check
chmod 600 ${{ github.workspace }}/.kubeconfig_check
if KUBECONFIG=${{ github.workspace }}/.kubeconfig_check kubectl get namespace $NAMESPACE --ignore-not-found | grep -q $NAMESPACE; then
echo "exists=true" >> $GITHUB_OUTPUT
echo "ns=${NAMESPACE}" >> $GITHUB_OUTPUT
echo "Namespace $NAMESPACE exists, will reuse it"
else
echo "exists=false" >> $GITHUB_OUTPUT
# Use check_run_id (unique per job+retry) to keep namespace under k8s 63-char label limit
SELF_NS="${NAMESPACE}-${{ job.check_run_id }}"
SELF_NS="${SELF_NS:0:36}"
SELF_NS="${SELF_NS%-}"
echo "ns=${SELF_NS}" >> $GITHUB_OUTPUT
echo "Namespace $NAMESPACE not found, will self-bootstrap as ${SELF_NS}"
fi
rm -f ${{ github.workspace }}/.kubeconfig_check
echo "::endgroup::"
- name: Setup namespace (self-bootstrap on rerun)
if: steps.ns-check.outputs.exists != 'true'
uses: ./.github/actions/setup-deploy-namespace
with:
kubeconfig_base64: ${{ inputs.kubeconfig_base64 }}
namespace: ${{ steps.ns-check.outputs.ns }}
registry: ${{ inputs.registry }}
operator_tag: ${{ inputs.operator_tag }}
hf_token: ${{ inputs.hf_token }}
dockerhub_username: ${{ inputs.dockerhub_username }}
dockerhub_password: ${{ inputs.dockerhub_password }}
- name: Setup Kubeconfig
id: setup-kubeconfig
shell: bash
env:
NAMESPACE: ${{ steps.ns-check.outputs.ns }}
NAMESPACE: ${{ inputs.namespace }}
run: |
echo "${{ inputs.kubeconfig_base64 }}" | base64 -d > ${{ github.workspace }}/.kubeconfig
chmod 600 ${{ github.workspace }}/.kubeconfig
......@@ -117,7 +77,7 @@ runs:
shell: bash
env:
KUBECONFIG: ${{ github.workspace }}/.kubeconfig
NAMESPACE: ${{ steps.ns-check.outputs.ns }}
NAMESPACE: ${{ inputs.namespace }}
FRAMEWORK: ${{ inputs.framework }}
PROFILE: ${{ inputs.profile }}
IMAGE: ${{ inputs.image }}
......@@ -137,7 +97,7 @@ runs:
if: always()
shell: bash
env:
NAMESPACE: ${{ steps.ns-check.outputs.ns }}
NAMESPACE: ${{ inputs.namespace }}
GRAPH_NAME: ${{ steps.deploy.outputs.graph_name }}
run: |
echo "::group::Cleanup Deployment"
......@@ -163,13 +123,6 @@ runs:
fi
echo "::endgroup::"
- name: Teardown namespace (self-cleanup on rerun)
if: always() && steps.ns-check.outputs.exists != 'true'
uses: ./.github/actions/teardown-deploy-namespace
with:
kubeconfig_base64: ${{ inputs.kubeconfig_base64 }}
namespace: ${{ steps.ns-check.outputs.ns }}
- name: Upload Test Results
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f #v6
if: always()
......
# Composite action: installs the vCluster CLI into /usr/local/bin unless a
# `vcluster` binary is already on PATH (an existing binary is reused as-is,
# whatever its version).
name: 'Install vCluster CLI'
description: 'Download and install the vCluster CLI binary (architecture-aware)'
inputs:
  vcluster_version:
    description: 'vCluster CLI version to install'
    required: false
    default: 'v0.33.0'
runs:
  using: "composite"
  steps:
    - name: Install vCluster CLI
      shell: bash
      env:
        # Passed via env so the input is never expanded into the script text.
        VCLUSTER_VERSION: ${{ inputs.vcluster_version }}
      run: |
        echo "::group::Install vCluster CLI ${VCLUSTER_VERSION}"
        if command -v vcluster &>/dev/null; then
          echo "vCluster CLI already installed: $(vcluster version)"
        else
          # Map the kernel architecture name to the release asset suffix.
          ARCH=$(uname -m)
          case "${ARCH}" in
            x86_64) VCLUSTER_ARCH="amd64" ;;
            aarch64) VCLUSTER_ARCH="arm64" ;;
            *) echo "Unsupported architecture: ${ARCH}"; exit 1 ;;
          esac
          # --fail makes an HTTP error (e.g. 404 for an unknown version) abort the
          # step instead of saving the error page as the "binary"; --show-error
          # keeps the reason visible despite --silent; --retry covers transient
          # network failures.
          curl -fsSL --retry 3 -o /tmp/vcluster \
            "https://github.com/loft-sh/vcluster/releases/download/${VCLUSTER_VERSION}/vcluster-linux-${VCLUSTER_ARCH}"
          sudo mv /tmp/vcluster /usr/local/bin/vcluster
          sudo chmod +x /usr/local/bin/vcluster
          vcluster version
        fi
        echo "::endgroup::"
# Composite action: creates and labels a namespace on the cluster, optionally
# creates the HuggingFace token secret, installs the dynamo-platform Helm chart
# restricted to that namespace and waits for its deployments to roll out.
# On failure, a final step dumps pod status, descriptions, logs and events.
name: 'Setup Deploy Namespace'
description: 'Create a Kubernetes namespace and install the Dynamo platform operator via Helm'
inputs:
  kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for cluster access'
    required: true
  namespace:
    description: 'Target namespace name'
    required: true
  registry:
    description: 'Container registry hostname (e.g. myregistry.azurecr.io)'
    required: true
  operator_tag:
    description: 'Operator image tag (default: main-operator)'
    required: false
    default: 'main-operator'
  hf_token:
    description: 'HuggingFace token for model access'
    required: false
    default: ''
  dockerhub_username:
    description: 'Docker Hub username for image pull secrets'
    required: false
    default: ''
  dockerhub_password:
    description: 'Docker Hub password for image pull secrets'
    required: false
    default: ''
runs:
  using: "composite"
  steps:
    - name: Setup Kubeconfig
      shell: bash
      env:
        # Passed via env so the secret is never expanded into the script text.
        KUBECONFIG_B64: ${{ inputs.kubeconfig_base64 }}
        KUBECONFIG_PATH: ${{ github.workspace }}/.kubeconfig
      run: |
        echo "${KUBECONFIG_B64}" | base64 -d > "${KUBECONFIG_PATH}"
        chmod 600 "${KUBECONFIG_PATH}"
        # Exported for every later step of the job.
        echo "KUBECONFIG=${KUBECONFIG_PATH}" >> "$GITHUB_ENV"
    - name: Create namespace
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        echo "::group::Create namespace $NAMESPACE"
        set -x
        kubectl create namespace "${NAMESPACE}"
        echo "Attaching the labels for secrets and cleanup"
        kubectl label namespaces "${NAMESPACE}" \
          nscleanup/enabled=true \
          nscleanup/ttl=7200 \
          gitlab-imagepull=enabled \
          ngc-api=enabled \
          nvcr-imagepull=enabled \
          --overwrite=true
        # Set the context to the new namespace
        kubectl config set-context --current --namespace="${NAMESPACE}"
        # Check if Istio is installed
        kubectl get pods -n istio-system
        # Check if default storage class exists
        kubectl get storageclass
        echo "::endgroup::"
    - name: Create HF token secret
      if: inputs.hf_token != ''
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
        HF_TOKEN: ${{ inputs.hf_token }}
      run: |
        echo "::group::Create HF token secret"
        # Best effort: a failure here (e.g. the secret already exists) must not
        # abort the setup.
        kubectl create secret generic hf-token-secret \
          --from-literal=HF_TOKEN="${HF_TOKEN}" \
          -n "${NAMESPACE}" || true
        echo "::endgroup::"
    - name: Install Dynamo platform via Helm
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
        REGISTRY: ${{ inputs.registry }}
        OPERATOR_TAG: ${{ inputs.operator_tag }}
        DOCKERHUB_USERNAME: ${{ inputs.dockerhub_username }}
        DOCKERHUB_PASSWORD: ${{ inputs.dockerhub_password }}
      run: |
        echo "::group::Install Dynamo platform via Helm"
        # Log in before enabling xtrace and feed the password on stdin, so it
        # appears neither on the command line nor in the traced output.
        if [ -n "${DOCKERHUB_USERNAME}" ] && [ -n "${DOCKERHUB_PASSWORD}" ]; then
          echo "Logging into Docker Hub registry"
          echo "${DOCKERHUB_PASSWORD}" | helm registry login registry-1.docker.io -u "${DOCKERHUB_USERNAME}" --password-stdin
        fi
        set -x
        # Install Helm chart
        export VIRTUAL_ENV=/opt/dynamo/venv
        export KUBE_NS="${NAMESPACE}"
        export ISTIO_ENABLED=true
        export ISTIO_GATEWAY=istio-system/ingress-alb
        export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true
        OPERATOR_REPO="${REGISTRY}/ai-dynamo/dynamo"
        echo "Using operator image: ${OPERATOR_REPO}:${OPERATOR_TAG}"
        # Install helm dependencies
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/helm/charts/platform/
        helm dep build .
        # Install platform with namespace restriction for single profile testing.
        # CRDs are managed via Velonix, so CRD installation is skipped.
        # The allowedNamespaces[0] argument is quoted because unquoted square
        # brackets are a shell glob pattern.
        helm upgrade --install dynamo-platform . --namespace "${NAMESPACE}" \
          --skip-crds \
          --set dynamo-operator.namespaceRestriction.enabled=true \
          --set "dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE}" \
          --set dynamo-operator.controllerManager.manager.image.repository="${OPERATOR_REPO}" \
          --set dynamo-operator.controllerManager.manager.image.tag="${OPERATOR_TAG}" \
          --set dynamo-operator.gpuDiscovery.enabled=false \
          --set dynamo-operator.upgradeCRD=false \
          --debug
        echo "::endgroup::"
    - name: Wait for operator rollout
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        echo "::group::Wait for operator rollout"
        kubectl rollout status deployment -n "${NAMESPACE}" --watch --timeout=600s
        echo "::endgroup::"
    - name: Debug deployment failure
      if: failure()
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        # Diagnostics only: keep going even if individual commands fail.
        set +eo pipefail
        echo "### OPERATOR DEPLOYMENT FAILED" | tee -a "$GITHUB_STEP_SUMMARY"
        echo "::group::Pod status"
        kubectl get pods -n "${NAMESPACE}" -o wide
        echo "::endgroup::"
        # Splitting on "/" and spaces turns the READY column "x/y" into $2 and $3;
        # keep pods where x != y or whose status is neither Running nor Completed.
        NOT_READY=$(kubectl get pods -n "${NAMESPACE}" --no-headers | awk -F'[/ ]+' '$2 != $3 || ($4 != "Running" && $4 != "Completed")')
        if [ -n "$NOT_READY" ]; then
          echo "$NOT_READY" | awk '{print "- **" $1 "** | Ready: `" $2 "` | Status: `" $3 "`"}' >> "$GITHUB_STEP_SUMMARY"
          echo "$NOT_READY" | awk '{print $1}' | while read POD; do
            echo "::group::describe pod/$POD"
            kubectl describe pod "$POD" -n "${NAMESPACE}" 2>&1
            echo "::endgroup::"
            echo "::group::logs pod/$POD"
            kubectl logs "$POD" -n "${NAMESPACE}" --all-containers --tail=80 2>&1
            echo "::endgroup::"
          done
        fi
        echo "::group::Namespace events (recent)"
        kubectl get events -n "${NAMESPACE}" --sort-by='.lastTimestamp' 2>&1
        echo "::endgroup::"
# Composite action: creates (or upgrades) a vCluster in a labelled host namespace,
# connects to it through the connect-vcluster action, then installs the
# dynamo-platform Helm chart into the vCluster's `default` namespace and waits for
# the rollout. Resolved names and the operator tag are exposed as outputs.
# On failure, a final step dumps pod status, descriptions, logs and events.
name: 'Setup Dynamo Operator'
description: 'Create a vCluster, install the Dynamo platform operator inside it via Helm'
inputs:
  kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for host cluster access'
    required: true
  vcluster_name:
    description: 'Name for the vCluster instance (auto-generated from github.run_id if empty)'
    required: false
    default: ''
  vcluster_namespace:
    description: 'Host namespace where the vCluster will be created (auto-generated if empty)'
    required: false
    default: ''
  registry:
    description: 'Container registry hostname (e.g. myregistry.azurecr.io)'
    required: true
  operator_tag:
    description: 'Operator image tag (default: main-operator)'
    required: false
    default: 'main-operator'
  hf_token:
    description: 'HuggingFace token for model access'
    required: false
    default: ''
  dockerhub_username:
    description: 'Docker Hub username for helm registry login'
    required: false
    default: ''
  dockerhub_password:
    description: 'Docker Hub password for helm registry login'
    required: false
    default: ''
  vcluster_k8s_version:
    description: 'Kubernetes version for the vCluster control plane (must be supported by kr8s)'
    required: false
    default: 'v1.32.13'
outputs:
  namespace:
    description: 'Host namespace where the vCluster was created'
    value: ${{ steps.resolve-names.outputs.namespace }}
  vcluster_name:
    description: 'Name of the created vCluster'
    value: ${{ steps.resolve-names.outputs.vcluster_name }}
  operator_tag:
    description: 'Resolved operator tag'
    value: ${{ steps.resolve-names.outputs.operator_tag }}
runs:
  using: "composite"
  steps:
    - name: Resolve names
      id: resolve-names
      shell: bash
      env:
        # All context values are passed via env so they are never expanded into
        # the script text.
        INPUT_VCLUSTER_NAME: ${{ inputs.vcluster_name }}
        INPUT_NAMESPACE: ${{ inputs.vcluster_namespace }}
        INPUT_OPERATOR_TAG: ${{ inputs.operator_tag }}
        BRANCH: ${{ github.ref_name }}
        RUN_ID: ${{ github.run_id }}
      run: |
        if [ -n "${INPUT_VCLUSTER_NAME}" ]; then
          echo "vcluster_name=${INPUT_VCLUSTER_NAME}" >> "$GITHUB_OUTPUT"
        else
          echo "vcluster_name=ci-${RUN_ID}" >> "$GITHUB_OUTPUT"
        fi
        if [ -n "${INPUT_NAMESPACE}" ]; then
          echo "namespace=${INPUT_NAMESPACE}" >> "$GITHUB_OUTPUT"
        else
          # Namespace names must be lowercase alphanumerics or '-': lowercase the
          # branch, shorten the "pull-request" prefix, replace every other
          # character (dots, underscores, ...) with '-' and keep 10 characters.
          BRANCH_SANITIZED="${BRANCH,,}"
          BRANCH_SANITIZED="${BRANCH_SANITIZED//\//-}"
          BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
          BRANCH_SANITIZED="${BRANCH_SANITIZED//[^a-z0-9-]/-}"
          BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
          echo "namespace=gh-id-${RUN_ID}-${BRANCH_SANITIZED}-dt" >> "$GITHUB_OUTPUT"
        fi
        # Fall back to the documented default when the caller passes an empty tag
        # (e.g. an unset upstream job output).
        echo "operator_tag=${INPUT_OPERATOR_TAG:-main-operator}" >> "$GITHUB_OUTPUT"
    - name: Setup host kubeconfig
      shell: bash
      env:
        KUBECONFIG_B64: ${{ inputs.kubeconfig_base64 }}
        HOST_KUBECONFIG_PATH: ${{ github.workspace }}/.kubeconfig-host
      run: |
        echo "${KUBECONFIG_B64}" | base64 -d > "${HOST_KUBECONFIG_PATH}"
        chmod 600 "${HOST_KUBECONFIG_PATH}"
        # Exported for every later step of the job (host cluster access).
        echo "KUBECONFIG=${HOST_KUBECONFIG_PATH}" >> "$GITHUB_ENV"
    - name: Install vCluster CLI
      uses: ./.github/actions/install-vcluster-cli
    - name: Create host namespace
      shell: bash
      env:
        NAMESPACE: ${{ steps.resolve-names.outputs.namespace }}
      run: |
        echo "::group::Create host namespace $NAMESPACE"
        set -x
        # create --dry-run | apply keeps this step idempotent when the namespace
        # already exists.
        kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
        kubectl label namespaces "${NAMESPACE}" \
          nscleanup/enabled=true \
          nscleanup/ttl=7200 \
          ngc-api=enabled \
          nvcr-imagepull=enabled \
          --overwrite
        echo "::endgroup::"
    - name: Create vCluster
      shell: bash
      env:
        VCLUSTER_NAME: ${{ steps.resolve-names.outputs.vcluster_name }}
        NAMESPACE: ${{ steps.resolve-names.outputs.namespace }}
        K8S_VERSION: ${{ inputs.vcluster_k8s_version }}
      run: |
        echo "::group::Create vCluster ${VCLUSTER_NAME} in ${NAMESPACE}"
        set -x
        # The K8s version must be supported by the kr8s Python library used in deploy tests.
        # This is independent of the host cluster version.
        vcluster create "${VCLUSTER_NAME}" \
          --namespace "${NAMESPACE}" \
          --connect=false \
          --upgrade \
          --set controlPlane.distro.k8s.enabled=true \
          --set controlPlane.distro.k8s.version="${K8S_VERSION}"
        echo "::endgroup::"
    - name: Wait for vCluster pod to be ready
      shell: bash
      env:
        VCLUSTER_NAME: ${{ steps.resolve-names.outputs.vcluster_name }}
        NAMESPACE: ${{ steps.resolve-names.outputs.namespace }}
      run: |
        echo "::group::Wait for vCluster pod"
        kubectl wait --for=condition=ready pod \
          -l "app=vcluster,release=${VCLUSTER_NAME}" \
          -n "${NAMESPACE}" \
          --timeout=300s
        echo "::endgroup::"
    - name: Connect to vCluster
      id: connect-vcluster
      uses: ./.github/actions/connect-vcluster
      with:
        host_kubeconfig_base64: ${{ inputs.kubeconfig_base64 }}
        vcluster_name: ${{ steps.resolve-names.outputs.vcluster_name }}
        vcluster_namespace: ${{ steps.resolve-names.outputs.namespace }}
    - name: Create HF token secret
      if: inputs.hf_token != ''
      shell: bash
      env:
        HF_TOKEN: ${{ inputs.hf_token }}
        VCLUSTER_KUBECONFIG: ${{ github.workspace }}/.kubeconfig-vcluster
      run: |
        echo "::group::Create HF token secret inside vCluster"
        # Best effort: a failure here (e.g. the secret already exists) must not
        # abort the setup.
        kubectl --kubeconfig="${VCLUSTER_KUBECONFIG}" \
          create secret generic hf-token-secret \
          --from-literal=HF_TOKEN="${HF_TOKEN}" \
          -n default || true
        echo "::endgroup::"
    - name: Login to Docker Hub for Helm
      if: inputs.dockerhub_username != '' && inputs.dockerhub_password != ''
      shell: bash
      env:
        DOCKERHUB_USER: ${{ inputs.dockerhub_username }}
        DOCKERHUB_PASS: ${{ inputs.dockerhub_password }}
      run: |
        # Password is fed on stdin so it never appears on the command line.
        echo "${DOCKERHUB_PASS}" | helm registry login registry-1.docker.io -u "${DOCKERHUB_USER}" --password-stdin
    - name: Install Dynamo platform via Helm
      shell: bash
      env:
        REGISTRY: ${{ inputs.registry }}
        # Use the resolved tag so an empty input still yields a valid image tag.
        OPERATOR_TAG: ${{ steps.resolve-names.outputs.operator_tag }}
        VCLUSTER_KUBECONFIG: ${{ github.workspace }}/.kubeconfig-vcluster
      run: |
        echo "::group::Install Dynamo platform via Helm (inside vCluster)"
        set -x
        OPERATOR_REPO="${REGISTRY}/ai-dynamo/dynamo"
        echo "Using operator image: ${OPERATOR_REPO}:${OPERATOR_TAG}"
        helm repo add bitnami https://charts.bitnami.com/bitnami
        cd deploy/helm/charts/platform/
        helm dep build .
        # KUBECONFIG is overridden for this command only, so the release goes
        # into the vCluster rather than the host cluster.
        KUBECONFIG="${VCLUSTER_KUBECONFIG}" \
          helm upgrade --install dynamo-platform . --namespace default \
          --set dynamo-operator.controllerManager.manager.image.repository="${OPERATOR_REPO}" \
          --set dynamo-operator.controllerManager.manager.image.tag="${OPERATOR_TAG}" \
          --set dynamo-operator.gpuDiscovery.enabled=false \
          --set global.grove.install=true \
          --set global.kai-scheduler.install=false \
          --debug
        echo "::endgroup::"
    - name: Wait for operator rollout
      shell: bash
      env:
        VCLUSTER_KUBECONFIG: ${{ github.workspace }}/.kubeconfig-vcluster
      run: |
        echo "::group::Wait for operator rollout inside vCluster"
        kubectl --kubeconfig="${VCLUSTER_KUBECONFIG}" \
          rollout status deployment -n default --watch --timeout=600s
        echo "::endgroup::"
    - name: Debug deployment failure
      if: failure()
      shell: bash
      env:
        NAMESPACE: ${{ steps.resolve-names.outputs.namespace }}
        VKUBECONFIG: ${{ github.workspace }}/.kubeconfig-vcluster
      run: |
        # Diagnostics only: keep going even if individual commands fail.
        set +eo pipefail
        echo "### VCLUSTER OPERATOR DEPLOYMENT FAILED" | tee -a "$GITHUB_STEP_SUMMARY"
        echo "::group::Pod status (vCluster)"
        kubectl --kubeconfig="${VKUBECONFIG}" get pods -A -o wide 2>&1 || true
        echo "::endgroup::"
        echo "::group::Pod status (host namespace)"
        kubectl get pods -n "${NAMESPACE}" -o wide 2>&1 || true
        echo "::endgroup::"
        # Splitting on "/" and spaces turns the READY column "x/y" into $2 and $3;
        # keep pods where x != y or whose status is neither Running nor Completed.
        NOT_READY=$(kubectl --kubeconfig="${VKUBECONFIG}" get pods -n default --no-headers 2>/dev/null \
          | awk -F'[/ ]+' '$2 != $3 || ($4 != "Running" && $4 != "Completed")')
        if [ -n "$NOT_READY" ]; then
          echo "$NOT_READY" | awk '{print "- **" $1 "** | Ready: `" $2 "` | Status: `" $3 "`"}' >> "$GITHUB_STEP_SUMMARY"
          echo "$NOT_READY" | awk '{print $1}' | while read POD; do
            echo "::group::describe pod/$POD"
            kubectl --kubeconfig="${VKUBECONFIG}" describe pod "$POD" -n default 2>&1
            echo "::endgroup::"
            echo "::group::logs pod/$POD"
            kubectl --kubeconfig="${VKUBECONFIG}" logs "$POD" -n default --all-containers --tail=80 2>&1
            echo "::endgroup::"
          done
        fi
        echo "::group::Events (vCluster default namespace)"
        kubectl --kubeconfig="${VKUBECONFIG}" get events -n default --sort-by='.lastTimestamp' 2>&1
        echo "::endgroup::"
# Composite action: lists what is left in a namespace, deletes its
# DynamoGraphDeployments, uninstalls the dynamo-platform Helm release if present
# and finally deletes the namespace itself.
name: 'Teardown Deploy Namespace'
description: 'Clean up Dynamo platform resources and delete the Kubernetes namespace'
inputs:
  kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for cluster access'
    required: true
  namespace:
    description: 'Namespace to tear down'
    required: true
runs:
  using: "composite"
  steps:
    - name: Setup Kubeconfig
      shell: bash
      env:
        # Passed via env so the secret is never expanded into the script text.
        KUBECONFIG_B64: ${{ inputs.kubeconfig_base64 }}
        KUBECONFIG_PATH: ${{ github.workspace }}/.kubeconfig
      run: |
        echo "${KUBECONFIG_B64}" | base64 -d > "${KUBECONFIG_PATH}"
        chmod 600 "${KUBECONFIG_PATH}"
        # Exported for every later step of the job.
        echo "KUBECONFIG=${KUBECONFIG_PATH}" >> "$GITHUB_ENV"
    - name: Debug - List resources
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        echo "::group::List resources in namespace $NAMESPACE"
        # Informational only; listing failures must not block the teardown.
        kubectl get dynamographdeployments -n "${NAMESPACE}" || true
        kubectl get all -n "${NAMESPACE}" || true
        echo "::endgroup::"
    - name: Delete stale DynamoGraphDeployments
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        echo "::group::Delete stale DynamoGraphDeployments"
        kubectl delete dynamographdeployments --all -n "${NAMESPACE}" --timeout=60s
        echo "::endgroup::"
    - name: Uninstall Helm chart
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        echo "::group::Uninstall Helm chart"
        # Only uninstall when the release exists, so a partial setup tears down cleanly.
        if helm status dynamo-platform --namespace "${NAMESPACE}" &>/dev/null; then
          helm uninstall dynamo-platform --namespace "${NAMESPACE}" --timeout 10m
        else
          echo "Helm release dynamo-platform not found, skipping"
        fi
        echo "::endgroup::"
    - name: Delete namespace
      shell: bash
      env:
        NAMESPACE: ${{ inputs.namespace }}
      run: |
        echo "::group::Delete namespace $NAMESPACE"
        # --ignore-not-found keeps the teardown idempotent when the namespace has
        # already been removed (e.g. by a previous attempt).
        kubectl delete namespace "${NAMESPACE}" --timeout=120s --ignore-not-found
        echo "::endgroup::"
# Composite action: best-effort removal of a vCluster and of the host namespace
# it lives in. Neither deletion fails the step, so cleanup never fails the job.
name: 'Teardown Dynamo Operator'
description: 'Delete a vCluster and its host namespace'
inputs:
  kubeconfig_base64:
    description: 'Base64-encoded kubeconfig for host cluster access'
    required: true
  vcluster_name:
    description: 'Name of the vCluster to delete'
    required: true
  vcluster_namespace:
    description: 'Host namespace where the vCluster is running'
    required: true
runs:
  using: "composite"
  steps:
    - name: Setup host kubeconfig
      shell: bash
      env:
        # Passed via env so the secret is never expanded into the script text.
        KUBECONFIG_B64: ${{ inputs.kubeconfig_base64 }}
        KUBECONFIG_PATH: ${{ github.workspace }}/.kubeconfig
      run: |
        echo "${KUBECONFIG_B64}" | base64 -d > "${KUBECONFIG_PATH}"
        chmod 600 "${KUBECONFIG_PATH}"
        # Exported for every later step of the job.
        echo "KUBECONFIG=${KUBECONFIG_PATH}" >> "$GITHUB_ENV"
    - name: Install vCluster CLI
      uses: ./.github/actions/install-vcluster-cli
    - name: Delete vCluster
      shell: bash
      env:
        VCLUSTER_NAME: ${{ inputs.vcluster_name }}
        NAMESPACE: ${{ inputs.vcluster_namespace }}
      run: |
        echo "::group::Delete vCluster ${VCLUSTER_NAME}"
        # Best effort: continue to the namespace deletion even if this fails.
        vcluster delete "${VCLUSTER_NAME}" --namespace "${NAMESPACE}" || true
        echo "::endgroup::"
    - name: Delete host namespace
      shell: bash
      env:
        NAMESPACE: ${{ inputs.vcluster_namespace }}
      run: |
        echo "::group::Delete namespace $NAMESPACE"
        # Best effort: a missing namespace or a timeout does not fail the step.
        kubectl delete namespace "${NAMESPACE}" --timeout=120s || true
        echo "::endgroup::"
......@@ -297,161 +297,74 @@ jobs:
# ============================================================================
deploy-operator:
runs-on: prod-default-small-v2
needs: [operator]
runs-on: prod-default-small-v2
outputs:
NAMESPACE: ${{ steps.namespace.outputs.namespace }}
OPERATOR_TAG: ${{ steps.operator-tag.outputs.tag }}
namespace: ${{ steps.setup.outputs.namespace }}
vcluster_name: ${{ steps.setup.outputs.vcluster_name }}
operator_tag: ${{ steps.setup.outputs.operator_tag }}
steps:
- uses: actions/checkout@v4
- name: Determine operator tag
id: operator-tag
run: |
if [ "${{ needs.operator.result }}" == "success" ]; then
TAG="${{ needs.operator.outputs.operator_default_tag }}"
else
TAG="main-operator"
fi
echo "tag=${TAG}" >> $GITHUB_OUTPUT
echo "Using operator tag: ${TAG}"
- name: Generate namespace name
id: namespace
env:
BRANCH: ${{ github.ref_name }}
run: |
# Sanitize branch name for k8s namespace
# Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
BRANCH_SANITIZED="${BRANCH//\//-}"
BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
- name: Setup namespace and operator
uses: ./.github/actions/setup-deploy-namespace
- name: Setup vCluster and operator
id: setup
uses: ./.github/actions/setup-dynamo-operator
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ steps.namespace.outputs.namespace }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ steps.operator-tag.outputs.tag }}
operator_tag: ${{ needs.operator.outputs.operator_default_tag }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
# ============================================================================
# End-to-end tests for each framework with various deployment profiles
# ============================================================================
deploy-test-vllm:
runs-on: prod-default-small-v2
needs: [deploy-operator, vllm-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 2
matrix:
profile:
- agg
- agg_router
- disagg
- disagg_router
name: deploy-test-vllm (${{ matrix.profile }})
env:
FRAMEWORK: vllm
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Run Dynamo Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
uses: ./.github/workflows/shared-deploy-test-framework.yml
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
framework: ${{ env.FRAMEWORK }}
profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-runtime-cuda12-amd64
platform_arch: amd64
framework: vllm
profiles: '["agg", "agg_router", "disagg", "disagg_router"]'
image_suffix: vllm-runtime-cuda12-amd64
namespace: ${{ needs.deploy-operator.outputs.namespace }}
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
secrets: inherit
deploy-test-sglang:
runs-on: prod-default-small-v2
needs: [deploy-operator, sglang-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 2
matrix:
profile:
- agg
- agg_router
name: deploy-test-sglang (${{ matrix.profile }})
env:
FRAMEWORK: sglang
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Run Dynamo Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
uses: ./.github/workflows/shared-deploy-test-framework.yml
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
framework: ${{ env.FRAMEWORK }}
profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-runtime-cuda12-amd64
platform_arch: amd64
framework: sglang
profiles: '["agg", "agg_router"]'
image_suffix: sglang-runtime-cuda12-amd64
namespace: ${{ needs.deploy-operator.outputs.namespace }}
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
secrets: inherit
deploy-test-trtllm:
runs-on: prod-default-small-v2
needs: [deploy-operator, trtllm-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 2
matrix:
profile:
- agg
- agg_router
# Disabled: trtllm disagg profiles consistently timeout (~32 min) with 0% success rate.
# Re-enable once the underlying disagg deployment issue is resolved.
# - disagg
# - disagg_router
name: deploy-test-trtllm (${{ matrix.profile }})
env:
FRAMEWORK: trtllm
uses: ./.github/workflows/shared-deploy-test-framework.yml
with:
framework: trtllm
profiles: '["agg", "agg_router"]'
image_suffix: trtllm-runtime-cuda13-amd64
namespace: ${{ needs.deploy-operator.outputs.namespace }}
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
secrets: inherit
deploy-cleanup:
if: always()
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
runs-on: prod-default-small-v2
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Run Dynamo Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
- uses: actions/checkout@v4
- name: Teardown vCluster
if: needs.deploy-operator.outputs.namespace != '' && needs.deploy-operator.outputs.vcluster_name != ''
uses: ./.github/actions/teardown-dynamo-operator
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
framework: ${{ env.FRAMEWORK }}
profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-runtime-cuda13-amd64
platform_arch: amd64
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
deploy-status-check:
runs-on: ubuntu-latest
......@@ -460,7 +373,7 @@ jobs:
steps:
- name: "Check all deploy test jobs"
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped", "cancelled"] | any($result == .))'
# ============================================================================
# CLEANUP JOBS
......@@ -486,21 +399,6 @@ jobs:
run: |
docker buildx rm b-${{ github.run_id }}-${{ github.run_attempt }} || true
cleanup:
name: Cleanup AKS resources
runs-on: prod-default-small-v2
if: always()
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Teardown namespace
if: needs.deploy-operator.outputs.NAMESPACE != ''
uses: ./.github/actions/teardown-deploy-namespace
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
############################## SLACK NOTIFICATION ##############################
notify-slack:
name: Notify Slack
......
......@@ -73,7 +73,7 @@ jobs:
steps:
- name: "Check all deploy test jobs"
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped", "cancelled"] | any($result == .))'
# ============================================================================
......@@ -306,183 +306,96 @@ jobs:
# ============================================================================
# deploy-operator: prepares the environment the deploy-test jobs run against and
# exports its identifiers as job outputs. It runs when at least one of the
# vllm/sglang/trtllm/deploy change flags is 'true' and the `operator` job
# either succeeded or was skipped.
# NOTE(review): this span mixes two versions of the job: two `runs-on` keys, two
# opening lines in the `if:` expression, the NAMESPACE/OPERATOR_TAG outputs next
# to namespace/vcluster_name/operator_tag, and two setup steps. It appears to be
# a diff rendered without +/- markers; confirm which lines survive against the
# raw diff.
deploy-operator:
runs-on: prod-default-small-v2
# Run when any deploy test will run: if any framework or deploy files changed
if: |
!cancelled() &&
!cancelled() && !failure() &&
(needs.changed-files.outputs.vllm == 'true' ||
needs.changed-files.outputs.sglang == 'true' ||
needs.changed-files.outputs.trtllm == 'true' ||
needs.changed-files.outputs.deploy == 'true') &&
(needs.operator.result == 'success' || needs.operator.result == 'skipped')
needs: [changed-files, operator]
runs-on: prod-default-small-v2
# The upper-case outputs read from the `namespace` and `operator-tag` steps;
# the lower-case outputs all read from the single `setup` step.
outputs:
NAMESPACE: ${{ steps.namespace.outputs.namespace }}
OPERATOR_TAG: ${{ steps.operator-tag.outputs.tag }}
namespace: ${{ steps.setup.outputs.namespace }}
vcluster_name: ${{ steps.setup.outputs.vcluster_name }}
operator_tag: ${{ steps.setup.outputs.operator_tag }}
steps:
- uses: actions/checkout@v4
# Uses the tag produced by the `operator` job when that job succeeded,
# otherwise falls back to the fixed tag "main-operator".
- name: Determine operator tag
id: operator-tag
run: |
if [ "${{ needs.operator.result }}" == "success" ]; then
TAG="${{ needs.operator.outputs.operator_default_tag }}"
else
TAG="main-operator"
fi
echo "tag=${TAG}" >> $GITHUB_OUTPUT
echo "Using operator tag: ${TAG}"
# Builds a per-run namespace name from the run id and a sanitised branch name
# ('/' and '.' become '-', "pull-request" becomes "pr", truncated to 10 chars).
- name: Generate namespace name
id: namespace
env:
BRANCH: ${{ github.ref_name }}
run: |
# Sanitize branch name for k8s namespace
# Invalid patterns: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
BRANCH_SANITIZED="${BRANCH//\//-}"
BRANCH_SANITIZED="${BRANCH_SANITIZED/pull-request/pr}"
BRANCH_SANITIZED="${BRANCH_SANITIZED//./-}"
BRANCH_SANITIZED="${BRANCH_SANITIZED:0:10}"
NAMESPACE="gh-id-${{ github.run_id }}-${BRANCH_SANITIZED}-dt"
echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
# NOTE(review): two setup steps follow back to back. The first calls the
# setup-deploy-namespace action; the second (id `setup`, which the lower-case
# job outputs read from) calls setup-dynamo-operator. Presumably the first is
# the pre-change step and the second its replacement.
- name: Setup namespace and operator
uses: ./.github/actions/setup-deploy-namespace
- name: Setup vCluster and operator
id: setup
uses: ./.github/actions/setup-dynamo-operator
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ steps.namespace.outputs.namespace }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
# NOTE(review): `operator_tag` appears twice. The first entry reads the
# `operator-tag` step output; the second selects between the operator job's tag
# and 'main-operator' inline in an expression.
operator_tag: ${{ steps.operator-tag.outputs.tag }}
operator_tag: ${{ needs.operator.result == 'success' && needs.operator.outputs.operator_default_tag || 'main-operator' }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
# ============================================================================
#
# End-to-end tests for each framework with various deployment profiles
# ============================================================================
# deploy-test-vllm: deploy tests for the vllm framework over the profiles
# agg, agg_router, disagg and disagg_router.
# NOTE(review): two shapes of this job are interleaved here: a matrix job with
# its own steps calling the dynamo-deploy-test action, and a call to the
# reusable workflow shared-deploy-test-framework.yml. This appears to be a diff
# rendered without +/- markers; confirm which lines survive against the raw diff.
deploy-test-vllm:
# !cancelled() && !failure() is required because reusable workflows with skipped
# internal jobs (e.g. multi-gpu tests) propagate non-success through `needs`,
# auto-skipping dependents. See: https://github.com/orgs/community/discussions/189172
if: |
!cancelled() && !failure() &&
(needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true') &&
(needs.vllm-pipeline.result == 'success')
runs-on: prod-default-small-v2
needs.deploy-operator.result == 'success' &&
needs.vllm-pipeline.result == 'success'
needs: [changed-files, deploy-operator, vllm-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 2
matrix:
profile:
- agg
- agg_router
- disagg
- disagg_router
name: deploy-test-vllm (${{ matrix.profile }})
env:
FRAMEWORK: vllm
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
# NOTE(review): two `uses:` lines follow. The first targets the composite
# action, the second the reusable workflow.
- name: Run Dynamo Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
uses: ./.github/workflows/shared-deploy-test-framework.yml
with:
# Entries from `kubeconfig_base64` through `platform_arch` reference
# matrix.profile and env.FRAMEWORK, which only the matrix form of the job defines.
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
framework: ${{ env.FRAMEWORK }}
profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-vllm-runtime-cuda12-amd64
platform_arch: amd64
# The remaining entries use the input names of the reusable workflow
# (framework, profiles, image_suffix, namespace, vcluster_name, operator_tag).
framework: vllm
profiles: '["agg", "agg_router", "disagg", "disagg_router"]'
image_suffix: vllm-runtime-cuda12-amd64
namespace: ${{ needs.deploy-operator.outputs.namespace }}
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
secrets: inherit
# deploy-test-sglang: deploy tests for the sglang framework over the profiles
# agg and agg_router.
# NOTE(review): two shapes of this job are interleaved here: a matrix job with
# its own steps calling the dynamo-deploy-test action, and a call to the
# reusable workflow shared-deploy-test-framework.yml. This appears to be a diff
# rendered without +/- markers; confirm which lines survive against the raw diff.
deploy-test-sglang:
runs-on: prod-default-small-v2
if: |
!cancelled() && !failure() &&
(needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true') &&
(needs.sglang-pipeline.result == 'success')
needs.deploy-operator.result == 'success' &&
needs.sglang-pipeline.result == 'success'
needs: [changed-files, deploy-operator, sglang-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 2
matrix:
profile:
- agg
- agg_router
name: deploy-test-sglang (${{ matrix.profile }})
env:
FRAMEWORK: sglang
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
# NOTE(review): two `uses:` lines follow. The first targets the composite
# action, the second the reusable workflow.
- name: Run Dynamo Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
uses: ./.github/workflows/shared-deploy-test-framework.yml
with:
# Entries from `kubeconfig_base64` through `platform_arch` reference
# matrix.profile and env.FRAMEWORK, which only the matrix form of the job defines.
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
framework: ${{ env.FRAMEWORK }}
profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-sglang-runtime-cuda12-amd64
platform_arch: amd64
# The remaining entries use the input names of the reusable workflow
# (framework, profiles, image_suffix, namespace, vcluster_name, operator_tag).
framework: sglang
profiles: '["agg", "agg_router"]'
image_suffix: sglang-runtime-cuda12-amd64
namespace: ${{ needs.deploy-operator.outputs.namespace }}
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
secrets: inherit
# deploy-test-trtllm: deploy tests for the trtllm framework over the profiles
# agg and agg_router, followed by the deploy-cleanup job.
# NOTE(review): two versions are interleaved here and the interleaving crosses
# the job boundary. Lines that reference matrix.profile / env.FRAMEWORK and the
# trtllm image belong to a matrix form of deploy-test-trtllm, yet some of them
# appear after the `deploy-cleanup:` key. This appears to be a diff rendered
# without +/- markers; confirm which lines survive against the raw diff.
deploy-test-trtllm:
runs-on: prod-default-small-v2
if: |
!cancelled() && !failure() &&
(needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true') &&
(needs.trtllm-pipeline.result == 'success')
needs.deploy-operator.result == 'success' &&
needs.trtllm-pipeline.result == 'success'
needs: [changed-files, deploy-operator, trtllm-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 2
matrix:
profile:
- agg
- agg_router
# Disabled: trtllm disagg profiles consistently timeout (~32 min) with 0% success rate.
# Re-enable once the underlying disagg deployment issue is resolved.
# - disagg
# - disagg_router
name: deploy-test-trtllm (${{ matrix.profile }})
env:
FRAMEWORK: trtllm
# Reusable-workflow form of the job: same two profiles, passed as a JSON string.
uses: ./.github/workflows/shared-deploy-test-framework.yml
with:
framework: trtllm
profiles: '["agg", "agg_router"]'
image_suffix: trtllm-runtime-cuda13-amd64
namespace: ${{ needs.deploy-operator.outputs.namespace }}
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
operator_tag: ${{ needs.deploy-operator.outputs.operator_tag }}
secrets: inherit
# deploy-cleanup: waits for deploy-operator and all three deploy-test jobs, runs
# regardless of their outcome (`if: always()`), and calls the
# teardown-dynamo-operator action.
deploy-cleanup:
if: always()
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
runs-on: prod-default-small-v2
steps:
# NOTE(review): the "Checkout code" / "Run Dynamo Deploy Test" lines below call
# the dynamo-deploy-test action; presumably they are leftover steps of the
# matrix form of deploy-test-trtllm rather than part of deploy-cleanup.
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
- name: Run Dynamo Deploy Test
id: deploy-test
uses: ./.github/actions/dynamo-deploy-test
- uses: actions/checkout@v4
# Only runs when deploy-operator exported both a namespace and a vCluster name.
- name: Teardown vCluster
if: needs.deploy-operator.outputs.namespace != '' && needs.deploy-operator.outputs.vcluster_name != ''
uses: ./.github/actions/teardown-dynamo-operator
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# NOTE(review): entries from `namespace` through `platform_arch` match the
# inputs the other deploy-test jobs pass to dynamo-deploy-test; presumably only
# `kubeconfig_base64`, `vcluster_name` and `vcluster_namespace` belong to the
# teardown step.
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.deploy-operator.outputs.OPERATOR_TAG }}
hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
framework: ${{ env.FRAMEWORK }}
profile: ${{ matrix.profile }}
image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-trtllm-runtime-cuda13-amd64
platform_arch: amd64
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
# ============================================================================
# CLEANUP JOBS
......@@ -508,19 +421,3 @@ jobs:
shell: bash
run: |
docker buildx rm ${{ needs.changed-files.outputs.builder_name }} || true
# Cleanup job: waits for the operator deployment and all three framework deploy
# tests, runs regardless of their outcome (`if: always()`), and tears down the
# namespace reported by deploy-operator.
# NOTE(review): the enclosing hunk header (-508,19 +421,3) suggests this whole
# job is removed by the commit; confirm against the raw diff.
cleanup:
name: Cleanup AKS resources
runs-on: prod-default-small-v2
if: always()
needs: [deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm]
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
# Only runs when deploy-operator exported a non-empty NAMESPACE output.
- name: Teardown namespace
if: needs.deploy-operator.outputs.NAMESPACE != ''
uses: ./.github/actions/teardown-deploy-namespace
with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
namespace: ${{ needs.deploy-operator.outputs.NAMESPACE }}
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Deploy Test Framework

# Reusable workflow: runs the Dynamo deploy test for a single framework, one
# matrix job per requested profile, against a vCluster in the host cluster.
# Every `secrets.*` value used below comes from the caller; no secrets are
# declared under `workflow_call`, so callers must forward them
# (e.g. with `secrets: inherit`).
on:
  workflow_call:
    inputs:
      framework:
        description: 'Framework to test (vllm, sglang, trtllm)'
        type: string
        required: true
      profiles:
        description: 'JSON array of deployment profiles to test'
        type: string
        required: true
      image_suffix:
        description: 'Image suffix (e.g. vllm-runtime-cuda12-amd64)'
        type: string
        required: true
      namespace:
        description: 'Host namespace where the vCluster lives'
        type: string
        required: true
      vcluster_name:
        description: 'Name of the vCluster'
        type: string
        required: true
      operator_tag:
        description: 'Operator image tag'
        type: string
        required: true

jobs:
  deploy-test:
    runs-on: prod-default-small-v2
    timeout-minutes: 25
    permissions:
      contents: read
    strategy:
      # Let the remaining profiles finish when one fails; at most two run at once.
      fail-fast: false
      max-parallel: 2
      matrix:
        # `profiles` is passed as a JSON string; fromJSON turns it into the list.
        profile: ${{ fromJSON(inputs.profiles) }}
    name: deploy-test-${{ inputs.framework }} (${{ matrix.profile }})
    steps:
      - name: Checkout code
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0

      # Ask the host cluster whether the vCluster is present; the result is
      # exposed as the `exists` output ("true" / "false").
      - name: Check if vCluster exists
        id: vcluster-check
        uses: ./.github/actions/check-vcluster-exists
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          vcluster_name: ${{ inputs.vcluster_name }}
          vcluster_namespace: ${{ inputs.namespace }}

      # Run the operator setup under the same vCluster name and namespace when
      # the check did not report the vCluster (the step name indicates this
      # targets job reruns).
      - name: Self-bootstrap vCluster (rerun)
        if: steps.vcluster-check.outputs.exists != 'true'
        uses: ./.github/actions/setup-dynamo-operator
        with:
          kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          vcluster_name: ${{ inputs.vcluster_name }}
          vcluster_namespace: ${{ inputs.namespace }}
          registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          operator_tag: ${{ inputs.operator_tag }}
          hf_token: ${{ secrets.HF_TOKEN }}
          dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
          dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}

      # Produces the vCluster kubeconfig (base64) consumed by the test step below.
      - name: Connect to vCluster
        id: connect-vcluster
        uses: ./.github/actions/connect-vcluster
        with:
          host_kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
          vcluster_name: ${{ inputs.vcluster_name }}
          vcluster_namespace: ${{ inputs.namespace }}

      # The test uses the vCluster kubeconfig and the `default` namespace,
      # not the host namespace passed in as an input.
      - name: Run Dynamo Deploy Test
        id: deploy-test
        uses: ./.github/actions/dynamo-deploy-test
        with:
          kubeconfig_base64: ${{ steps.connect-vcluster.outputs.kubeconfig_base64 }}
          namespace: default
          registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          operator_tag: ${{ inputs.operator_tag }}
          hf_token: ${{ secrets.HF_TOKEN }}
          framework: ${{ inputs.framework }}
          profile: ${{ matrix.profile }}
          image: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${{ inputs.image_suffix }}
          platform_arch: amd64
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment