Unverified commit 9d765839, authored by Dillon Cullinan, committed by GitHub
Browse files

ci: OPS-1745: Deploy operator once for deploy tests (#4022)


Signed-off-by: Dillon Cullinan <dcullinan@nvidia.com>
Signed-off-by: Dillon Cullinan <dcullinan92@gmail.com>
parent b73e6eb5
...@@ -341,51 +341,38 @@ jobs: ...@@ -341,51 +341,38 @@ jobs:
# Upload complete workflow metrics including container metrics # Upload complete workflow metrics including container metrics
python3 .github/workflows/upload_complete_workflow_metrics.py python3 .github/workflows/upload_complete_workflow_metrics.py
deploy-test-vllm: deploy-operator:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true' if: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, operator, vllm] needs: [changed-files, operator]
permissions:
contents: read
strategy:
fail-fast: false
matrix:
profile:
- agg
- agg_router
- disagg
- disagg_router
name: deploy-test-vllm (${{ matrix.profile }})
env: env:
FRAMEWORK: vllm
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml" outputs:
MODEL_NAME: "Qwen/Qwen3-0.6B" NAMESPACE: ${{ steps.deploy-operator-step.outputs.namespace }}
steps: &deploy-test-steps steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Set namespace - name: Deploy Operator
id: deploy-operator-step
env:
BRANCH: ${{ github.ref_name }}
run: | run: |
# Set namespace using FRAMEWORK env var
PROFILE_SANITIZED="${{ matrix.profile }}"
PROFILE_SANITIZED="${PROFILE_SANITIZED//_/-}"
echo "NAMESPACE=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED}" >> $GITHUB_ENV
set -x set -x
# Set namespace using branch
BRANCH_SANITIZED="${BRANCH/\//-}"
NAMESPACE="gh-job-id-${{ github.run_id }}-${BRANCH_SANITIZED}-deploy-tests"
echo "namespace=${NAMESPACE}" >> "$GITHUB_OUTPUT"
# Setup kubeconfig # Setup kubeconfig
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}" kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
kubectl config current-context kubectl config current-context
- name: Deploy Operator
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig
# Create a namespace for this job # Create a namespace for this job
echo "Creating an ephemeral namespace..." echo "Creating an ephemeral namespace..."
kubectl delete namespace $NAMESPACE || true kubectl create namespace $NAMESPACE
kubectl create namespace $NAMESPACE || true
echo "Attaching the labels for secrets and cleanup" echo "Attaching the labels for secrets and cleanup"
kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true
...@@ -398,8 +385,6 @@ jobs: ...@@ -398,8 +385,6 @@ jobs:
kubectl get storageclass kubectl get storageclass
# Install Helm chart # Install Helm chart
export IMAGE_TAG=$(cat build.env)
echo $IMAGE_TAG
export VIRTUAL_ENV=/opt/dynamo/venv export VIRTUAL_ENV=/opt/dynamo/venv
export KUBE_NS=$NAMESPACE export KUBE_NS=$NAMESPACE
export ISTIO_ENABLED=true export ISTIO_ENABLED=true
...@@ -424,8 +409,45 @@ jobs: ...@@ -424,8 +409,45 @@ jobs:
--set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret
# Wait for all deployments to be ready # Wait for all deployments to be ready
timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch
cd -
deploy-test-vllm:
runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, deploy-operator, vllm]
permissions:
contents: read
strategy:
fail-fast: false
matrix:
profile:
- agg
- agg_router
- disagg
- disagg_router
name: deploy-test-vllm (${{ matrix.profile }})
env:
FRAMEWORK: vllm
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
MODEL_NAME: "Qwen/Qwen3-0.6B"
steps: &deploy-test-steps
- uses: actions/checkout@v4
- name: Setup Kubeconfig
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
# Setup kubeconfig
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
kubectl config get-contexts
- name: Run Tests
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
run: |
set -x
export KUBECONFIG=$(pwd)/.kubeconfig export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE kubectl config set-context --current --namespace=$NAMESPACE
...@@ -523,30 +545,25 @@ jobs: ...@@ -523,30 +545,25 @@ jobs:
- name: Cleanup - name: Cleanup
if: always() if: always()
timeout-minutes: 5 timeout-minutes: 5
env:
NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
PROFILE: ${{ matrix.profile }}
run: | run: |
echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig set -x
chmod 600 .kubeconfig
export KUBECONFIG=$(pwd)/.kubeconfig export KUBECONFIG=$(pwd)/.kubeconfig
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}" kubectl config set-context --current --namespace=$NAMESPACE
# For debugging purposes, list all the resources before we uninstall # For debugging purposes, list all the resources before we delete
kubectl get all kubectl get all
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..." echo "Deleting DynamoGraphDeployments for this job in namespace $NAMESPACE..."
kubectl delete dynamographdeployments --all -n $NAMESPACE || true PROFILE_SANITIZED="${PROFILE/_/-}"
kubectl delete dynamographdeployments $FRAMEWORK-$PROFILE_SANITIZED -n $NAMESPACE || true
# Uninstall the helm chart
helm ls
helm uninstall dynamo-platform || true
echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
kubectl delete namespace $NAMESPACE || true
echo "Namespace $NAMESPACE completed."
deploy-test-sglang: deploy-test-sglang:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true' if: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, operator, sglang] needs: [changed-files, deploy-operator, sglang]
permissions: permissions:
contents: read contents: read
strategy: strategy:
...@@ -566,7 +583,7 @@ jobs: ...@@ -566,7 +583,7 @@ jobs:
deploy-test-trtllm: deploy-test-trtllm:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true' if: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, operator, trtllm] needs: [changed-files, deploy-operator, trtllm]
permissions: permissions:
contents: read contents: read
strategy: strategy:
...@@ -584,3 +601,48 @@ jobs: ...@@ -584,3 +601,48 @@ jobs:
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml" DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
MODEL_NAME: "Qwen/Qwen3-0.6B" MODEL_NAME: "Qwen/Qwen3-0.6B"
steps: *deploy-test-steps steps: *deploy-test-steps
# Final teardown for the deploy tests: removes every DynamoGraphDeployment,
# uninstalls the dynamo-platform Helm release and deletes the namespace that
# the deploy-operator job published through its NAMESPACE output.
cleanup:
  runs-on: cpu-amd-m5-2xlarge
  # Run even when an upstream job failed or was skipped so the namespace is
  # not left behind.
  if: always()
  needs:
    - changed-files
    - deploy-operator
    - deploy-test-trtllm
    - deploy-test-sglang
    - deploy-test-vllm
  # Same read-only token scope the deploy-test jobs declare.
  permissions:
    contents: read
  env:
    # Empty when deploy-operator was skipped or failed before writing its
    # output; both steps below check for that and exit early.
    NAMESPACE: ${{ needs.deploy-operator.outputs.NAMESPACE }}
  steps:
    - uses: actions/checkout@v4
    - name: Setup Kubeconfig
      run: |
        set -x
        if [ -z "${NAMESPACE}" ]; then
          echo "deploy-operator published no namespace; nothing to set up."
          exit 0
        fi
        # Setup kubeconfig
        echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig
        chmod 600 .kubeconfig
        export KUBECONFIG=$(pwd)/.kubeconfig
        kubectl config set-context --current --namespace="${NAMESPACE}" --kubeconfig "${KUBECONFIG}"
        kubectl config current-context
    - name: Cleanup
      timeout-minutes: 5
      run: |
        set -x
        if [ -z "${NAMESPACE}" ]; then
          echo "deploy-operator published no namespace; nothing to clean up."
          exit 0
        fi
        # The previous step already wrote .kubeconfig into the workspace and
        # stored the namespace in its current context; only the environment
        # variable has to be re-exported because each step is a new shell.
        export KUBECONFIG=$(pwd)/.kubeconfig
        # For debugging purposes, list all the resources before we uninstall
        kubectl get all -n "${NAMESPACE}"
        # Each teardown command is best-effort (|| true) so one failure does
        # not prevent the remaining resources from being removed.
        echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
        kubectl delete dynamographdeployments --all -n "${NAMESPACE}" || true
        # Uninstall the helm chart
        helm ls --namespace "${NAMESPACE}"
        helm uninstall dynamo-platform --namespace "${NAMESPACE}" || true
        echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..."
        kubectl delete namespace "${NAMESPACE}" || true
        echo "Namespace $NAMESPACE completed."
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment