"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "5755a8dec365b6d13765db18498a4e8ba76fa377"
Unverified commit 181ec2ba, authored by Dillon Cullinan, committed by GitHub
Browse files

ci: OPS-2557: Enable deploy tests now that we have better layer deduplication (#4841)


Signed-off-by: Dillon Cullinan <dcullinan@nvidia.com>
parent 0fba01c2
...@@ -169,13 +169,6 @@ jobs: ...@@ -169,13 +169,6 @@ jobs:
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- name: Login to Container Registries
uses: ./.github/actions/docker-login
with:
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
- name: Docker Tag and Push - name: Docker Tag and Push
uses: ./.github/actions/docker-tag-push uses: ./.github/actions/docker-tag-push
with: with:
...@@ -452,8 +445,6 @@ jobs: ...@@ -452,8 +445,6 @@ jobs:
echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}" echo "🧪 Fault tolerance tests completed with exit code: ${TEST_EXIT_CODE}"
exit ${TEST_EXIT_CODE} exit ${TEST_EXIT_CODE}
continue-on-error: true
- name: Process Fault Tolerance Test Results - name: Process Fault Tolerance Test Results
if: always() if: always()
run: | run: |
...@@ -500,9 +491,7 @@ jobs: ...@@ -500,9 +491,7 @@ jobs:
deploy-operator: deploy-operator:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-operator job in CI. if: needs.changed-files.outputs.has_code_changes == 'true'
#if: needs.changed-files.outputs.has_code_changes == 'true'
if: github.event.inputs.run_deploy_operator
needs: [changed-files, operator, vllm, sglang, trtllm] needs: [changed-files, operator, vllm, sglang, trtllm]
env: env:
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
...@@ -578,9 +567,8 @@ jobs: ...@@ -578,9 +567,8 @@ jobs:
deploy-test-vllm: deploy-test-vllm:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-test-vllm job in CI. # Run if push to main, or manually triggered
#if: needs.changed-files.outputs.has_code_changes == 'true' if: ( github.ref_name == 'main' || github.event.inputs.run_deploy_operator )
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, vllm] needs: [changed-files, deploy-operator, vllm]
permissions: permissions:
contents: read contents: read
...@@ -592,7 +580,7 @@ jobs: ...@@ -592,7 +580,7 @@ jobs:
- agg - agg
- agg_router - agg_router
- disagg - disagg
- disagg_router # - disagg_router
name: deploy-test-vllm (${{ matrix.profile }}) name: deploy-test-vllm (${{ matrix.profile }})
env: env:
FRAMEWORK: vllm FRAMEWORK: vllm
...@@ -720,8 +708,6 @@ jobs: ...@@ -720,8 +708,6 @@ jobs:
echo "Test passed: Response matches expected format and content" echo "Test passed: Response matches expected format and content"
fi fi
exit $TEST_RESULT exit $TEST_RESULT
continue-on-error: true
- name: Process Deployment Test Results - name: Process Deployment Test Results
if: always() if: always()
run: | run: |
...@@ -757,16 +743,42 @@ jobs: ...@@ -757,16 +743,42 @@ jobs:
kubectl config set-context --current --namespace=$NAMESPACE kubectl config set-context --current --namespace=$NAMESPACE
# For debugging purposes, list all the resources before we delete # For debugging purposes, list all the resources before we delete
kubectl get dynamographdeployments
kubectl get all kubectl get all
echo "Deleting DynamoGraphDeployments for this job in namespace $NAMESPACE..." echo "Deleting DynamoGraphDeployments for this job in namespace $NAMESPACE..."
kubectl delete dynamographdeployments ${GRAPH_NAME} -n $NAMESPACE || true kubectl delete dynamographdeployments ${GRAPH_NAME} -n $NAMESPACE || true
# For now, this job is separated from the job matrix above for easier flow control handling
# Uncomment the disagg_router matrix entry from the above job and delete the below job
# when we want to run them under the same conditions.
# Current conditions:
# - Run vllm disagg_router on all commits
# - Run rest of jobs only on push to main or manual trigger
deploy-test-vllm-disagg-router:
runs-on: cpu-amd-m5-2xlarge
if: needs.changed-files.outputs.has_code_changes == 'true'
needs: [changed-files, deploy-operator, vllm]
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 1
matrix:
profile:
- disagg_router
name: deploy-test-vllm (${{ matrix.profile }})
env:
FRAMEWORK: vllm
DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com
DEPLOYMENT_FILE: "deploy/${{ matrix.profile }}.yaml"
MODEL_NAME: "Qwen/Qwen3-0.6B"
steps: *deploy-test-steps
deploy-test-sglang: deploy-test-sglang:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-test-sglang job in CI. # Run if push to main, or manually triggered
#if: needs.changed-files.outputs.has_code_changes == 'true' if: ( github.ref_name == 'main' || github.event.inputs.run_deploy_operator )
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, sglang] needs: [changed-files, deploy-operator, sglang]
permissions: permissions:
contents: read contents: read
...@@ -787,9 +799,8 @@ jobs: ...@@ -787,9 +799,8 @@ jobs:
deploy-test-trtllm: deploy-test-trtllm:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment this when we have a way to test the deploy-test-trtllm job in CI. # Run if push to main, or manually triggered
#if: needs.changed-files.outputs.has_code_changes == 'true' if: ( github.ref_name == 'main' || github.event.inputs.run_deploy_operator )
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, trtllm] needs: [changed-files, deploy-operator, trtllm]
permissions: permissions:
contents: read contents: read
...@@ -812,10 +823,8 @@ jobs: ...@@ -812,10 +823,8 @@ jobs:
cleanup: cleanup:
runs-on: cpu-amd-m5-2xlarge runs-on: cpu-amd-m5-2xlarge
# TODO: Uncomment the below if statement when we have a way to test the cleanup job in CI. if: always()
# if: always() needs: [changed-files, deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm, deploy-test-vllm-disagg-router]
if: github.event.inputs.run_deploy_operator
needs: [changed-files, deploy-operator, deploy-test-trtllm, deploy-test-sglang, deploy-test-vllm]
steps: steps:
- name: Output Node Name - name: Output Node Name
shell: bash shell: bash
...@@ -848,6 +857,7 @@ jobs: ...@@ -848,6 +857,7 @@ jobs:
kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}" kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}"
# For debugging purposes, list all the resources before we uninstall # For debugging purposes, list all the resources before we uninstall
kubectl get dynamographdeployments
kubectl get all kubectl get all
echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..." echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..."
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment