Unverified commit 5a319aed, authored by Tushar Sharma, committed by GitHub
Browse files

ci: improve deploy test reliability and add deploy-status-check (#6512)


Signed-off-by: Tushar Sharma <tusharma@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 6dbe9f6a
......@@ -70,6 +70,15 @@ jobs:
run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
# Aggregate gate for the deploy jobs: succeeds only when every job listed in
# `needs` finished with result "success" or "skipped".
deploy-status-check:
  runs-on: ubuntu-latest
  needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
  # always() lets this gate run (and report failure) even when a needed job
  # failed or was cancelled, instead of being skipped itself.
  if: always()
  steps:
    - name: "Check all deploy test jobs"
      env:
        # Hand the `needs` context to the shell through the environment rather
        # than interpolating it into the script text: the JSON also carries job
        # outputs, and a single quote inside any of them would otherwise break
        # the quoting of the command (or inject shell).
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # jq -e exits non-zero when the filter yields false, failing the step.
        printf '%s\n' "$NEEDS_JSON" | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
# ============================================================================
# Operator
......@@ -371,11 +380,12 @@ jobs:
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
runs-on: prod-default-small-v2
needs: [changed-files, deploy-operator, vllm-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 1
max-parallel: 2
matrix:
profile:
- agg
......@@ -405,11 +415,12 @@ jobs:
# Run if core, sglang, or deploy is changed
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true'
needs: [changed-files, deploy-operator, sglang-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 1
max-parallel: 2
matrix:
profile:
- agg
......@@ -437,17 +448,20 @@ jobs:
# Run if core, trtllm, or deploy is changed
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true'
needs: [changed-files, deploy-operator, trtllm-pipeline]
timeout-minutes: 25
permissions:
contents: read
strategy:
fail-fast: false
max-parallel: 1
max-parallel: 2
matrix:
profile:
- agg
- agg_router
- disagg
- disagg_router
# Disabled: trtllm disagg profiles consistently time out (~32 min) with 0% success rate.
# Re-enable once the underlying disagg deployment issue is resolved.
# - disagg
# - disagg_router
name: deploy-test-trtllm (${{ matrix.profile }})
env:
FRAMEWORK: trtllm
......@@ -495,8 +509,10 @@ jobs:
cleanup:
name: Cleanup AKS resources
runs-on: prod-default-small-v2
if: always()
needs: [deploy-operator, deploy-test-sglang, deploy-test-trtllm, deploy-test-vllm]
if: >-
always() &&
needs.deploy-status-check.result == 'success'
needs: [deploy-operator, deploy-status-check]
steps:
- name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
......
......@@ -104,6 +104,7 @@ def validate_chat_response(
@pytest.mark.deploy
@pytest.mark.post_merge
@pytest.mark.e2e
@pytest.mark.timeout(1200)
async def test_deployment(
deployment_target: DeploymentTarget,
deployment_spec: DeploymentSpec,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment