# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Nightly CI: creates one K8s buildx builder, runs the vllm / sglang / trtllm
# framework pipelines and the dynamo runtime pipeline against it, removes the
# builder afterwards, and posts the failed jobs to Slack when anything fails.

name: Nightly CI Pipeline

on:
  schedule:
    - cron: '0 8 * * *'  # Every day at 12:00 AM PST (08:00 UTC)
  workflow_dispatch:  # Allow manual triggering for testing

permissions:
  contents: read

env:
  # Unique per run *and* per attempt.
  BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}

jobs:
  # ============================================================================
  # PRE-WARM K8S BUILDER
  # ============================================================================
  create-fresh-builder:
    name: Create fresh K8s builder
    runs-on: prod-default-small-v2
    permissions:
      contents: read
    outputs:
      # Downstream jobs read the builder name from here rather than from `env`.
      builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Export builder name
        id: export-builder-name
        run: |
          echo "builder_name=${{ env.BUILDER_NAME }}" >> "$GITHUB_OUTPUT"

      - name: Create and bootstrap fresh K8s builder
        uses: ./.github/actions/bootstrap-buildkit
        with:
          builder_name: ${{ steps.export-builder-name.outputs.builder_name }}
          buildkit_worker_addresses: ''
          suppress_fallback_warning: 'true'

  # ============================================================================
  # FRAMEWORK PIPELINES (Build → Test → Copy)
  # ============================================================================

  # ============================================================================
  # VLLM PIPELINE
  # ============================================================================
  vllm-pipeline:
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    needs: [create-fresh-builder]
    with:
      framework: vllm
      target: runtime
      fresh_builder: true
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["12.9", "13.0"]'
      # Both expressions evaluate to '' unless the run is on `main`.
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
        ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
      builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
      build_timeout_minutes: 180
      cpu_only_test_markers: 'vllm and gpu_0'
      single_gpu_test_markers: 'vllm and gpu_1'
      single_gpu_test_timeout_minutes: 300
      multi_gpu_test_markers: 'vllm and (gpu_2 or gpu_4)'
      multi_gpu_test_timeout_minutes: 120
    secrets: inherit

  # ============================================================================
  # SGLANG PIPELINE
  # ============================================================================
  sglang-pipeline:
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    needs: [create-fresh-builder]
    with:
      framework: sglang
      target: runtime
      fresh_builder: true
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["12.9", "13.0"]'
      # Both expressions evaluate to '' unless the run is on `main`.
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
        ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
      builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
      build_timeout_minutes: 180
      cpu_only_test_markers: 'sglang and gpu_0'
      # `not skip_in_nightly` excludes tests flagged via the skip_in_nightly
      # pytest marker (see pyproject.toml) so they only run in pre_merge /
      # post_merge. DYN-2784 is the first user (test_sglang_indexers_sync).
      single_gpu_test_markers: 'sglang and gpu_1 and not skip_in_nightly'
      single_gpu_test_timeout_minutes: 300
      multi_gpu_test_markers: 'sglang and (gpu_2 or gpu_4)'
      multi_gpu_test_timeout_minutes: 120
    secrets: inherit

  # ============================================================================
  # TRTLLM PIPELINE
  # ============================================================================
  trtllm-pipeline:
    uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
    needs: [create-fresh-builder]
    with:
      framework: trtllm
      target: runtime
      fresh_builder: true
      platform: 'linux/amd64,linux/arm64'
      cuda_versions: '["13.1"]'
      # Both expressions evaluate to '' unless the run is on `main`.
      extra_tags: |
        ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
        ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
      builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
      build_timeout_minutes: 180
      cpu_only_test_markers: 'trtllm and gpu_0'
      single_gpu_test_markers: 'trtllm and gpu_1'
      single_gpu_test_timeout_minutes: 300
      multi_gpu_test_markers: 'trtllm and (gpu_2 or gpu_4)'
      multi_gpu_test_timeout_minutes: 120
    secrets: inherit

  # ============================================================================
  # DYNAMO RUNTIME PIPELINE
  # ============================================================================
  dynamo-pipeline:
    name: dynamo-runtime
    needs: [create-fresh-builder]
    uses: ./.github/workflows/dynamo-pipeline.yml
    with:
      builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
      fresh_builder: true
      no_cache: true
      build_timeout_minutes: 90
      # TODO: widen beyond `pre_merge` — today it picks up tests
      # (e.g. fault_tolerance/deploy/*) that fail in this container-only
      # context. Matches the coverage of the old container-validation-dynamo
      # workflow.
      cpu_parallel_test_markers: 'pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)'
      cpu_sequential_test_markers: 'pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)'
      gpu_test_markers: 'pre_merge and none and gpu_1'
    secrets: inherit

  # ============================================================================
  # CLEANUP
  # ============================================================================
  clean-k8s-builder:
    name: Clean K8s builder if exists
    runs-on: prod-default-small-v2
    # Run regardless of how the pipelines finished.
    if: always()
    needs:
      - vllm-pipeline
      - sglang-pipeline
      - trtllm-pipeline
      - dynamo-pipeline
      - create-fresh-builder
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2

      - name: Register K8s builder context (skip bootstrap)
        uses: ./.github/actions/bootstrap-buildkit
        # Best effort: a failure here must not block the removal step below.
        continue-on-error: true
        with:
          builder_name: ${{ needs.create-fresh-builder.outputs.builder_name }}
          buildkit_worker_addresses: ''
          skip_bootstrap: 'true'

      - name: Remove K8s builder
        shell: bash
        env:
          # Remove the builder that was actually created. env.BUILDER_NAME
          # embeds run_attempt, so on a re-run it may differ from the name
          # create-fresh-builder exported; fall back to it only when that
          # output is empty.
          TARGET_BUILDER: ${{ needs.create-fresh-builder.outputs.builder_name || env.BUILDER_NAME }}
        run: |
          docker buildx rm "$TARGET_BUILDER" || true

  ############################## SLACK NOTIFICATION ##############################
  notify-slack:
    name: Notify Slack
    runs-on: ubuntu-slim
    if: always() && failure()
    needs:
      - vllm-pipeline
      - sglang-pipeline
      - trtllm-pipeline
      - dynamo-pipeline
    permissions:
      contents: read
    steps:
      - name: Get Failed jobs
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Fetch the jobs of this run (first page only, up to 100 jobs).
          JOBS_JSON=$(mktemp)
          curl -sSL \
            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
            >"$JOBS_JSON"

          # One entry per failed job. Names are split on " / "; when there are
          # more than two segments only the first and last are shown. Each
          # entry ends with a literal backslash-n for the Slack payload.
          FAILED_JOBS=$(jq -r '
            .jobs[]
            | select(.conclusion == "failure")
            | .name
            | split(" / ")
            | if length > 2
                then ":failed: " + .[0] + " > " + .[-1]
                else ":failed: " + .[-1]
              end
            | . + "\\n"
          ' "$JOBS_JSON")
          echo "$FAILED_JOBS"

          # Multiline values need the NAME<<DELIMITER form in GITHUB_ENV.
          {
            echo "FAILED_JOBS<<EOF"
            echo "$FAILED_JOBS"
            echo "EOF"
          } >> "$GITHUB_ENV"

      - name: Notify Slack
        uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a  # v2.1.1
        with:
          webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
          webhook-type: incoming-webhook
          payload: |
            blocks:
              - type: "section"
                text:
                  type: mrkdwn
                  text: ":alert: *Github Nightly Pipeline Failure*"
              - type: "section"
                text:
                  type: mrkdwn
                  text: "<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Workflow Summary>"
              - type: "section"
                text:
                  type: mrkdwn
                  text: "${{ env.FAILED_JOBS }}"
              - type: "section"
                text:
                  type: mrkdwn
                  text: "@ops-support Please investigate the failures above."