Unverified Commit e325dd13 authored by Dillon Cullinan, committed by GitHub
Browse files

ci: OPS-3359: Refactor post-merge and nightly workflows (#6388)


Signed-off-by: Dillon Cullinan <dcullinan@nvidia.com>
parent 21fce9ba
...@@ -22,21 +22,63 @@ on: ...@@ -22,21 +22,63 @@ on:
description: 'CUDA versions to build (JSON array, e.g., ["12.9", "13.0"])' description: 'CUDA versions to build (JSON array, e.g., ["12.9", "13.0"])'
required: true required: true
type: string type: string
run_tests: build_timeout_minutes:
description: 'Whether to run pytest' description: 'Timeout in minutes for the build step'
required: false
type: number
default: 60
run_cpu_only_tests:
description: 'Whether to run CPU-only tests'
required: false
type: boolean
default: true
cpu_only_test_markers:
description: 'CPU-only pytest markers'
required: false
type: string
cpu_only_test_timeout_minutes:
description: 'Timeout in minutes for CPU tests'
required: false
type: number
default: 10
run_single_gpu_tests:
description: 'Whether to run single GPU tests'
required: false required: false
type: boolean type: boolean
default: true default: true
single_gpu_test_markers:
description: 'Single GPU pytest markers'
required: false
type: string
single_gpu_test_timeout_minutes:
description: 'Timeout in minutes for single GPU tests'
required: false
type: number
default: 30
run_multi_gpu_tests: run_multi_gpu_tests:
description: 'Whether to run multi-gpu tests' description: 'Whether to run multi-gpu tests'
required: false required: false
type: boolean type: boolean
default: false default: true
multi_gpu_test_markers:
description: 'Multi GPU pytest markers'
required: false
type: string
multi_gpu_test_timeout_minutes:
description: 'Timeout in minutes for multi GPU tests'
required: false
type: number
default: 30
copy_to_acr: copy_to_acr:
description: 'Whether to copy images to ACR' description: 'Whether to copy images to ACR'
required: false required: false
type: boolean type: boolean
default: true default: true
copy_timeout_minutes:
description: 'Timeout in minutes for the copy to ACR step'
required: false
type: number
default: 10
builder_name: builder_name:
description: 'Buildkit builder name' description: 'Buildkit builder name'
required: true required: true
...@@ -61,26 +103,6 @@ on: ...@@ -61,26 +103,6 @@ on:
required: false required: false
type: boolean type: boolean
default: true default: true
build_timeout_minutes:
description: 'Timeout in minutes for the build step'
required: false
type: number
default: 60
test_gpu_timeout_minutes:
description: 'Timeout in minutes for the GPU test step'
required: false
type: number
default: 30
test_cpu_timeout_minutes:
description: 'Timeout in minutes for the CPU test step'
required: false
type: number
default: 10
copy_timeout_minutes:
description: 'Timeout in minutes for the copy to ACR step'
required: false
type: number
default: 10
secrets: secrets:
AWS_DEFAULT_REGION: AWS_DEFAULT_REGION:
required: true required: true
...@@ -121,12 +143,17 @@ jobs: ...@@ -121,12 +143,17 @@ jobs:
no_cache: ${{ inputs.no_cache }} no_cache: ${{ inputs.no_cache }}
builder_name: ${{ inputs.builder_name }} builder_name: ${{ inputs.builder_name }}
build_image: ${{ inputs.build_image }} build_image: ${{ inputs.build_image }}
run_tests: ${{ inputs.run_tests }} build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
push_image: ${{ inputs.push_image }}
run_cpu_only_tests: ${{ inputs.run_cpu_only_tests }}
cpu_only_test_markers: ${{ inputs.cpu_only_test_markers }}
cpu_only_test_timeout_minutes: ${{ inputs.cpu_only_test_timeout_minutes }}
run_single_gpu_tests: ${{ inputs.run_single_gpu_tests }}
single_gpu_test_markers: ${{ inputs.single_gpu_test_markers }}
single_gpu_test_timeout_minutes: ${{ inputs.single_gpu_test_timeout_minutes }}
run_multi_gpu_tests: ${{ inputs.run_multi_gpu_tests }} run_multi_gpu_tests: ${{ inputs.run_multi_gpu_tests }}
multi_gpu_test_markers: ${{ inputs.multi_gpu_test_markers }}
multi_gpu_test_timeout_minutes: ${{ inputs.multi_gpu_test_timeout_minutes }}
copy_to_acr: ${{ inputs.copy_to_acr && matrix.platform == 'amd64' }} # no reason to copy ARM images to ACR copy_to_acr: ${{ inputs.copy_to_acr && matrix.platform == 'amd64' }} # no reason to copy ARM images to ACR
push_image: ${{ inputs.push_image }}
build_timeout_minutes: ${{ inputs.build_timeout_minutes }}
test_gpu_timeout_minutes: ${{ inputs.test_gpu_timeout_minutes }}
test_cpu_timeout_minutes: ${{ inputs.test_cpu_timeout_minutes }}
copy_timeout_minutes: ${{ inputs.copy_timeout_minutes }} copy_timeout_minutes: ${{ inputs.copy_timeout_minutes }}
secrets: inherit secrets: inherit
...@@ -22,21 +22,63 @@ on: ...@@ -22,21 +22,63 @@ on:
description: 'CUDA version to build (e.g., 12.9, 13.0)' description: 'CUDA version to build (e.g., 12.9, 13.0)'
required: true required: true
type: string type: string
run_tests: build_timeout_minutes:
description: 'Whether to run pytest' description: 'Timeout in minutes for the build step'
required: false
type: number
default: 60
run_cpu_only_tests:
description: 'Whether to run CPU-only tests'
required: false
type: boolean
default: true
cpu_only_test_markers:
description: 'CPU-only pytest markers'
required: false
type: string
cpu_only_test_timeout_minutes:
description: 'Timeout in minutes for CPU tests'
required: false
type: number
default: 10
run_single_gpu_tests:
description: 'Whether to run single GPU tests'
required: false required: false
type: boolean type: boolean
default: true default: true
single_gpu_test_markers:
description: 'Single GPU pytest markers'
required: false
type: string
single_gpu_test_timeout_minutes:
description: 'Timeout in minutes for single GPU tests'
required: false
type: number
default: 30
run_multi_gpu_tests: run_multi_gpu_tests:
description: 'Whether to run multi-gpu tests' description: 'Whether to run multi-gpu tests'
required: false required: false
type: boolean type: boolean
default: false default: true
multi_gpu_test_markers:
description: 'Multi GPU pytest markers'
required: false
type: string
multi_gpu_test_timeout_minutes:
description: 'Timeout in minutes for multi GPU tests'
required: false
type: number
default: 30
copy_to_acr: copy_to_acr:
description: 'Whether to copy images to ACR' description: 'Whether to copy images to ACR'
required: false required: false
type: boolean type: boolean
default: true default: true
copy_timeout_minutes:
description: 'Timeout in minutes for the copy to ACR step'
required: false
type: number
default: 10
builder_name: builder_name:
description: 'Buildkit builder name' description: 'Buildkit builder name'
required: true required: true
...@@ -71,26 +113,6 @@ on: ...@@ -71,26 +113,6 @@ on:
required: false required: false
type: boolean type: boolean
default: false default: false
build_timeout_minutes:
description: 'Timeout in minutes for the build step'
required: false
type: number
default: 60
test_gpu_timeout_minutes:
description: 'Timeout in minutes for the GPU test step'
required: false
type: number
default: 30
test_cpu_timeout_minutes:
description: 'Timeout in minutes for the CPU test step'
required: false
type: number
default: 10
copy_timeout_minutes:
description: 'Timeout in minutes for the copy to ACR step'
required: false
type: number
default: 10
secrets: secrets:
AWS_DEFAULT_REGION: AWS_DEFAULT_REGION:
required: true required: true
...@@ -239,7 +261,7 @@ jobs: ...@@ -239,7 +261,7 @@ jobs:
# TEST # TEST
# ============================================================================ # ============================================================================
test: test:
if: inputs.run_tests && inputs.build_image if: ( inputs.run_cpu_only_tests || inputs.run_single_gpu_tests ) && inputs.build_image
needs: [build] needs: [build]
name: Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }} name: Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
runs-on: ${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }} runs-on: ${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
...@@ -297,11 +319,12 @@ jobs: ...@@ -297,11 +319,12 @@ jobs:
# Run CPU-only tests first (parallelized for speed) # Run CPU-only tests first (parallelized for speed)
# These are unit tests marked with gpu_0 that don't require GPU hardware # These are unit tests marked with gpu_0 that don't require GPU hardware
- name: Run CPU-only tests (parallelized) - name: Run CPU-only tests (parallelized)
timeout-minutes: ${{ inputs.test_cpu_timeout_minutes }} if: ${{ inputs.run_cpu_only_tests }}
timeout-minutes: ${{ inputs.cpu_only_test_timeout_minutes }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }} image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: ${{ format('pre_merge and {0} and gpu_0', inputs.framework) }} pytest_marks: ${{ inputs.cpu_only_test_markers }}
framework: ${{ inputs.framework }} framework: ${{ inputs.framework }}
test_type: "pre_merge_cpu" test_type: "pre_merge_cpu"
platform_arch: ${{ inputs.platform }} platform_arch: ${{ inputs.platform }}
...@@ -313,12 +336,12 @@ jobs: ...@@ -313,12 +336,12 @@ jobs:
# Run GPU tests sequentially (only on amd64 runners with GPU) # Run GPU tests sequentially (only on amd64 runners with GPU)
# These are e2e tests marked with gpu_1 that require GPU hardware # These are e2e tests marked with gpu_1 that require GPU hardware
- name: Run GPU tests (sequential) - name: Run GPU tests (sequential)
timeout-minutes: ${{ inputs.test_gpu_timeout_minutes }} timeout-minutes: ${{ inputs.single_gpu_test_timeout_minutes }}
if: ${{ inputs.platform == 'amd64' }} # We only run GPU tests on amd64 if: ( inputs.platform == 'amd64' && inputs.run_single_gpu_tests == true ) # We only run GPU tests on amd64
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }} image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: ${{ format('pre_merge and {0} and gpu_1', inputs.framework) }} pytest_marks: ${{ inputs.single_gpu_test_markers }}
framework: ${{ inputs.framework }} framework: ${{ inputs.framework }}
test_type: "pre_merge_gpu" test_type: "pre_merge_gpu"
platform_arch: ${{ inputs.platform }} platform_arch: ${{ inputs.platform }}
...@@ -374,10 +397,11 @@ jobs: ...@@ -374,10 +397,11 @@ jobs:
# Run GPU tests sequentially (only on amd64 runners with GPU) # Run GPU tests sequentially (only on amd64 runners with GPU)
# These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware # These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware
- name: Run GPU tests (sequential) - name: Run GPU tests (sequential)
timeout-minutes: ${{ inputs.multi_gpu_test_timeout_minutes }}
uses: ./.github/actions/pytest uses: ./.github/actions/pytest
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }} image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: '(gpu_2 or gpu_4) and pre_merge' pytest_marks: ${{ inputs.multi_gpu_test_markers }}
framework: ${{ inputs.framework }} framework: ${{ inputs.framework }}
test_type: "pre_merge_gpu" test_type: "pre_merge_gpu"
platform_arch: ${{ inputs.platform }} platform_arch: ${{ inputs.platform }}
...@@ -386,6 +410,7 @@ jobs: ...@@ -386,6 +410,7 @@ jobs:
parallel_mode: 'none' parallel_mode: 'none'
dind_as_sidecar: 'true' dind_as_sidecar: 'true'
# ============================================================================ # ============================================================================
# COPY TO ACR # COPY TO ACR
# ============================================================================ # ============================================================================
......
...@@ -1175,4 +1175,3 @@ jobs: ...@@ -1175,4 +1175,3 @@ jobs:
echo "Warning: Failed to send Slack notification" echo "Warning: Failed to send Slack notification"
exit 1 exit 1
fi fi
...@@ -12,27 +12,118 @@ permissions: ...@@ -12,27 +12,118 @@ permissions:
contents: read contents: read
jobs: jobs:
ci-pipeline: # ============================================================================
name: Nightly CI # FRAMEWORK PIPELINES (Build → Test → Copy)
uses: ./.github/workflows/ci-test-suite.yml # ============================================================================
# ============================================================================
# VLLM PIPELINE
# ============================================================================
vllm-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
with: with:
pipeline_type: nightly framework: vllm
include_nightly_marks: true target: runtime
image_prefix: nightly platforms: '["amd64", "arm64"]'
enable_slack_notification: true cuda_versions: '["12.9", "13.0"]'
secrets: extra_tags: |
AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
NGC_CI_ACCESS_TOKEN: ${{ secrets.NGC_CI_ACCESS_TOKEN }} cpu_only_test_markers: 'nightly and vllm and gpu_0'
CI_TOKEN: ${{ secrets.CI_TOKEN }} single_gpu_test_markers: 'nightly and vllm and gpu_1'
SCCACHE_S3_BUCKET: ${{ secrets.SCCACHE_S3_BUCKET }} single_gpu_test_timeout_minutes: 35
AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }} multi_gpu_test_markers: 'nightly and vllm and (gpu_2 or gpu_4)'
AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }} secrets: inherit
AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }}
SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }} # ============================================================================
SLACK_OPS_SUPPORT_GROUP_ID: ${{ secrets.SLACK_OPS_SUPPORT_GROUP_ID }} # SGLANG PIPELINE
AZURE_AKS_CI_KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }} # ============================================================================
HF_TOKEN: ${{ secrets.HF_TOKEN }} sglang-pipeline:
DYNAMO_INGRESS_SUFFIX: ${{ secrets.DYNAMO_INGRESS_SUFFIX }} uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
with:
framework: sglang
target: runtime
platforms: '["amd64", "arm64"]'
cuda_versions: '["12.9", "13.0"]'
extra_tags: |
${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
cpu_only_test_markers: 'nightly and sglang and gpu_0'
single_gpu_test_markers: 'nightly and sglang and gpu_1'
multi_gpu_test_markers: 'nightly and sglang and (gpu_2 or gpu_4)'
secrets: inherit
# ============================================================================
# TRTLLM PIPELINE
# ============================================================================
# Nightly TRT-LLM pipeline: calls the reusable flavor-matrix workflow to build
# the trtllm runtime image for every platform/CUDA combination listed below
# and run the nightly-marked tests against it.
trtllm-pipeline:
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- What to build ------------------------------------------------------
    framework: trtllm
    target: runtime
    # Both inputs are JSON arrays passed as strings.
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    # Builder name is derived from the run id and attempt number.
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # 120 minutes on main, 60 minutes on any other ref.
    build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
    # --- Extra image tags (each line is empty unless the ref is main) -------
    # NOTE(review): these are the same main-* tag names the post-merge
    # workflow passes; confirm the nightly run is meant to publish under them.
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
      ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
    # --- Pytest marker expressions per test tier ----------------------------
    # No test timeouts are set here, so the called workflow's defaults apply.
    cpu_only_test_markers: "nightly and trtllm and gpu_0"
    single_gpu_test_markers: "nightly and trtllm and gpu_1"
    multi_gpu_test_markers: "nightly and trtllm and (gpu_2 or gpu_4)"
############################## SLACK NOTIFICATION ##############################
# Posts a Slack message listing the failed jobs whenever any of the framework
# pipelines this job depends on has failed.
notify-slack:
  name: Notify Slack
  runs-on: prod-builder-amd-v1
  # failure() is itself a status-check function, so it already overrides the
  # implicit success() gate; the former "always() &&" prefix was redundant.
  if: failure()
  needs:
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
  permissions:
    contents: read
    # The first step lists this run's jobs through the Actions REST API.
    actions: read
  steps:
    - name: Get Failed jobs
      shell: bash
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        JOBS_JSON=$(mktemp)
        # -f makes curl exit non-zero on an HTTP error instead of handing an
        # error document to jq.
        curl -fsSL \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
          > "$JOBS_JSON"
        # Keep only the last " / "-separated segment of each failed job name.
        # The trailing literal \n is expanded later, inside the double-quoted
        # YAML string of the Slack payload.
        FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | ":failed: " + (.name | split(" / ") | .[-1]) + "\\n"' "$JOBS_JSON")
        # Never export an empty value: it would become an empty text block in
        # the Slack payload below.
        if [ -z "$FAILED_JOBS" ]; then
          FAILED_JOBS="No failed jobs were found in the first 100 jobs of this run."
        fi
        echo "$FAILED_JOBS"
        # Multi-line value, so use the heredoc form of GITHUB_ENV.
        {
          echo "FAILED_JOBS<<EOF"
          echo "$FAILED_JOBS"
          echo "EOF"
        } >> "$GITHUB_ENV"
    - name: Notify Slack
      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a #v2.1.1
      with:
        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        webhook-type: incoming-webhook
        # Plain "@ops-support" text is not turned into a mention; Slack needs
        # the <!subteam^ID> form to notify a user group.
        # NOTE(review): assumes SLACK_OPS_SUPPORT_GROUP_ID holds that group's
        # Slack user-group ID - confirm before merging.
        payload: |
          blocks:
            - type: "section"
              text:
                type: mrkdwn
                text: ":alert: *Github Nightly Pipeline Failure*"
            - type: "section"
              text:
                type: mrkdwn
                text: "<https://github.com/ai-dynamo/dynamo/actions/runs/${{ github.run_id }}|Workflow Summary>"
            - type: "section"
              text:
                type: mrkdwn
                text: "${{ env.FAILED_JOBS }}"
            - type: "section"
              text:
                type: mrkdwn
                text: "<!subteam^${{ secrets.SLACK_OPS_SUPPORT_GROUP_ID }}> Please investigate the failures above."
...@@ -12,27 +12,126 @@ permissions: ...@@ -12,27 +12,126 @@ permissions:
contents: read contents: read
jobs: jobs:
ci-pipeline: # ============================================================================
name: Post-Merge CI # FRAMEWORK PIPELINES (Build → Test → Copy)
uses: ./.github/workflows/ci-test-suite.yml # ============================================================================
# ============================================================================
# VLLM PIPELINE
# ============================================================================
vllm-pipeline:
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
with: with:
pipeline_type: post_merge framework: vllm
include_nightly_marks: false target: runtime
image_prefix: main platforms: '["amd64", "arm64"]'
enable_slack_notification: true cuda_versions: '["12.9", "13.0"]'
secrets: extra_tags: |
AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }} ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
NGC_CI_ACCESS_TOKEN: ${{ secrets.NGC_CI_ACCESS_TOKEN }} cpu_only_test_markers: '(pre_merge or post_merge) and vllm and gpu_0'
CI_TOKEN: ${{ secrets.CI_TOKEN }} single_gpu_test_markers: '(pre_merge or post_merge) and vllm and gpu_1'
SCCACHE_S3_BUCKET: ${{ secrets.SCCACHE_S3_BUCKET }} multi_gpu_test_markers: '(pre_merge or post_merge) and vllm and (gpu_2 or gpu_4)'
AZURE_ACR_HOSTNAME: ${{ secrets.AZURE_ACR_HOSTNAME }} cpu_only_test_timeout_minutes: 60
AZURE_ACR_USER: ${{ secrets.AZURE_ACR_USER }} single_gpu_test_timeout_minutes: 60
AZURE_ACR_PASSWORD: ${{ secrets.AZURE_ACR_PASSWORD }} multi_gpu_test_timeout_minutes: 60
SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }} secrets: inherit
SLACK_OPS_SUPPORT_GROUP_ID: ${{ secrets.SLACK_OPS_SUPPORT_GROUP_ID }}
AZURE_AKS_CI_KUBECONFIG_B64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }} # ============================================================================
HF_TOKEN: ${{ secrets.HF_TOKEN }} # SGLANG PIPELINE
DYNAMO_INGRESS_SUFFIX: ${{ secrets.DYNAMO_INGRESS_SUFFIX }} # ============================================================================
# Post-merge SGLang pipeline: calls the reusable flavor-matrix workflow to
# build the sglang runtime image for every platform/CUDA combination listed
# below and run the pre_merge- and post_merge-marked tests against it.
sglang-pipeline:
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- What to build ------------------------------------------------------
    framework: sglang
    target: runtime
    # Both inputs are JSON arrays passed as strings.
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["12.9", "13.0"]'
    # Builder name is derived from the run id and attempt number.
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # 120 minutes on main, 60 minutes on any other ref.
    build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
    # --- Extra image tags (each line is empty unless the ref is main) -------
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
      ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
    # --- Test tiers: pytest marker expression and timeout for each ----------
    cpu_only_test_markers: "(pre_merge or post_merge) and sglang and gpu_0"
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: "(pre_merge or post_merge) and sglang and gpu_1"
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: "(pre_merge or post_merge) and sglang and (gpu_2 or gpu_4)"
    multi_gpu_test_timeout_minutes: 60
# ============================================================================
# TRTLLM PIPELINE
# ============================================================================
# Post-merge TRT-LLM pipeline: calls the reusable flavor-matrix workflow to
# build the trtllm runtime image for every platform/CUDA combination listed
# below and run the pre_merge- and post_merge-marked tests against it.
trtllm-pipeline:
  uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml
  secrets: inherit
  with:
    # --- What to build ------------------------------------------------------
    framework: trtllm
    target: runtime
    # Both inputs are JSON arrays passed as strings.
    platforms: '["amd64", "arm64"]'
    cuda_versions: '["13.1"]'
    # Builder name is derived from the run id and attempt number.
    builder_name: b-${{ github.run_id }}-${{ github.run_attempt }}
    # 120 minutes on main, 60 minutes on any other ref.
    build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
    # --- Extra image tags (each line is empty unless the ref is main) -------
    extra_tags: |
      ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
      ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
    # --- Test tiers: pytest marker expression and timeout for each ----------
    cpu_only_test_markers: "(pre_merge or post_merge) and trtllm and gpu_0"
    cpu_only_test_timeout_minutes: 60
    single_gpu_test_markers: "(pre_merge or post_merge) and trtllm and gpu_1"
    single_gpu_test_timeout_minutes: 60
    multi_gpu_test_markers: "(pre_merge or post_merge) and trtllm and (gpu_2 or gpu_4)"
    multi_gpu_test_timeout_minutes: 60
############################## SLACK NOTIFICATION ##############################
# Posts a Slack message listing the failed jobs whenever any of the framework
# pipelines this job depends on has failed.
notify-slack:
  name: Notify Slack
  runs-on: prod-builder-amd-v1
  # failure() is itself a status-check function, so it already overrides the
  # implicit success() gate; the former "always() &&" prefix was redundant.
  if: failure()
  needs:
    - vllm-pipeline
    - sglang-pipeline
    - trtllm-pipeline
  permissions:
    contents: read
    # The first step lists this run's jobs through the Actions REST API.
    actions: read
  steps:
    - name: Get Failed jobs
      shell: bash
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        JOBS_JSON=$(mktemp)
        # -f makes curl exit non-zero on an HTTP error instead of handing an
        # error document to jq.
        curl -fsSL \
          -H "Authorization: Bearer ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
          > "$JOBS_JSON"
        # Keep only the last " / "-separated segment of each failed job name.
        # The trailing literal \n is expanded later, inside the double-quoted
        # YAML string of the Slack payload.
        FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | ":failed: " + (.name | split(" / ") | .[-1]) + "\\n"' "$JOBS_JSON")
        # Never export an empty value: it would become an empty text block in
        # the Slack payload below.
        if [ -z "$FAILED_JOBS" ]; then
          FAILED_JOBS="No failed jobs were found in the first 100 jobs of this run."
        fi
        echo "$FAILED_JOBS"
        # Multi-line value, so use the heredoc form of GITHUB_ENV.
        {
          echo "FAILED_JOBS<<EOF"
          echo "$FAILED_JOBS"
          echo "EOF"
        } >> "$GITHUB_ENV"
    - name: Notify Slack
      uses: slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a #v2.1.1
      with:
        webhook: ${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
        webhook-type: incoming-webhook
        # Plain "@ops-support" text is not turned into a mention; Slack needs
        # the <!subteam^ID> form to notify a user group.
        # NOTE(review): assumes SLACK_OPS_SUPPORT_GROUP_ID holds that group's
        # Slack user-group ID - confirm before merging.
        payload: |
          blocks:
            - type: "section"
              text:
                type: mrkdwn
                text: ":alert: *Github Post-merge Pipeline Failure*"
            - type: "section"
              text:
                type: mrkdwn
                text: "<https://github.com/ai-dynamo/dynamo/actions/runs/${{ github.run_id }}|Workflow Summary>"
            - type: "section"
              text:
                type: mrkdwn
                text: "${{ env.FAILED_JOBS }}"
            - type: "section"
              text:
                type: mrkdwn
                text: "<!subteam^${{ secrets.SLACK_OPS_SUPPORT_GROUP_ID }}> Please investigate the failures above."
...@@ -194,10 +194,12 @@ jobs: ...@@ -194,10 +194,12 @@ jobs:
${{ github.ref_name == 'main' && 'main-vllm' || '' }} ${{ github.ref_name == 'main' && 'main-vllm' || '' }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
test_gpu_timeout_minutes: 35
build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }} copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }}
cpu_only_test_markers: 'pre_merge and vllm and gpu_0'
single_gpu_test_markers: 'pre_merge and vllm and gpu_1'
single_gpu_test_timeout_minutes: 35
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
secrets: inherit secrets: inherit
# ============================================================================ # ============================================================================
...@@ -216,9 +218,11 @@ jobs: ...@@ -216,9 +218,11 @@ jobs:
${{ github.ref_name == 'main' && 'main-sglang' || '' }} ${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }} copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }}
cpu_only_test_markers: 'pre_merge and sglang and gpu_0'
single_gpu_test_markers: 'pre_merge and sglang and gpu_1'
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
secrets: inherit secrets: inherit
# ============================================================================ # ============================================================================
...@@ -237,9 +241,11 @@ jobs: ...@@ -237,9 +241,11 @@ jobs:
${{ github.ref_name == 'main' && 'main-trtllm' || '' }} ${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }} ${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }} build_timeout_minutes: ${{ github.ref_name == 'main' && 120 || 60 }}
copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }} copy_timeout_minutes: ${{ github.ref_name == 'main' && 20 || 10 }}
cpu_only_test_markers: 'pre_merge and trtllm and gpu_0'
single_gpu_test_markers: 'pre_merge and trtllm and gpu_1'
run_multi_gpu_tests: false # TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
secrets: inherit secrets: inherit
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment