Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
e325dd13
Unverified
Commit
e325dd13
authored
Feb 25, 2026
by
Dillon Cullinan
Committed by
GitHub
Feb 25, 2026
Browse files
ci: OPS-3359: Refactor post-merge and nightly workflows (#6388)
Signed-off-by:
Dillon Cullinan
<
dcullinan@nvidia.com
>
parent
21fce9ba
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
356 additions
and
109 deletions
+356
-109
.github/workflows/build-test-distribute-flavor-matrix.yml
.github/workflows/build-test-distribute-flavor-matrix.yml
+55
-28
.github/workflows/build-test-distribute-flavor.yml
.github/workflows/build-test-distribute-flavor.yml
+55
-30
.github/workflows/ci-test-suite.yml
.github/workflows/ci-test-suite.yml
+0
-1
.github/workflows/nightly-ci.yml
.github/workflows/nightly-ci.yml
+114
-23
.github/workflows/post-merge-ci.yml
.github/workflows/post-merge-ci.yml
+122
-23
.github/workflows/pr.yaml
.github/workflows/pr.yaml
+10
-4
No files found.
.github/workflows/build-test-distribute-flavor-matrix.yml
View file @
e325dd13
...
@@ -22,21 +22,63 @@ on:
...
@@ -22,21 +22,63 @@ on:
description
:
'
CUDA
versions
to
build
(JSON
array,
e.g.,
["12.9",
"13.0"])'
description
:
'
CUDA
versions
to
build
(JSON
array,
e.g.,
["12.9",
"13.0"])'
required
:
true
required
:
true
type
:
string
type
:
string
run_tests
:
build_timeout_minutes
:
description
:
'
Whether
to
run
pytest'
description
:
'
Timeout
in
minutes
for
the
build
step'
required
:
false
type
:
number
default
:
60
run_cpu_only_tests
:
description
:
'
Whether
to
run
CPU-only
tests'
required
:
false
type
:
boolean
default
:
true
cpu_only_test_markers
:
description
:
'
CPU-only
pytest
markers'
required
:
false
type
:
string
cpu_only_test_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
CPU
tests'
required
:
false
type
:
number
default
:
10
run_single_gpu_tests
:
description
:
'
Whether
to
run
single
GPU
tests'
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
true
default
:
true
single_gpu_test_markers
:
description
:
'
Single
GPU
pytest
markers'
required
:
false
type
:
string
single_gpu_test_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
single
GPU
tests'
required
:
false
type
:
number
default
:
30
run_multi_gpu_tests
:
run_multi_gpu_tests
:
description
:
'
Whether
to
run
multi-gpu
tests'
description
:
'
Whether
to
run
multi-gpu
tests'
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
false
default
:
true
multi_gpu_test_markers
:
description
:
'
Multi
GPU
pytest
markers'
required
:
false
type
:
string
multi_gpu_test_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
multi
GPU
tests'
required
:
false
type
:
number
default
:
30
copy_to_acr
:
copy_to_acr
:
description
:
'
Whether
to
copy
images
to
ACR'
description
:
'
Whether
to
copy
images
to
ACR'
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
true
default
:
true
copy_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
copy
to
ACR
step'
required
:
false
type
:
number
default
:
10
builder_name
:
builder_name
:
description
:
'
Buildkit
builder
name'
description
:
'
Buildkit
builder
name'
required
:
true
required
:
true
...
@@ -61,26 +103,6 @@ on:
...
@@ -61,26 +103,6 @@ on:
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
true
default
:
true
build_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
build
step'
required
:
false
type
:
number
default
:
60
test_gpu_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
GPU
test
step'
required
:
false
type
:
number
default
:
30
test_cpu_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
CPU
test
step'
required
:
false
type
:
number
default
:
10
copy_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
copy
to
ACR
step'
required
:
false
type
:
number
default
:
10
secrets
:
secrets
:
AWS_DEFAULT_REGION
:
AWS_DEFAULT_REGION
:
required
:
true
required
:
true
...
@@ -121,12 +143,17 @@ jobs:
...
@@ -121,12 +143,17 @@ jobs:
no_cache
:
${{ inputs.no_cache }}
no_cache
:
${{ inputs.no_cache }}
builder_name
:
${{ inputs.builder_name }}
builder_name
:
${{ inputs.builder_name }}
build_image
:
${{ inputs.build_image }}
build_image
:
${{ inputs.build_image }}
run_tests
:
${{ inputs.run_tests }}
build_timeout_minutes
:
${{ inputs.build_timeout_minutes }}
push_image
:
${{ inputs.push_image }}
run_cpu_only_tests
:
${{ inputs.run_cpu_only_tests }}
cpu_only_test_markers
:
${{ inputs.cpu_only_test_markers }}
cpu_only_test_timeout_minutes
:
${{ inputs.cpu_only_test_timeout_minutes }}
run_single_gpu_tests
:
${{ inputs.run_single_gpu_tests }}
single_gpu_test_markers
:
${{ inputs.single_gpu_test_markers }}
single_gpu_test_timeout_minutes
:
${{ inputs.single_gpu_test_timeout_minutes }}
run_multi_gpu_tests
:
${{ inputs.run_multi_gpu_tests }}
run_multi_gpu_tests
:
${{ inputs.run_multi_gpu_tests }}
multi_gpu_test_markers
:
${{ inputs.multi_gpu_test_markers }}
multi_gpu_test_timeout_minutes
:
${{ inputs.multi_gpu_test_timeout_minutes }}
copy_to_acr
:
${{ inputs.copy_to_acr && matrix.platform == 'amd64' }}
# no reason to copy ARM images to ACR
copy_to_acr
:
${{ inputs.copy_to_acr && matrix.platform == 'amd64' }}
# no reason to copy ARM images to ACR
push_image
:
${{ inputs.push_image }}
build_timeout_minutes
:
${{ inputs.build_timeout_minutes }}
test_gpu_timeout_minutes
:
${{ inputs.test_gpu_timeout_minutes }}
test_cpu_timeout_minutes
:
${{ inputs.test_cpu_timeout_minutes }}
copy_timeout_minutes
:
${{ inputs.copy_timeout_minutes }}
copy_timeout_minutes
:
${{ inputs.copy_timeout_minutes }}
secrets
:
inherit
secrets
:
inherit
.github/workflows/build-test-distribute-flavor.yml
View file @
e325dd13
...
@@ -22,21 +22,63 @@ on:
...
@@ -22,21 +22,63 @@ on:
description
:
'
CUDA
version
to
build
(e.g.,
12.9,
13.0)'
description
:
'
CUDA
version
to
build
(e.g.,
12.9,
13.0)'
required
:
true
required
:
true
type
:
string
type
:
string
run_tests
:
build_timeout_minutes
:
description
:
'
Whether
to
run
pytest'
description
:
'
Timeout
in
minutes
for
the
build
step'
required
:
false
type
:
number
default
:
60
run_cpu_only_tests
:
description
:
'
Whether
to
run
CPU-only
tests'
required
:
false
type
:
boolean
default
:
true
cpu_only_test_markers
:
description
:
'
CPU-only
pytest
markers'
required
:
false
type
:
string
cpu_only_test_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
CPU
tests'
required
:
false
type
:
number
default
:
10
run_single_gpu_tests
:
description
:
'
Whether
to
run
single
GPU
tests'
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
true
default
:
true
single_gpu_test_markers
:
description
:
'
Single
GPU
pytest
markers'
required
:
false
type
:
string
single_gpu_test_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
single
GPU
tests'
required
:
false
type
:
number
default
:
30
run_multi_gpu_tests
:
run_multi_gpu_tests
:
description
:
'
Whether
to
run
multi-gpu
tests'
description
:
'
Whether
to
run
multi-gpu
tests'
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
false
default
:
true
multi_gpu_test_markers
:
description
:
'
Multi
GPU
pytest
markers'
required
:
false
type
:
string
multi_gpu_test_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
multi
GPU
tests'
required
:
false
type
:
number
default
:
30
copy_to_acr
:
copy_to_acr
:
description
:
'
Whether
to
copy
images
to
ACR'
description
:
'
Whether
to
copy
images
to
ACR'
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
true
default
:
true
copy_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
copy
to
ACR
step'
required
:
false
type
:
number
default
:
10
builder_name
:
builder_name
:
description
:
'
Buildkit
builder
name'
description
:
'
Buildkit
builder
name'
required
:
true
required
:
true
...
@@ -71,26 +113,6 @@ on:
...
@@ -71,26 +113,6 @@ on:
required
:
false
required
:
false
type
:
boolean
type
:
boolean
default
:
false
default
:
false
build_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
build
step'
required
:
false
type
:
number
default
:
60
test_gpu_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
GPU
test
step'
required
:
false
type
:
number
default
:
30
test_cpu_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
CPU
test
step'
required
:
false
type
:
number
default
:
10
copy_timeout_minutes
:
description
:
'
Timeout
in
minutes
for
the
copy
to
ACR
step'
required
:
false
type
:
number
default
:
10
secrets
:
secrets
:
AWS_DEFAULT_REGION
:
AWS_DEFAULT_REGION
:
required
:
true
required
:
true
...
@@ -239,7 +261,7 @@ jobs:
...
@@ -239,7 +261,7 @@ jobs:
# TEST
# TEST
# ============================================================================
# ============================================================================
test
:
test
:
if
:
inputs.run_
tests
&& inputs.build_image
if
:
(
inputs.run_
cpu_only_tests || inputs.run_single_gpu_tests )
&& inputs.build_image
needs
:
[
build
]
needs
:
[
build
]
name
:
Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
name
:
Test ${{ inputs.framework }}-cuda${{ inputs.cuda_version }}-${{ inputs.platform }}
runs-on
:
${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
runs-on
:
${{ inputs.platform == 'amd64' && 'prod-tester-amd-gpu-v1' || 'prod-tester-arm-v1' }}
...
@@ -297,11 +319,12 @@ jobs:
...
@@ -297,11 +319,12 @@ jobs:
# Run CPU-only tests first (parallelized for speed)
# Run CPU-only tests first (parallelized for speed)
# These are unit tests marked with gpu_0 that don't require GPU hardware
# These are unit tests marked with gpu_0 that don't require GPU hardware
-
name
:
Run CPU-only tests (parallelized)
-
name
:
Run CPU-only tests (parallelized)
timeout-minutes
:
${{ inputs.test_cpu_timeout_minutes }}
if
:
${{ inputs.run_cpu_only_tests }}
timeout-minutes
:
${{ inputs.cpu_only_test_timeout_minutes }}
uses
:
./.github/actions/pytest
uses
:
./.github/actions/pytest
with
:
with
:
image_tag
:
${{ steps.calculate-target-tag.outputs.test_image }}
image_tag
:
${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks
:
${{
format('pre_merge and {0} and gpu_0', inputs.framework)
}}
pytest_marks
:
${{
inputs.cpu_only_test_markers
}}
framework
:
${{ inputs.framework }}
framework
:
${{ inputs.framework }}
test_type
:
"
pre_merge_cpu"
test_type
:
"
pre_merge_cpu"
platform_arch
:
${{ inputs.platform }}
platform_arch
:
${{ inputs.platform }}
...
@@ -313,12 +336,12 @@ jobs:
...
@@ -313,12 +336,12 @@ jobs:
# Run GPU tests sequentially (only on amd64 runners with GPU)
# Run GPU tests sequentially (only on amd64 runners with GPU)
# These are e2e tests marked with gpu_1 that require GPU hardware
# These are e2e tests marked with gpu_1 that require GPU hardware
-
name
:
Run GPU tests (sequential)
-
name
:
Run GPU tests (sequential)
timeout-minutes
:
${{ inputs.test
_gpu
_timeout_minutes }}
timeout-minutes
:
${{ inputs.
single_gpu_
test_timeout_minutes }}
if
:
${{
inputs.platform == 'amd64'
}}
# We only run GPU tests on amd64
if
:
(
inputs.platform == 'amd64'
&& inputs.run_single_gpu_tests ==
true
)
# We only run GPU tests on amd64
uses
:
./.github/actions/pytest
uses
:
./.github/actions/pytest
with
:
with
:
image_tag
:
${{ steps.calculate-target-tag.outputs.test_image }}
image_tag
:
${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks
:
${{
format('pre_merge and {0} and gpu_1', inputs.framework)
}}
pytest_marks
:
${{
inputs.single_gpu_test_markers
}}
framework
:
${{ inputs.framework }}
framework
:
${{ inputs.framework }}
test_type
:
"
pre_merge_gpu"
test_type
:
"
pre_merge_gpu"
platform_arch
:
${{ inputs.platform }}
platform_arch
:
${{ inputs.platform }}
...
@@ -374,10 +397,11 @@ jobs:
...
@@ -374,10 +397,11 @@ jobs:
# Run GPU tests sequentially (only on amd64 runners with GPU)
# Run GPU tests sequentially (only on amd64 runners with GPU)
# These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware
# These are e2e tests marked with gpu_2 or gpu_4 that require GPU hardware
-
name
:
Run GPU tests (sequential)
-
name
:
Run GPU tests (sequential)
timeout-minutes
:
${{ inputs.multi_gpu_test_timeout_minutes }}
uses
:
./.github/actions/pytest
uses
:
./.github/actions/pytest
with
:
with
:
image_tag
:
${{ steps.calculate-target-tag.outputs.test_image }}
image_tag
:
${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks
:
'
(gpu_2
or
gpu_4)
and
pre_merge'
pytest_marks
:
${{ inputs.multi_gpu_test_markers }}
framework
:
${{ inputs.framework }}
framework
:
${{ inputs.framework }}
test_type
:
"
pre_merge_gpu"
test_type
:
"
pre_merge_gpu"
platform_arch
:
${{ inputs.platform }}
platform_arch
:
${{ inputs.platform }}
...
@@ -386,6 +410,7 @@ jobs:
...
@@ -386,6 +410,7 @@ jobs:
parallel_mode
:
'
none'
parallel_mode
:
'
none'
dind_as_sidecar
:
'
true'
dind_as_sidecar
:
'
true'
# ============================================================================
# ============================================================================
# COPY TO ACR
# COPY TO ACR
# ============================================================================
# ============================================================================
...
...
.github/workflows/ci-test-suite.yml
View file @
e325dd13
...
@@ -1175,4 +1175,3 @@ jobs:
...
@@ -1175,4 +1175,3 @@ jobs:
echo "Warning: Failed to send Slack notification"
echo "Warning: Failed to send Slack notification"
exit 1
exit 1
fi
fi
.github/workflows/nightly-ci.yml
View file @
e325dd13
...
@@ -12,27 +12,118 @@ permissions:
...
@@ -12,27 +12,118 @@ permissions:
contents
:
read
contents
:
read
jobs
:
jobs
:
ci-pipeline
:
# ============================================================================
name
:
Nightly CI
# FRAMEWORK PIPELINES (Build → Test → Copy)
uses
:
./.github/workflows/ci-test-suite.yml
# ============================================================================
# ============================================================================
# VLLM PIPELINE
# ============================================================================
vllm-pipeline
:
uses
:
./.github/workflows/build-test-distribute-flavor-matrix.yml
with
:
with
:
pipeline_type
:
nightly
framework
:
vllm
include_nightly_marks
:
true
target
:
runtime
image_prefix
:
nightly
platforms
:
'
["amd64",
"arm64"]'
enable_slack_notification
:
true
cuda_versions
:
'
["12.9",
"13.0"]'
secrets
:
extra_tags
:
|
AWS_ACCOUNT_ID
:
${{ secrets.AWS_ACCOUNT_ID }}
${{ github.ref_name == 'main' && 'main-vllm' || '' }}
AWS_DEFAULT_REGION
:
${{ secrets.AWS_DEFAULT_REGION }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
AWS_ACCESS_KEY_ID
:
${{ secrets.AWS_ACCESS_KEY_ID }}
builder_name
:
b-${{ github.run_id }}-${{ github.run_attempt }}
AWS_SECRET_ACCESS_KEY
:
${{ secrets.AWS_SECRET_ACCESS_KEY }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
NGC_CI_ACCESS_TOKEN
:
${{ secrets.NGC_CI_ACCESS_TOKEN }}
cpu_only_test_markers
:
'
nightly
and
vllm
and
gpu_0'
CI_TOKEN
:
${{ secrets.CI_TOKEN }}
single_gpu_test_markers
:
'
nightly
and
vllm
and
gpu_1'
SCCACHE_S3_BUCKET
:
${{ secrets.SCCACHE_S3_BUCKET }}
single_gpu_test_timeout_minutes
:
35
AZURE_ACR_HOSTNAME
:
${{ secrets.AZURE_ACR_HOSTNAME }}
multi_gpu_test_markers
:
'
nightly
and
vllm
and
(gpu_2
or
gpu_4)'
AZURE_ACR_USER
:
${{ secrets.AZURE_ACR_USER }}
secrets
:
inherit
AZURE_ACR_PASSWORD
:
${{ secrets.AZURE_ACR_PASSWORD }}
SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL
:
${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
# ============================================================================
SLACK_OPS_SUPPORT_GROUP_ID
:
${{ secrets.SLACK_OPS_SUPPORT_GROUP_ID }}
# SGLANG PIPELINE
AZURE_AKS_CI_KUBECONFIG_B64
:
${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# ============================================================================
HF_TOKEN
:
${{ secrets.HF_TOKEN }}
sglang-pipeline
:
DYNAMO_INGRESS_SUFFIX
:
${{ secrets.DYNAMO_INGRESS_SUFFIX }}
uses
:
./.github/workflows/build-test-distribute-flavor-matrix.yml
with
:
framework
:
sglang
target
:
runtime
platforms
:
'
["amd64",
"arm64"]'
cuda_versions
:
'
["12.9",
"13.0"]'
extra_tags
:
|
${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name
:
b-${{ github.run_id }}-${{ github.run_attempt }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
cpu_only_test_markers
:
'
nightly
and
sglang
and
gpu_0'
single_gpu_test_markers
:
'
nightly
and
sglang
and
gpu_1'
multi_gpu_test_markers
:
'
nightly
and
sglang
and
(gpu_2
or
gpu_4)'
secrets
:
inherit
# ============================================================================
# TRTLLM PIPELINE
# ============================================================================
trtllm-pipeline
:
uses
:
./.github/workflows/build-test-distribute-flavor-matrix.yml
with
:
framework
:
trtllm
target
:
runtime
platforms
:
'
["amd64",
"arm64"]'
cuda_versions
:
'
["13.1"]'
extra_tags
:
|
${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
builder_name
:
b-${{ github.run_id }}-${{ github.run_attempt }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
cpu_only_test_markers
:
'
nightly
and
trtllm
and
gpu_0'
single_gpu_test_markers
:
'
nightly
and
trtllm
and
gpu_1'
multi_gpu_test_markers
:
'
nightly
and
trtllm
and
(gpu_2
or
gpu_4)'
secrets
:
inherit
############################## SLACK NOTIFICATION ##############################
notify-slack
:
name
:
Notify Slack
runs-on
:
prod-builder-amd-v1
if
:
always() && failure()
needs
:
[
vllm-pipeline
,
sglang-pipeline
,
trtllm-pipeline
]
permissions
:
contents
:
read
steps
:
-
name
:
Get Failed jobs
shell
:
bash
env
:
GITHUB_TOKEN
:
${{ secrets.GITHUB_TOKEN }}
run
:
|
JOBS_JSON=$(mktemp)
curl -sSL \
-H "Authorization: Bearer ${GITHUB_TOKEN}" \
-H "Accept: application/vnd.github+json" \
"https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
>$JOBS_JSON
FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | ":failed: " + (.name | split(" / ") | .[-1]) + "\\n"' "$JOBS_JSON")
echo $FAILED_JOBS
{
echo "FAILED_JOBS<<EOF"
echo "$FAILED_JOBS"
echo "EOF"
} >> "$GITHUB_ENV"
-
name
:
Notify Slack
uses
:
slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a
#v2.1.1
with
:
webhook
:
${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
webhook-type
:
incoming-webhook
payload
:
|
blocks:
- type: "section"
text:
type: mrkdwn
text: ":alert: *Github Nightly Pipeline Failure*"
- type: "section"
text:
type: mrkdwn
text: "<https://github.com/ai-dynamo/dynamo/actions/runs/${{ github.run_id }}|Workflow Summary>"
- type: "section"
text:
type: mrkdwn
text: "${{ env.FAILED_JOBS }}"
- type: "section"
text:
type: mrkdwn
text: "@ops-support Please investigate the failures above."
.github/workflows/post-merge-ci.yml
View file @
e325dd13
...
@@ -12,27 +12,126 @@ permissions:
...
@@ -12,27 +12,126 @@ permissions:
contents
:
read
contents
:
read
jobs
:
jobs
:
ci-pipeline
:
# ============================================================================
name
:
Post-Merge CI
# FRAMEWORK PIPELINES (Build → Test → Copy)
uses
:
./.github/workflows/ci-test-suite.yml
# ============================================================================
# ============================================================================
# VLLM PIPELINE
# ============================================================================
vllm-pipeline
:
uses
:
./.github/workflows/build-test-distribute-flavor-matrix.yml
with
:
with
:
pipeline_type
:
post_merge
framework
:
vllm
include_nightly_marks
:
false
target
:
runtime
image_prefix
:
main
platforms
:
'
["amd64",
"arm64"]'
enable_slack_notification
:
true
cuda_versions
:
'
["12.9",
"13.0"]'
secrets
:
extra_tags
:
|
AWS_ACCOUNT_ID
:
${{ secrets.AWS_ACCOUNT_ID }}
${{ github.ref_name == 'main' && 'main-vllm' || '' }}
AWS_DEFAULT_REGION
:
${{ secrets.AWS_DEFAULT_REGION }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
AWS_ACCESS_KEY_ID
:
${{ secrets.AWS_ACCESS_KEY_ID }}
builder_name
:
b-${{ github.run_id }}-${{ github.run_attempt }}
AWS_SECRET_ACCESS_KEY
:
${{ secrets.AWS_SECRET_ACCESS_KEY }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
NGC_CI_ACCESS_TOKEN
:
${{ secrets.NGC_CI_ACCESS_TOKEN }}
cpu_only_test_markers
:
'
(pre_merge
or
post_merge)
and
vllm
and
gpu_0'
CI_TOKEN
:
${{ secrets.CI_TOKEN }}
single_gpu_test_markers
:
'
(pre_merge
or
post_merge)
and
vllm
and
gpu_1'
SCCACHE_S3_BUCKET
:
${{ secrets.SCCACHE_S3_BUCKET }}
multi_gpu_test_markers
:
'
(pre_merge
or
post_merge)
and
vllm
and
(gpu_2
or
gpu_4)'
AZURE_ACR_HOSTNAME
:
${{ secrets.AZURE_ACR_HOSTNAME }}
cpu_only_test_timeout_minutes
:
60
AZURE_ACR_USER
:
${{ secrets.AZURE_ACR_USER }}
single_gpu_test_timeout_minutes
:
60
AZURE_ACR_PASSWORD
:
${{ secrets.AZURE_ACR_PASSWORD }}
multi_gpu_test_timeout_minutes
:
60
SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL
:
${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
secrets
:
inherit
SLACK_OPS_SUPPORT_GROUP_ID
:
${{ secrets.SLACK_OPS_SUPPORT_GROUP_ID }}
AZURE_AKS_CI_KUBECONFIG_B64
:
${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
# ============================================================================
HF_TOKEN
:
${{ secrets.HF_TOKEN }}
# SGLANG PIPELINE
DYNAMO_INGRESS_SUFFIX
:
${{ secrets.DYNAMO_INGRESS_SUFFIX }}
# ============================================================================
sglang-pipeline
:
uses
:
./.github/workflows/build-test-distribute-flavor-matrix.yml
with
:
framework
:
sglang
target
:
runtime
platforms
:
'
["amd64",
"arm64"]'
cuda_versions
:
'
["12.9",
"13.0"]'
extra_tags
:
|
${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name
:
b-${{ github.run_id }}-${{ github.run_attempt }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
cpu_only_test_markers
:
'
(pre_merge
or
post_merge)
and
sglang
and
gpu_0'
single_gpu_test_markers
:
'
(pre_merge
or
post_merge)
and
sglang
and
gpu_1'
multi_gpu_test_markers
:
'
(pre_merge
or
post_merge)
and
sglang
and
(gpu_2
or
gpu_4)'
cpu_only_test_timeout_minutes
:
60
single_gpu_test_timeout_minutes
:
60
multi_gpu_test_timeout_minutes
:
60
secrets
:
inherit
# ============================================================================
# TRTLLM PIPELINE
# ============================================================================
trtllm-pipeline
:
uses
:
./.github/workflows/build-test-distribute-flavor-matrix.yml
with
:
framework
:
trtllm
target
:
runtime
platforms
:
'
["amd64",
"arm64"]'
cuda_versions
:
'
["13.1"]'
extra_tags
:
|
${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
builder_name
:
b-${{ github.run_id }}-${{ github.run_attempt }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
cpu_only_test_markers
:
'
(pre_merge
or
post_merge)
and
trtllm
and
gpu_0'
single_gpu_test_markers
:
'
(pre_merge
or
post_merge)
and
trtllm
and
gpu_1'
multi_gpu_test_markers
:
'
(pre_merge
or
post_merge)
and
trtllm
and
(gpu_2
or
gpu_4)'
cpu_only_test_timeout_minutes
:
60
single_gpu_test_timeout_minutes
:
60
multi_gpu_test_timeout_minutes
:
60
secrets
:
inherit
############################## SLACK NOTIFICATION ##############################
notify-slack
:
name
:
Notify Slack
runs-on
:
prod-builder-amd-v1
if
:
always() && failure()
needs
:
[
vllm-pipeline
,
sglang-pipeline
,
trtllm-pipeline
]
permissions
:
contents
:
read
steps
:
-
name
:
Get Failed jobs
shell
:
bash
env
:
GITHUB_TOKEN
:
${{ secrets.GITHUB_TOKEN }}
run
:
|
JOBS_JSON=$(mktemp)
curl -sSL \
-H "Authorization: Bearer ${GITHUB_TOKEN}" \
-H "Accept: application/vnd.github+json" \
"https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/jobs?per_page=100" \
>$JOBS_JSON
FAILED_JOBS=$(jq -r '.jobs[] | select(.conclusion == "failure") | ":failed: " + (.name | split(" / ") | .[-1]) + "\\n"' "$JOBS_JSON")
echo $FAILED_JOBS
{
echo "FAILED_JOBS<<EOF"
echo "$FAILED_JOBS"
echo "EOF"
} >> "$GITHUB_ENV"
-
name
:
Notify Slack
uses
:
slackapi/slack-github-action@91efab103c0de0a537f72a35f6b8cda0ee76bf0a
#v2.1.1
with
:
webhook
:
${{ secrets.SLACK_NOTIFY_NIGHTLY_WEBHOOK_URL }}
webhook-type
:
incoming-webhook
payload
:
|
blocks:
- type: "section"
text:
type: mrkdwn
text: ":alert: *Github Post-merge Pipeline Failure*"
- type: "section"
text:
type: mrkdwn
text: "<https://github.com/ai-dynamo/dynamo/actions/runs/${{ github.run_id }}|Workflow Summary>"
- type: "section"
text:
type: mrkdwn
text: "${{ env.FAILED_JOBS }}"
- type: "section"
text:
type: mrkdwn
text: "@ops-support Please investigate the failures above."
.github/workflows/pr.yaml
View file @
e325dd13
...
@@ -194,10 +194,12 @@ jobs:
...
@@ -194,10 +194,12 @@ jobs:
${{ github.ref_name == 'main' && 'main-vllm' || '' }}
${{ github.ref_name == 'main' && 'main-vllm' || '' }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
${{ github.ref_name == 'main' && format('main-vllm-{0}', github.sha) || '' }}
builder_name
:
${{ needs.changed-files.outputs.builder_name }}
builder_name
:
${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests
:
false
# TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
test_gpu_timeout_minutes
:
35
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
copy_timeout_minutes
:
${{ github.ref_name == 'main' && 20 || 10 }}
copy_timeout_minutes
:
${{ github.ref_name == 'main' && 20 || 10 }}
cpu_only_test_markers
:
'
pre_merge
and
vllm
and
gpu_0'
single_gpu_test_markers
:
'
pre_merge
and
vllm
and
gpu_1'
single_gpu_test_timeout_minutes
:
35
run_multi_gpu_tests
:
false
# TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
secrets
:
inherit
secrets
:
inherit
# ============================================================================
# ============================================================================
...
@@ -216,9 +218,11 @@ jobs:
...
@@ -216,9 +218,11 @@ jobs:
${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && 'main-sglang' || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
${{ github.ref_name == 'main' && format('main-sglang-{0}', github.sha) || '' }}
builder_name
:
${{ needs.changed-files.outputs.builder_name }}
builder_name
:
${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests
:
false
# TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
copy_timeout_minutes
:
${{ github.ref_name == 'main' && 20 || 10 }}
copy_timeout_minutes
:
${{ github.ref_name == 'main' && 20 || 10 }}
cpu_only_test_markers
:
'
pre_merge
and
sglang
and
gpu_0'
single_gpu_test_markers
:
'
pre_merge
and
sglang
and
gpu_1'
run_multi_gpu_tests
:
false
# TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
secrets
:
inherit
secrets
:
inherit
# ============================================================================
# ============================================================================
...
@@ -237,9 +241,11 @@ jobs:
...
@@ -237,9 +241,11 @@ jobs:
${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && 'main-trtllm' || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
${{ github.ref_name == 'main' && format('main-trtllm-{0}', github.sha) || '' }}
builder_name
:
${{ needs.changed-files.outputs.builder_name }}
builder_name
:
${{ needs.changed-files.outputs.builder_name }}
run_multi_gpu_tests
:
false
# TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
build_timeout_minutes
:
${{ github.ref_name == 'main' && 120 || 60 }}
copy_timeout_minutes
:
${{ github.ref_name == 'main' && 20 || 10 }}
copy_timeout_minutes
:
${{ github.ref_name == 'main' && 20 || 10 }}
cpu_only_test_markers
:
'
pre_merge
and
trtllm
and
gpu_0'
single_gpu_test_markers
:
'
pre_merge
and
trtllm
and
gpu_1'
run_multi_gpu_tests
:
false
# TODO: Dmitry is working on fixing markers for multi-GPU tests, can enable after that is resolved
secrets
:
inherit
secrets
:
inherit
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment