Unverified Commit a0e41273 authored by Dillon Cullinan, committed by GitHub
Browse files

ci: Phase 1: Shared Workflow Improvements (#7345)


Signed-off-by: Dillon Cullinan <dcullinan@nvidia.com>
parent 7a6db48e
...@@ -8,8 +8,8 @@ inputs: ...@@ -8,8 +8,8 @@ inputs:
image_tag: image_tag:
description: 'Image Tag to run tests on' description: 'Image Tag to run tests on'
required: true required: true
framework: test_suite_name:
description: 'Framework name for test metrics' description: 'Test suite name for test metrics'
required: false required: false
default: 'unknown' default: 'unknown'
test_type: test_type:
...@@ -291,7 +291,7 @@ runs: ...@@ -291,7 +291,7 @@ runs:
echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)" echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)"
# Rename XML file to unique name # Rename XML file to unique name
JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml" JUNIT_NAME="pytest_test_report_${{ inputs.test_suite_name }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml"
mv "$JUNIT_FILE" "test-results/$JUNIT_NAME" mv "$JUNIT_FILE" "test-results/$JUNIT_NAME"
echo "📝 Renamed XML file to: $JUNIT_NAME" echo "📝 Renamed XML file to: $JUNIT_NAME"
else else
...@@ -314,8 +314,8 @@ runs: ...@@ -314,8 +314,8 @@ runs:
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
if: always() # Always upload test results, even if tests failed if: always() # Always upload test results, even if tests failed
with: with:
name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }} name: test-results-${{ inputs.test_suite_name }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }}-${{ github.run_id }}-${{ job.check_run_id }}
path: test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml path: test-results/pytest_test_report_${{ inputs.test_suite_name }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}_${{ github.run_id }}_${{ job.check_run_id }}.xml
retention-days: 7 retention-days: 7
- name: Upload Allure Results - name: Upload Allure Results
......
...@@ -288,7 +288,7 @@ jobs: ...@@ -288,7 +288,7 @@ jobs:
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }} image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: ${{ inputs.cpu_only_test_markers }} pytest_marks: ${{ inputs.cpu_only_test_markers }}
framework: ${{ inputs.framework }} test_suite_name: ${{ inputs.framework }}
test_type: "pre_merge_cpu" test_type: "pre_merge_cpu"
platform_arch: ${{ matrix.arch }} platform_arch: ${{ matrix.arch }}
hf_token: ${{ secrets.HF_TOKEN }} hf_token: ${{ secrets.HF_TOKEN }}
...@@ -304,7 +304,7 @@ jobs: ...@@ -304,7 +304,7 @@ jobs:
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }} image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: ${{ inputs.single_gpu_test_markers }} pytest_marks: ${{ inputs.single_gpu_test_markers }}
framework: ${{ inputs.framework }} test_suite_name: ${{ inputs.framework }}
test_type: "pre_merge_gpu" test_type: "pre_merge_gpu"
platform_arch: ${{ matrix.arch }} platform_arch: ${{ matrix.arch }}
hf_token: ${{ secrets.HF_TOKEN }} hf_token: ${{ secrets.HF_TOKEN }}
...@@ -362,7 +362,7 @@ jobs: ...@@ -362,7 +362,7 @@ jobs:
with: with:
image_tag: ${{ steps.calculate-target-tag.outputs.test_image }} image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
pytest_marks: ${{ inputs.multi_gpu_test_markers }} pytest_marks: ${{ inputs.multi_gpu_test_markers }}
framework: ${{ inputs.framework }} test_suite_name: ${{ inputs.framework }}
test_type: "pre_merge_gpu" test_type: "pre_merge_gpu"
platform_arch: amd64 platform_arch: amd64
hf_token: ${{ secrets.HF_TOKEN }} hf_token: ${{ secrets.HF_TOKEN }}
......
...@@ -269,7 +269,7 @@ jobs: ...@@ -269,7 +269,7 @@ jobs:
with: with:
image_tag: ${{ env.IMAGE_TAG }} image_tag: ${{ env.IMAGE_TAG }}
pytest_marks: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)" pytest_marks: "pre_merge and parallel and not (vllm or sglang or trtllm) and (gpu_0)"
framework: dynamo test_suite_name: dynamo
test_type: "pre_merge_parallel" test_type: "pre_merge_parallel"
platform_arch: amd64 platform_arch: amd64
hf_token: ${{ secrets.HF_TOKEN }} hf_token: ${{ secrets.HF_TOKEN }}
...@@ -301,7 +301,7 @@ jobs: ...@@ -301,7 +301,7 @@ jobs:
with: with:
image_tag: ${{ env.IMAGE_TAG }} image_tag: ${{ env.IMAGE_TAG }}
pytest_marks: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)" pytest_marks: "pre_merge and not parallel and not (vllm or sglang or trtllm) and (gpu_0)"
framework: dynamo test_suite_name: dynamo
test_type: "pre_merge_sequential" test_type: "pre_merge_sequential"
platform_arch: amd64 platform_arch: amd64
hf_token: ${{ secrets.HF_TOKEN }} hf_token: ${{ secrets.HF_TOKEN }}
......
...@@ -329,7 +329,7 @@ jobs: ...@@ -329,7 +329,7 @@ jobs:
deploy-test-vllm: deploy-test-vllm:
needs: [deploy-operator, vllm-pipeline] needs: [deploy-operator, vllm-pipeline]
uses: ./.github/workflows/shared-deploy-test-framework.yml uses: ./.github/workflows/shared-deploy-test.yml
with: with:
framework: vllm framework: vllm
profiles: '["agg", "agg_router", "disagg", "disagg_router"]' profiles: '["agg", "agg_router", "disagg", "disagg_router"]'
...@@ -341,7 +341,7 @@ jobs: ...@@ -341,7 +341,7 @@ jobs:
deploy-test-sglang: deploy-test-sglang:
needs: [deploy-operator, sglang-pipeline] needs: [deploy-operator, sglang-pipeline]
uses: ./.github/workflows/shared-deploy-test-framework.yml uses: ./.github/workflows/shared-deploy-test.yml
with: with:
framework: sglang framework: sglang
profiles: '["agg", "agg_router"]' profiles: '["agg", "agg_router"]'
...@@ -353,7 +353,7 @@ jobs: ...@@ -353,7 +353,7 @@ jobs:
deploy-test-trtllm: deploy-test-trtllm:
needs: [deploy-operator, trtllm-pipeline] needs: [deploy-operator, trtllm-pipeline]
uses: ./.github/workflows/shared-deploy-test-framework.yml uses: ./.github/workflows/shared-deploy-test.yml
with: with:
framework: trtllm framework: trtllm
profiles: '["agg", "agg_router"]' profiles: '["agg", "agg_router"]'
......
...@@ -7,14 +7,6 @@ on: ...@@ -7,14 +7,6 @@ on:
push: push:
branches: branches:
- "pull-request/[0-9]+" - "pull-request/[0-9]+"
# Note: release/* branches are handled by release.yml workflow
workflow_dispatch:
inputs:
run_deploy_operator:
description: 'Run deploy operator and deployment tests'
required: false
type: boolean
default: false
concurrency: concurrency:
# The group name is the ref_name, so that workflows on the same PR/branch have the same group name for cancelling. # The group name is the ref_name, so that workflows on the same PR/branch have the same group name for cancelling.
...@@ -25,15 +17,11 @@ env: ...@@ -25,15 +17,11 @@ env:
BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }} BUILDER_NAME: b-${{ github.run_id }}-${{ github.run_attempt }}
jobs: jobs:
# ============================================================================ # ============================================================================
# SETUP & DETECTION JOBS # SETUP & DETECTION JOBS
# ============================================================================ # ============================================================================
changed-files: changed-files:
runs-on: ubuntu-latest runs-on: ubuntu-latest
environment: ${{ github.event_name == 'workflow_dispatch' && 'protected-deploy' || '' }}
outputs: outputs:
core: ${{ steps.changes.outputs.core }} core: ${{ steps.changes.outputs.core }}
planner: ${{ steps.changes.outputs.planner }} planner: ${{ steps.changes.outputs.planner }}
...@@ -60,10 +48,29 @@ jobs: ...@@ -60,10 +48,29 @@ jobs:
backend-status-check: backend-status-check:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [changed-files, planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator] # THIS list determines blocking jobs needs:
- changed-files
- operator
- vllm-build
- vllm-dev-build
- vllm-test
- vllm-multi-gpu-test
- vllm-copy-to-acr
- sglang-build
- sglang-dev-build
- sglang-test
- sglang-multi-gpu-test
- sglang-copy-to-acr
- trtllm-build
- trtllm-dev-build
- trtllm-test
- trtllm-multi-gpu-test
- trtllm-copy-to-acr
- planner-build
- planner-test
if: always() if: always()
steps: steps:
- name: "Check all dependent jobs" - name: Check all dependent jobs
run: | run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))' echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))'
...@@ -72,15 +79,10 @@ jobs: ...@@ -72,15 +79,10 @@ jobs:
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm] needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
if: always() if: always()
steps: steps:
- name: "Check all deploy test jobs" - name: Check all deploy test jobs
run: | run: |
echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped", "cancelled"] | any($result == .))' echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped", "cancelled"] | any($result == .))'
# ============================================================================
# Operator
# ============================================================================
operator: operator:
needs: changed-files needs: changed-files
if: needs.changed-files.outputs.operator == 'true' if: needs.changed-files.outputs.operator == 'true'
...@@ -174,162 +176,348 @@ jobs: ...@@ -174,162 +176,348 @@ jobs:
echo "| \`${image_uri}\` |" >> $GITHUB_STEP_SUMMARY echo "| \`${image_uri}\` |" >> $GITHUB_STEP_SUMMARY
done done
# ============================================================================
# FRAMEWORK PIPELINES (Build → Test → Copy)
# ============================================================================
# ============================================================================ # ============================================================================
# PLANNER PIPELINE # BUILD PIPELINES
# ============================================================================ # ============================================================================
planner-pipeline:
name: planner
needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.planner == 'true'
uses: ./.github/workflows/build-test-distribute-flavor.yml
with:
framework: dynamo
builder_flavor: general
target: planner
platform: 'linux/amd64'
cpu_only: true
builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 45
run_cpu_only_tests: true
cpu_only_test_markers: 'pre_merge and planner and gpu_0'
cpu_only_test_timeout_minutes: 30
run_single_gpu_tests: false
run_multi_gpu_tests: false
copy_to_acr: false
secrets: inherit
# ============================================================================ vllm-build:
# VLLM PIPELINE name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
# ============================================================================
vllm-pipeline:
name: vllm
needs: [changed-files] needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true' if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml uses: ./.github/workflows/shared-build-image.yml
with: with:
framework: vllm framework: vllm
target: runtime target: runtime
cuda_version: '["12.9", "13.0"]'
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]'
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 60 build_timeout_minutes: 60
copy_timeout_minutes: 10
run_cpu_only_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
cpu_only_test_markers: 'pre_merge and vllm and gpu_0'
run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
single_gpu_test_markers: 'pre_merge and vllm and gpu_1'
single_gpu_test_timeout_minutes: 35
run_multi_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' }}
multi_gpu_test_markers: 'pre_merge and vllm and (gpu_2 or gpu_4)'
multi_gpu_test_timeout_minutes: 60
secrets: inherit secrets: inherit
# ============================================================================ vllm-dev-build:
# SGLANG PIPELINE name: vllm-dev
# ============================================================================
sglang-pipeline:
name: sglang
needs: [changed-files] needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true' if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true'
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml uses: ./.github/workflows/shared-build-image.yml
with: with:
framework: sglang framework: vllm
target: runtime target: dev
cuda_version: '["12.9", "13.0"]'
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]'
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
push_image: false
build_timeout_minutes: 60 build_timeout_minutes: 60
copy_timeout_minutes: 10
run_cpu_only_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' }}
cpu_only_test_markers: 'pre_merge and sglang and gpu_0'
run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' }}
single_gpu_test_markers: 'pre_merge and sglang and gpu_1'
run_multi_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' }}
multi_gpu_test_markers: 'pre_merge and sglang and (gpu_2 or gpu_4)'
multi_gpu_test_timeout_minutes: 60
secrets: inherit secrets: inherit
# ============================================================================ sglang-build:
# TRTLLM PIPELINE name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
# ============================================================================
trtllm-pipeline:
name: trtllm
needs: [changed-files] needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true' if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/build-test-distribute-flavor-matrix.yml uses: ./.github/workflows/shared-build-image.yml
with: with:
framework: trtllm framework: sglang
target: runtime target: runtime
cuda_version: '["12.9", "13.0"]'
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["13.1"]'
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 60 build_timeout_minutes: 60
copy_timeout_minutes: 10
run_cpu_only_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' }}
cpu_only_test_markers: 'pre_merge and trtllm and gpu_0'
run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' }}
single_gpu_test_markers: 'pre_merge and trtllm and gpu_1'
run_multi_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' }}
multi_gpu_test_markers: 'pre_merge and trtllm and (gpu_2 or gpu_4)'
multi_gpu_test_timeout_minutes: 60
secrets: inherit secrets: inherit
# ============================================================================ sglang-dev-build:
# DEV PIPELINES name: sglang-dev
# ============================================================================
vllm-dev-pipeline:
name: vllm-dev
needs: [changed-files] needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true'
uses: ./.github/workflows/build-flavor.yml uses: ./.github/workflows/shared-build-image.yml
with: with:
framework: vllm framework: sglang
target: dev target: dev
cuda_version: '["12.9", "13.0"]'
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]'
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 60
push_image: false push_image: false
build_timeout_minutes: 60
secrets: inherit secrets: inherit
sglang-dev-pipeline: trtllm-build:
name: sglang-dev name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
needs: [changed-files] needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/build-flavor.yml uses: ./.github/workflows/shared-build-image.yml
with: with:
framework: sglang framework: trtllm
target: dev target: runtime
cuda_version: '["13.1"]'
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["12.9", "13.0"]'
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 60 build_timeout_minutes: 60
push_image: false
secrets: inherit secrets: inherit
trtllm-dev-pipeline: trtllm-dev-build:
name: trtllm-dev name: trtllm-dev
needs: [changed-files] needs: [changed-files]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true'
uses: ./.github/workflows/build-flavor.yml uses: ./.github/workflows/shared-build-image.yml
with: with:
framework: trtllm framework: trtllm
target: dev target: dev
cuda_version: '["13.1"]'
platform: 'linux/amd64,linux/arm64' platform: 'linux/amd64,linux/arm64'
cuda_versions: '["13.1"]'
builder_name: ${{ needs.changed-files.outputs.builder_name }} builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 60
push_image: false push_image: false
build_timeout_minutes: 60
secrets: inherit secrets: inherit
# ============================================================================ planner-build:
# DEPLOYMENT JOBS name: planner # This name overlaps with other planner jobs to group them in the UI
# Deploy operator and run end-to-end tests on Kubernetes cluster needs: [changed-files]
# ============================================================================ if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.planner == 'true'
uses: ./.github/workflows/shared-build-image.yml
with:
framework: dynamo
target: planner
cuda_version: '[""]'
platform: 'linux/amd64'
builder_name: ${{ needs.changed-files.outputs.builder_name }}
build_timeout_minutes: 45
secrets: inherit
# ============================================================================
# TEST PIPELINES
# ============================================================================
vllm-test:
name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
needs: [changed-files, vllm-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: vllm
test_type: Test
amd_runner: prod-tester-amd-gpu-v1 # This runner is overridden for ARM platform
target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
platform: '["amd64", "arm64"]' # arm64 for CPU tests, single GPU tests are skipped
run_cpu_only_tests: true
cpu_only_test_markers: pre_merge and vllm and gpu_0
gpu_test_markers: pre_merge and vllm and gpu_1
gpu_test_timeout_minutes: 35
secrets: inherit
vllm-multi-gpu-test:
name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
needs: [changed-files, vllm-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: vllm
test_type: Multi-GPU Test
amd_runner: prod-tester-amd-gpu-4-v1
target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
platform: '["amd64"]' # No ARM GPUs available
run_sanity_check: false
gpu_test_markers: pre_merge and vllm and (gpu_2 or gpu_4)
gpu_test_timeout_minutes: 60
secrets: inherit
sglang-test:
name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
needs: [changed-files, sglang-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: sglang
test_type: Test
amd_runner: prod-tester-amd-gpu-v1 # This runner is overridden for ARM platform
target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
platform: '["amd64", "arm64"]' # arm64 for CPU tests, single GPU tests are skipped
run_cpu_only_tests: true
cpu_only_test_markers: pre_merge and sglang and gpu_0
gpu_test_markers: pre_merge and sglang and gpu_1
secrets: inherit
sglang-multi-gpu-test:
name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
needs: [changed-files, sglang-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: sglang
test_type: Multi-GPU Test
amd_runner: prod-tester-amd-gpu-4-v1
target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
platform: '["amd64"]' # No ARM GPUs available
run_sanity_check: false
gpu_test_markers: pre_merge and sglang and (gpu_2 or gpu_4)
gpu_test_timeout_minutes: 60
secrets: inherit
trtllm-test:
name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
needs: [changed-files, trtllm-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: trtllm
test_type: Test
amd_runner: prod-tester-amd-gpu-v1 # This runner is overridden for ARM platform
target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
cuda_version: '["13.1"]'
platform: '["amd64", "arm64"]' # arm64 for CPU tests, single GPU tests are skipped
run_cpu_only_tests: true
cpu_only_test_markers: pre_merge and trtllm and gpu_0
gpu_test_markers: pre_merge and trtllm and gpu_1
secrets: inherit
trtllm-multi-gpu-test:
name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
needs: [changed-files, trtllm-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: trtllm
test_type: Multi-GPU Test
amd_runner: prod-tester-amd-gpu-4-v1
target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
cuda_version: '["13.1"]'
platform: '["amd64"]' # No ARM GPUs available
run_sanity_check: false
gpu_test_markers: pre_merge and trtllm and (gpu_2 or gpu_4)
gpu_test_timeout_minutes: 60
secrets: inherit
planner-test:
name: planner # This name overlaps with other planner jobs to group them in the UI
needs: [changed-files, planner-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.planner == 'true'
uses: ./.github/workflows/shared-test.yml
with:
test_suite_name: planner
test_type: CPU Test
amd_runner: prod-tester-amd-gpu-v1 # TODO: CPU only DinD runner for dynamo repo
target_tag_plain: ${{ needs.planner-build.outputs.target_tag_plain }}
cuda_version: '[""]'
platform: '["amd64"]'
run_sanity_check: false
run_cpu_only_tests: true
cpu_only_test_markers: 'pre_merge and planner and gpu_0'
cpu_only_test_timeout_minutes: 30
run_gpu_tests: false
secrets: inherit
# ============================================================================
# IMAGE COMPLIANCE PIPELINES
# ============================================================================
vllm-compliance:
name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
needs: [changed-files, vllm-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-compliance.yml
with:
framework: vllm
target: runtime
target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
platform: '["amd64"]'
secrets: inherit
sglang-compliance:
name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
needs: [changed-files, sglang-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-compliance.yml
with:
framework: sglang
target: runtime
target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
platform: '["amd64"]'
secrets: inherit
trtllm-compliance:
name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
needs: [changed-files, trtllm-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true'
uses: ./.github/workflows/shared-compliance.yml
with:
framework: trtllm
target: runtime
target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
cuda_version: '["13.1"]'
platform: '["amd64"]'
secrets: inherit
planner-compliance:
name: planner # This name overlaps with other planner jobs to group them in the UI
needs: [changed-files, planner-build]
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.planner == 'true'
uses: ./.github/workflows/shared-compliance.yml
with:
framework: dynamo
target: planner
target_tag_plain: ${{ needs.planner-build.outputs.target_tag_plain }}
cuda_version: '[""]'
platform: '["amd64"]'
secrets: inherit
# ============================================================================
# IMAGE COPY PIPELINES
# ============================================================================
vllm-copy-to-acr:
name: vllm-runtime # This name overlaps with other vllm jobs to group them in the UI
needs: [changed-files, vllm-build, vllm-test]
if: |
always() &&
(needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true') &&
needs.vllm-build.result == 'success' &&
(needs.vllm-test.result == 'success' || needs.vllm-test.result == 'skipped')
uses: ./.github/workflows/shared-copy.yml
with:
target_tag_plain: ${{ needs.vllm-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
override_arch: amd64 # We are using AMD64 images only on the rest of the clusters.
copy_timeout_minutes: 10
secrets: inherit
sglang-copy-to-acr:
name: sglang-runtime # This name overlaps with other sglang jobs to group them in the UI
needs: [changed-files, sglang-build, sglang-test]
if: |
always() &&
(needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true') &&
needs.sglang-build.result == 'success' &&
(needs.sglang-test.result == 'success' || needs.sglang-test.result == 'skipped')
uses: ./.github/workflows/shared-copy.yml
with:
target_tag_plain: ${{ needs.sglang-build.outputs.target_tag_plain }}
cuda_version: '["12.9", "13.0"]'
override_arch: amd64 # We are using AMD64 images only on the rest of the clusters.
copy_timeout_minutes: 10
secrets: inherit
trtllm-copy-to-acr:
name: trtllm-runtime # This name overlaps with other trtllm jobs to group them in the UI
needs: [changed-files, trtllm-build, trtllm-test]
if: |
always() &&
(needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true') &&
needs.trtllm-build.result == 'success' &&
(needs.trtllm-test.result == 'success' || needs.trtllm-test.result == 'skipped')
uses: ./.github/workflows/shared-copy.yml
with:
target_tag_plain: ${{ needs.trtllm-build.outputs.target_tag_plain }}
cuda_version: '["13.1"]'
override_arch: amd64 # We are using AMD64 images only on the rest of the clusters.
copy_timeout_minutes: 10
secrets: inherit
# ============================================================================
# DEPLOY TEST PIPELINES
# ============================================================================
deploy-operator: deploy-operator:
if: | if: |
...@@ -346,26 +534,25 @@ jobs: ...@@ -346,26 +534,25 @@ jobs:
vcluster_name: ${{ steps.setup.outputs.vcluster_name }} vcluster_name: ${{ steps.setup.outputs.vcluster_name }}
operator_tag: ${{ steps.setup.outputs.operator_tag }} operator_tag: ${{ steps.setup.outputs.operator_tag }}
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Setup vCluster and operator - name: Setup vCluster and operator
id: setup id: setup
uses: ./.github/actions/setup-dynamo-operator uses: ./.github/actions/setup-dynamo-operator
with: with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }} kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
registry: ${{ secrets.AZURE_ACR_HOSTNAME }} registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
operator_tag: ${{ needs.operator.result == 'success' && needs.operator.outputs.operator_default_tag || 'main-operator' }} operator_tag: ${{ needs.operator.result == 'success' && needs.operator.outputs.operator_default_tag || 'main-operator' }}
hf_token: ${{ secrets.HF_TOKEN }} hf_token: ${{ secrets.HF_TOKEN }}
dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }} dockerhub_username: ${{ secrets.DOCKERHUB_LOGIN_USER }}
dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }} dockerhub_password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}
deploy-test-vllm: deploy-test-vllm:
name: vllm Deploy Test
needs: [changed-files, deploy-operator, vllm-copy-to-acr, vllm-multi-gpu-test]
if: | if: |
!cancelled() && !failure() && !cancelled() && !failure() &&
(needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true') && (needs.changed-files.outputs.vllm == 'true' || needs.changed-files.outputs.deploy == 'true')
needs.deploy-operator.result == 'success' && uses: ./.github/workflows/shared-deploy-test.yml
needs.vllm-pipeline.result == 'success'
needs: [changed-files, deploy-operator, vllm-pipeline]
uses: ./.github/workflows/shared-deploy-test-framework.yml
with: with:
framework: vllm framework: vllm
profiles: '["agg", "agg_router", "disagg", "disagg_router"]' profiles: '["agg", "agg_router", "disagg", "disagg_router"]'
...@@ -376,13 +563,12 @@ jobs: ...@@ -376,13 +563,12 @@ jobs:
secrets: inherit secrets: inherit
deploy-test-sglang: deploy-test-sglang:
name: sglang Deploy Test
needs: [changed-files, deploy-operator, sglang-copy-to-acr, sglang-multi-gpu-test]
if: | if: |
!cancelled() && !failure() && !cancelled() && !failure() &&
(needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true') && (needs.changed-files.outputs.sglang == 'true' || needs.changed-files.outputs.deploy == 'true')
needs.deploy-operator.result == 'success' && uses: ./.github/workflows/shared-deploy-test.yml
needs.sglang-pipeline.result == 'success'
needs: [changed-files, deploy-operator, sglang-pipeline]
uses: ./.github/workflows/shared-deploy-test-framework.yml
with: with:
framework: sglang framework: sglang
profiles: '["agg", "agg_router"]' profiles: '["agg", "agg_router"]'
...@@ -393,13 +579,12 @@ jobs: ...@@ -393,13 +579,12 @@ jobs:
secrets: inherit secrets: inherit
deploy-test-trtllm: deploy-test-trtllm:
name: trtllm Deploy Test
needs: [changed-files, deploy-operator, trtllm-copy-to-acr, trtllm-multi-gpu-test]
if: | if: |
!cancelled() && !failure() && !cancelled() && !failure() &&
(needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true') && (needs.changed-files.outputs.trtllm == 'true' || needs.changed-files.outputs.deploy == 'true')
needs.deploy-operator.result == 'success' && uses: ./.github/workflows/shared-deploy-test.yml
needs.trtllm-pipeline.result == 'success'
needs: [changed-files, deploy-operator, trtllm-pipeline]
uses: ./.github/workflows/shared-deploy-test-framework.yml
with: with:
framework: trtllm framework: trtllm
profiles: '["agg", "agg_router"]' profiles: '["agg", "agg_router"]'
...@@ -414,14 +599,43 @@ jobs: ...@@ -414,14 +599,43 @@ jobs:
needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm] needs: [deploy-operator, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm]
runs-on: prod-default-small-v2 runs-on: prod-default-small-v2
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: Teardown vCluster - name: Teardown vCluster
if: needs.deploy-operator.outputs.namespace != '' && needs.deploy-operator.outputs.vcluster_name != '' if: needs.deploy-operator.outputs.namespace != '' && needs.deploy-operator.outputs.vcluster_name != ''
uses: ./.github/actions/teardown-dynamo-operator uses: ./.github/actions/teardown-dynamo-operator
with: with:
kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }} kubeconfig_base64: ${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}
vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }} vcluster_name: ${{ needs.deploy-operator.outputs.vcluster_name }}
vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }} vcluster_namespace: ${{ needs.deploy-operator.outputs.namespace }}
# Best-effort removal of the buildx builder named by the changed-files job.
# `if: always()` keeps this job running even when upstream jobs fail or are
# skipped, so the builder name may be empty and is handled explicitly below.
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  if: always()
  needs:
    - changed-files
    - operator
    - planner-test
    - vllm-copy-to-acr
    - vllm-multi-gpu-test
    - sglang-copy-to-acr
    - sglang-multi-gpu-test
    - trtllm-copy-to-acr
    - trtllm-multi-gpu-test
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    # NOTE(review): presumably registers the existing builder on this runner
    # (without bootstrapping it) so the `buildx rm` below can find it; confirm
    # against the bootstrap-buildkit action. Failures here are tolerated.
    - name: Create K8s builders (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      continue-on-error: true
      with:
        builder_name: ${{ needs.changed-files.outputs.builder_name }}
        buildkit_worker_addresses: ''
        skip_bootstrap: true
    - name: Builder Cleanup in case of k8s builder
      shell: bash
      env:
        # Passed through the environment instead of being spliced into the
        # script, so the value can never be re-interpreted as shell syntax.
        BUILDER_NAME: ${{ needs.changed-files.outputs.builder_name }}
      run: |
        if [ -z "${BUILDER_NAME}" ]; then
          echo "No builder name available; nothing to clean up."
          exit 0
        fi
        # Best effort: the builder may not exist for this run.
        docker buildx rm "${BUILDER_NAME}" || true
# ============================================================================ # ============================================================================
# ALLURE REPORT # ALLURE REPORT
...@@ -429,9 +643,26 @@ jobs: ...@@ -429,9 +643,26 @@ jobs:
# ============================================================================ # ============================================================================
allure-report: allure-report:
needs: [vllm-pipeline, sglang-pipeline, trtllm-pipeline, deploy-test-vllm, deploy-test-sglang, deploy-test-trtllm] needs:
# Disabled: gh-pages branch bloated to ~1GB after 72 commits of Allure reports - changed-files
- operator
- vllm-build
- vllm-test
- vllm-multi-gpu-test
- vllm-copy-to-acr
- sglang-build
- sglang-test
- sglang-multi-gpu-test
- sglang-copy-to-acr
- trtllm-build
- trtllm-test
- trtllm-multi-gpu-test
- trtllm-copy-to-acr
- deploy-test-vllm
- deploy-test-sglang
- deploy-test-trtllm
if: false if: false
# Disabled: gh-pages branch bloated to ~1GB after 72 commits of Allure reports
# if: ${{ !cancelled() }} # if: ${{ !cancelled() }}
uses: ./.github/workflows/generate-allure-report.yml uses: ./.github/workflows/generate-allure-report.yml
with: with:
...@@ -441,28 +672,3 @@ jobs: ...@@ -441,28 +672,3 @@ jobs:
permissions: permissions:
contents: write contents: write
actions: read actions: read
# ============================================================================
# CLEANUP JOBS
# Best-effort removal of the buildx builder created for this run
# ============================================================================
clean-k8s-builder:
  name: Clean K8s builder if exists
  runs-on: prod-default-small-v2
  # Run even when upstream jobs failed or were skipped.
  if: always()
  needs: [planner-pipeline, vllm-pipeline, sglang-pipeline, trtllm-pipeline, operator, changed-files]
  steps:
    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
    - name: Create K8s builders (skip bootstrap)
      uses: ./.github/actions/bootstrap-buildkit
      continue-on-error: true
      with:
        builder_name: ${{ needs.changed-files.outputs.builder_name }}
        buildkit_worker_addresses: ''  # k8s builder
        skip_bootstrap: true
    - name: Builder Cleanup in case of k8s builder
      shell: bash
      env:
        # Kept out of the script text so the value is never parsed as shell.
        BUILDER_NAME: ${{ needs.changed-files.outputs.builder_name }}
      run: |
        # changed-files may not have produced a name when it failed or was skipped.
        if [ -z "${BUILDER_NAME}" ]; then
          echo "No builder name available; nothing to clean up."
          exit 0
        fi
        docker buildx rm "${BUILDER_NAME}" || true
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Shared Build Image
# Reusable workflow interface: inputs, secrets and outputs exposed to callers.
on:
  workflow_call:
    inputs:
      framework:
        description: 'Framework name (vllm, sglang, trtllm)'
        required: true
        type: string
      target:
        description: 'Target stage for Docker rendering'
        required: false
        type: string
        default: 'runtime'
      # Expanded with fromJson() into the build matrix; an empty-string entry
      # selects the CPU (no CUDA) variant.
      cuda_version:
        description: 'CUDA versions to build as a JSON array'
        required: true
        type: string
      # Unlike cuda_version, this value is never parsed with fromJson(): the
      # build job hands it unchanged to render.py and `docker buildx build`.
      platform:
        description: 'Target platform(s), passed verbatim to --platform (plain string, not a JSON array)'
        required: true
        type: string
      builder_name:
        description: 'Buildkit builder name'
        required: true
        type: string
      # NOTE(review): the build job only prints this value; no step sets
      # `timeout-minutes` from it. Confirm whether the timeout should be applied.
      build_timeout_minutes:
        description: 'Timeout in minutes for the build step'
        required: false
        type: number
        default: 60
      extra_tags:
        description: 'Additional tags (newline-separated, -$platform suffix auto-appended)'
        required: false
        type: string
        default: ''
      no_cache:
        description: 'Disable Docker build cache'
        required: false
        type: boolean
        default: false
      fresh_builder:
        description: 'Always create a fresh K8s BuildKit builder (skip remote worker routing)'
        required: false
        type: boolean
        default: false
      push_image:
        description: 'Push image to registry'
        required: false
        type: boolean
        default: true
      no_load:
        description: 'Do not load the image into docker'
        required: false
        type: boolean
        default: true
      show_summary:
        description: 'Show summary'
        required: false
        type: boolean
        default: false
      make_efa:
        description: 'Enable AWS EFA support in the build'
        required: false
        type: boolean
        default: false
      # NOTE(review): not referenced by any step of the build job in this workflow.
      sanitized_ref_name:
        description: 'Sanitized git ref name for branch-tagged images'
        required: false
        type: string
        default: ''
      # NOTE(review): not referenced by any step of the build job in this workflow.
      build_only:
        description: 'Build and push only skip tests and prepare branch tags'
        required: false
        type: boolean
        default: false
      extra_build_args:
        description: 'Extra build args to pass to docker build (newline-separated)'
        required: false
        type: string
        default: ''
    secrets:
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
      SCCACHE_S3_BUCKET:
        required: false
      AWS_ACCESS_KEY_ID:
        required: false
      AWS_SECRET_ACCESS_KEY:
        required: false
      HF_TOKEN:
        required: false
    outputs:
      # <framework>-<target>[-efa]; carries no commit SHA or CUDA suffix.
      target_tag_plain:
        description: 'Plain runtime image tag prefix'
        value: ${{ jobs.build.outputs.target_tag_plain }}
jobs:
  # Renders the Dockerfile, builds the image for each requested CUDA version,
  # then (for non-dev targets) builds a "-test" image on top of it.
  build:
    strategy:
      fail-fast: false
      matrix:
        cuda_version: ${{ fromJson(inputs.cuda_version) }}
    runs-on: prod-builder-v3
    # Job name: "Build multi-arch cuda<cuda_version>" when cuda_version is set,
    # "Build multi-arch cpu" when it is the empty string.
    name: Build multi-arch ${{ matrix.cuda_version == '' && 'cpu' || format('cuda{0}', matrix.cuda_version) }}
    outputs:
      target_tag_plain: ${{ steps.calculate-target-tag.outputs.target_tag_plain }}
      # NOTE(review): no step in this job writes a `test_tag_plain` output, so
      # this value is always empty; confirm whether it is still needed.
      test_tag_plain: ${{ steps.calculate-target-tag.outputs.test_tag_plain }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
        with:
          lfs: true
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        run: |
          # Plain tag: <framework>-<target>[-efa]; identical for every CUDA variant.
          TAG_BUILDER="${{ inputs.framework }}-${{ inputs.target }}"
          if [ "${{ inputs.make_efa }}" == "true" ]; then
            TAG_BUILDER+="-efa"
          fi
          TARGET_TAG_PLAIN="${TAG_BUILDER}"
          # CUDA builds get a -cuda<major> suffix; an empty cuda_version adds none.
          if [ "${{ matrix.cuda_version }}" != "" ]; then
            CUDA_VERSION="${{ matrix.cuda_version }}"
            CUDA_MAJOR="${CUDA_VERSION%%.*}"
            TAG_BUILDER+="-cuda${CUDA_MAJOR}"
          fi
          ECR_REPOSITORY="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo"
          IMAGE_TAG="${{ github.sha }}-${TAG_BUILDER}"
          TEST_IMAGE_TAG="${IMAGE_TAG}-test"
          {
            echo "target_tag_plain=${TARGET_TAG_PLAIN}"
            echo "image_uri=${ECR_REPOSITORY}:${IMAGE_TAG}"
            echo "test_image_uri=${ECR_REPOSITORY}:${TEST_IMAGE_TAG}"
          } >> "$GITHUB_OUTPUT"
      - name: Calculate Builder Flavor
        id: calculate-builder-flavor
        shell: bash
        env:
          # Read from the environment so the value is compared as data and is
          # never word-split or parsed as part of the script.
          FRAMEWORK: ${{ inputs.framework }}
        run: |
          # The three known frameworks use their own flavor; anything else is "general".
          case "${FRAMEWORK}" in
            vllm|sglang|trtllm) BUILDER_FLAVOR="${FRAMEWORK}" ;;
            *) BUILDER_FLAVOR="general" ;;
          esac
          echo "builder_flavor=${BUILDER_FLAVOR}" >> "$GITHUB_OUTPUT"
      - name: Initialize Dynamo Builder
        uses: ./.github/actions/init-dynamo-builder
        with:
          builder_name: ${{ inputs.builder_name }}
          flavor: ${{ steps.calculate-builder-flavor.outputs.builder_flavor }}
          arch: ${{ inputs.platform }}
          cuda_version: ${{ matrix.cuda_version }}
          fresh_builder: ${{ inputs.fresh_builder }}
      - name: Calculate extra tags
        id: extra-tags
        shell: bash
        env:
          EXTRA_TAGS: ${{ inputs.extra_tags }}
        run: |
          # Turn each non-empty line of extra_tags into a full ECR image reference.
          ECR_REGISTRY="${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com"
          RESULT=""
          if [ -n "$EXTRA_TAGS" ]; then
            while IFS= read -r tag; do
              if [ -n "$tag" ]; then
                RESULT+="${ECR_REGISTRY}/ai-dynamo/dynamo:${tag}"$'\n'
              fi
            done <<< "$EXTRA_TAGS"
          fi
          if [ -n "$RESULT" ]; then
            # Multi-line step output uses the heredoc-style delimiter syntax.
            {
              echo "tags<<EOF"
              echo "$RESULT"
              echo "EOF"
            } >> "$GITHUB_OUTPUT"
          else
            echo "tags=" >> "$GITHUB_OUTPUT"
          fi
      - name: Print Build Container inputs
        shell: bash
        run: |
          echo "=== Build Container Inputs ==="
          echo "image_uri: ${{ steps.calculate-target-tag.outputs.image_uri }}"
          echo "framework: ${{ inputs.framework }}"
          echo "target: ${{ inputs.target }}"
          echo "platform: ${{ inputs.platform }}"
          echo "no_cache: ${{ inputs.no_cache }}"
          echo "extra_tags: ${{ steps.extra-tags.outputs.tags }}"
          echo "push_image: ${{ inputs.push_image }}"
          echo "no_load: ${{ inputs.no_load }}"
          echo "build_timeout_minutes: ${{ inputs.build_timeout_minutes }}"
      - name: Generate Dockerfile
        shell: bash
        run: |
          echo "Generating Dockerfile for target: ${{ inputs.target }} and framework: ${{ inputs.framework }}"
          MAKE_EFA_FLAG=""
          if [ "${{ inputs.make_efa }}" == "true" ]; then
            MAKE_EFA_FLAG="--make-efa"
          fi
          # If CUDA version is empty, use empty arg to fallback to default (eg. for planner)
          if [ "${{ matrix.cuda_version }}" == "" ]; then
            CUDA_FLAG=""
          else
            CUDA_FLAG="--cuda-version=${{ matrix.cuda_version }}"
          fi
          # The optional flags are left unquoted on purpose so that empty
          # values expand to nothing instead of an empty argument.
          python ./container/render.py \
            --target=${{ inputs.target }} \
            --framework=${{ inputs.framework }} \
            --platform=${{ inputs.platform }} \
            ${CUDA_FLAG} \
            ${MAKE_EFA_FLAG} \
            --show-result \
            --output-short-filename
      # NOTE(review): inputs.build_timeout_minutes is declared as the timeout
      # for this step but is not applied as `timeout-minutes` here; confirm.
      - name: Build and Push Image
        id: build-image
        uses: ./.github/actions/docker-remote-build
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.image_uri }}
          framework: ${{ inputs.framework }}
          target: ${{ inputs.target }}
          platform: ${{ inputs.platform }}
          cuda_version: ${{ matrix.cuda_version }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          no_cache: ${{ inputs.no_cache }}
          extra_tags: ${{ steps.extra-tags.outputs.tags }}
          push_image: ${{ inputs.push_image }}
          no_load: ${{ inputs.no_load }}
          extra_build_args: |
            DYNAMO_COMMIT_SHA=${{ github.sha }}
            ${{ inputs.extra_build_args }}
      # Runs under the same condition as the test-image build that follows it.
      - name: Refresh BuildKit builder
        if: ${{ inputs.target != 'dev' }}
        uses: ./.github/actions/builder-refresher
        with:
          builder_name: ${{ inputs.builder_name }}
          flavor: ${{ steps.calculate-builder-flavor.outputs.builder_flavor }}
          arch: ${{ inputs.platform }}
          cuda_version: ${{ matrix.cuda_version }}
      - name: Build and Push Test Image
        if: ${{ inputs.target != 'dev' }}
        shell: bash
        env:
          ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
        run: |
          PLAIN_TAG="${{ steps.calculate-target-tag.outputs.target_tag_plain }}"
          CACHE_TAG="test-${PLAIN_TAG}-cache"
          # Always read the main and release caches; only main/release* refs write them.
          CACHE_ARGS="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG}"
          CACHE_ARGS+=" --cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG}"
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            CACHE_ARGS+=" --cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${CACHE_TAG},mode=max"
          elif [[ "$GITHUB_REF_NAME" =~ ^release ]]; then
            CACHE_ARGS+=" --cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:release-${CACHE_TAG},mode=max"
          fi
          # Push takes precedence; otherwise load locally unless no_load is set.
          PUSH_ARGS=""
          if [ "${{ inputs.push_image }}" == "true" ]; then
            PUSH_ARGS="--push"
          elif [ "${{ inputs.no_load }}" == "false" ]; then
            PUSH_ARGS="--load"
          fi
          NO_CACHE_ARG=""
          if [ "${{ inputs.no_cache }}" == "true" ]; then
            NO_CACHE_ARG="--no-cache"
          fi
          # Argument variables stay unquoted so they split into separate flags.
          docker buildx build \
            --progress=plain \
            ${PUSH_ARGS} \
            ${NO_CACHE_ARG} \
            --platform ${{ inputs.platform }} \
            -f container/Dockerfile.test \
            --build-arg BASE_IMAGE=${{ steps.calculate-target-tag.outputs.image_uri }} \
            ${CACHE_ARGS} \
            -t ${{ steps.calculate-target-tag.outputs.test_image_uri }} .
      - name: Show summary
        shell: bash
        # Both inputs are real booleans (workflow_call, type: boolean).
        # Comparing them with the string 'true' is always false because the
        # expression engine coerces mismatched types to numbers ('true' -> NaN),
        # so they are tested directly.
        if: ${{ inputs.push_image && inputs.show_summary }}
        env:
          # Multi-line value; supplied through the environment rather than
          # being spliced into the script text.
          EXTRA_TAGS: ${{ steps.extra-tags.outputs.tags }}
        run: |
          {
            echo "### 🐳 ${{ steps.calculate-target-tag.outputs.target_tag_plain }} Default Image"
            echo ""
            echo "| Image URI |"
            echo "|-----|"
            echo "| \`${{ steps.calculate-target-tag.outputs.image_uri }}\` |"
          } >> "$GITHUB_STEP_SUMMARY"
          if [ -n "$EXTRA_TAGS" ]; then
            {
              echo ""
              echo "### 🏷️ Extra Tags"
              echo ""
              echo "| Image URI |"
              echo "|-----|"
              # One table row per non-empty tag line.
              while IFS= read -r tag; do
                if [ -n "$tag" ]; then
                  echo "| \`${tag}\` |"
                fi
              done <<< "$EXTRA_TAGS"
            } >> "$GITHUB_STEP_SUMMARY"
          fi
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Shared Compliance Scan
# Reusable workflow interface. `cuda_version` and `platform` are JSON arrays
# that the compliance job expands into its matrix with fromJson().
on:
  workflow_call:
    inputs:
      framework:
        description: 'Framework name (vllm, sglang, trtllm)'
        type: string
        required: true
      target:
        description: 'Target docker stage'
        type: string
        required: true
      target_tag_plain:
        description: 'Plain runtime image tag prefix from the build workflow'
        type: string
        required: true
      cuda_version:
        description: 'CUDA versions to test as a JSON array'
        type: string
        required: true
      platform:
        description: 'Target platforms to test as a JSON array'
        type: string
        required: true
    # Registry credentials forwarded to the docker-login action.
    secrets:
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
jobs:
  # Resolves the runtime image for each platform/CUDA combination and runs
  # the compliance-scan action against it.
  compliance:
    strategy:
      fail-fast: false
      matrix:
        platform: ${{ fromJson(inputs.platform) }}
        cuda_version: ${{ fromJson(inputs.cuda_version) }}
    runs-on: prod-builder-v3
    # Job name: "Compliance cuda<cuda_version>, <platform>" when cuda_version
    # is set, "Compliance cpu, <platform>" when it is the empty string.
    name: Compliance ${{ matrix.cuda_version == '' && 'cpu' || format('cuda{0}', matrix.cuda_version) }}, ${{ matrix.platform }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          ECR_REPOSITORY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo
        run: |
          CUDA_VERSION="${{ matrix.cuda_version }}"
          CUDA_MAJOR="${CUDA_VERSION%%.*}"
          IMAGE_TAG="${{ github.sha }}-${{ inputs.target_tag_plain }}"
          # Planner tags never carry a CUDA suffix. An empty cuda_version must
          # not add one either, otherwise the tag would end in a bare "-cuda".
          if [[ "${{ inputs.target_tag_plain }}" != *"planner"* && -n "${CUDA_VERSION}" ]]; then
            IMAGE_TAG+="-cuda${CUDA_MAJOR}"
          fi
          RUNTIME_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}"
          echo "runtime_image=${RUNTIME_IMAGE}" >> "$GITHUB_OUTPUT"
      - name: Compliance scan
        uses: ./.github/actions/compliance-scan
        with:
          image: ${{ steps.calculate-target-tag.outputs.runtime_image }}
          artifact_name: compliance-${{ inputs.target_tag_plain }}-${{ matrix.cuda_version }}-${{ matrix.platform }}
          arch: ${{ matrix.platform }}
          framework: ${{ inputs.framework }}
          target: ${{ inputs.target }}
          cuda_version: ${{ matrix.cuda_version }}
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Shared Copy Image
# Reusable workflow interface for copying a built image between registries.
on:
  workflow_call:
    inputs:
      target_tag_plain:
        description: 'Plain runtime image tag prefix from the build workflow'
        type: string
        required: true
      # Expanded with fromJson() into the copy job matrix.
      cuda_version:
        description: 'CUDA versions to copy as a JSON array'
        type: string
        required: true
      copy_timeout_minutes:
        description: 'Timeout in minutes for the copy step'
        type: number
        required: false
        default: 10
      # Empty string means no override is forwarded to the copy action.
      override_arch:
        description: 'Override architecture for the copied image (e.g., "amd64")'
        type: string
        required: false
        default: ''
    secrets:
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
jobs:
  # Copies the image built for this commit from the source (ECR) registry to
  # the target (ACR) registry, one matrix leg per CUDA version.
  copy-to-acr:
    strategy:
      fail-fast: false
      matrix:
        cuda_version: ${{ fromJson(inputs.cuda_version) }}
    # An empty cuda_version is shown as "cpu" instead of a bare "cuda";
    # the override architecture is appended only when one is given.
    name: Copy to ACR ${{ matrix.cuda_version == '' && 'cpu' || format('cuda{0}', matrix.cuda_version) }}${{ inputs.override_arch != '' && format(', {0}', inputs.override_arch) || '' }}
    runs-on: prod-default-small-v2
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        run: |
          CUDA_VERSION="${{ matrix.cuda_version }}"
          CUDA_MAJOR="${CUDA_VERSION%%.*}"
          IMAGE_TAG="${{ github.sha }}-${{ inputs.target_tag_plain }}"
          # Planner tags never carry a CUDA suffix. An empty cuda_version must
          # not add one either, otherwise the tag would end in a bare "-cuda".
          if [[ "${{ inputs.target_tag_plain }}" != *"planner"* && -n "${CUDA_VERSION}" ]]; then
            IMAGE_TAG+="-cuda${CUDA_MAJOR}"
          fi
          echo "image_tag=${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
      # The same repository name and tag are used on both sides of the copy.
      - name: Copy image to target registry
        timeout-minutes: ${{ inputs.copy_timeout_minutes }}
        uses: ./.github/actions/skopeo-copy
        with:
          source_registry: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com
          source_image: ai-dynamo/dynamo
          source_tag: ${{ steps.calculate-target-tag.outputs.image_tag }}
          target_registry: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_image: ai-dynamo/dynamo
          target_tag: ${{ steps.calculate-target-tag.outputs.image_tag }}
          source_aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          source_aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          target_azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          target_azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          target_azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
          override_arch: ${{ inputs.override_arch }}  # We are using AMD64 images only on the rest of the clusters.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
name: Deploy Test Framework name: Shared Deploy Test
on: on:
workflow_call: workflow_call:
...@@ -42,7 +42,7 @@ jobs: ...@@ -42,7 +42,7 @@ jobs:
max-parallel: 2 max-parallel: 2
matrix: matrix:
profile: ${{ fromJSON(inputs.profiles) }} profile: ${{ fromJSON(inputs.profiles) }}
name: deploy-test-${{ inputs.framework }} (${{ matrix.profile }}) name: ${{ matrix.profile }}
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
name: Shared Local GPU and CPU Test
# Reusable workflow interface. `cuda_version` and `platform` are JSON arrays
# that the test job expands into its matrix with fromJson().
on:
  workflow_call:
    inputs:
      test_suite_name:
        description: 'Test suite name (vllm, sglang, trtllm)'
        type: string
        required: true
      test_type:
        description: 'Test type (e.g. single-gpu, multi-gpu)'
        type: string
        required: true
      # Used only for the amd64 matrix leg; other platforms use a fixed runner.
      amd_runner:
        description: 'Runner to execute tests on (amd64 only)'
        type: string
        required: true
      target_tag_plain:
        description: 'Plain runtime image tag prefix from the build workflow'
        type: string
        required: true
      cuda_version:
        description: 'CUDA versions to test as a JSON array'
        type: string
        required: true
      platform:
        description: 'Target platforms to test as a JSON array'
        type: string
        required: true
      run_sanity_check:
        description: 'Whether to run sanity check on the runtime image before executing tests'
        type: boolean
        required: false
        default: true
      # --- CPU-only test controls ---
      run_cpu_only_tests:
        description: 'Whether to run CPU-only tests'
        type: boolean
        required: false
        default: false
      cpu_only_test_markers:
        description: 'CPU-only pytest markers'
        type: string
        required: false
      cpu_only_test_timeout_minutes:
        description: 'Timeout in minutes for CPU tests'
        type: number
        required: false
        default: 10
      # --- GPU test controls ---
      run_gpu_tests:
        description: 'Whether to run GPU tests'
        type: boolean
        required: false
        default: true
      gpu_test_markers:
        description: 'GPU pytest markers'
        type: string
        required: false
      gpu_test_timeout_minutes:
        description: 'Timeout in minutes for GPU tests'
        type: number
        required: false
        default: 30
    secrets:
      AWS_DEFAULT_REGION:
        required: true
      AWS_ACCOUNT_ID:
        required: true
      AZURE_ACR_HOSTNAME:
        required: true
      AZURE_ACR_USER:
        required: true
      AZURE_ACR_PASSWORD:
        required: true
      HF_TOKEN:
        required: false
jobs:
  # Pulls the runtime and test images for this commit, optionally
  # sanity-checks the runtime image, then runs the CPU-only and GPU pytest
  # suites that the caller enabled.
  test:
    strategy:
      fail-fast: false
      matrix:
        platform: ${{ fromJson(inputs.platform) }}
        cuda_version: ${{ fromJson(inputs.cuda_version) }}
    name: ${{ inputs.test_type }} ${{ matrix.cuda_version == '' && 'cpu' || format('cuda{0}', matrix.cuda_version) }}, ${{ matrix.platform }}
    # amd64 runs on the caller-supplied runner; every other platform uses the arm runner.
    runs-on: ${{ matrix.platform == 'amd64' && inputs.amd_runner || 'prod-tester-arm-v1' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      - name: Calculate target tag
        id: calculate-target-tag
        shell: bash
        env:
          ECR_REPOSITORY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com/ai-dynamo/dynamo
        run: |
          CUDA_VERSION="${{ matrix.cuda_version }}"
          CUDA_MAJOR="${CUDA_VERSION%%.*}"
          IMAGE_TAG="${{ github.sha }}-${{ inputs.target_tag_plain }}"
          # Planner tags never carry a CUDA suffix. An empty cuda_version must
          # not add one either, otherwise the tag would end in a bare "-cuda".
          if [[ "${{ inputs.target_tag_plain }}" != *"planner"* && -n "${CUDA_VERSION}" ]]; then
            IMAGE_TAG+="-cuda${CUDA_MAJOR}"
          fi
          # The test image shares the runtime tag plus a "-test" suffix.
          RUNTIME_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}"
          TEST_IMAGE="${ECR_REPOSITORY}:${IMAGE_TAG}-test"
          {
            echo "runtime_image=${RUNTIME_IMAGE}"
            echo "test_image=${TEST_IMAGE}"
          } >> "$GITHUB_OUTPUT"
      - name: Docker Login
        uses: ./.github/actions/docker-login
        with:
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
      - name: Pull relevant images
        shell: bash
        run: |
          # retry_pull is provided by the sourced helper script.
          source ./.github/scripts/retry_docker.sh
          start_time=$(date +%s)
          retry_pull ${{ steps.calculate-target-tag.outputs.runtime_image }}
          retry_pull ${{ steps.calculate-target-tag.outputs.test_image }}
          retry_pull quay.io/minio/minio
          end_time=$(date +%s)
          duration=$((end_time - start_time))
          echo "⏱️ Image pull duration: ${duration}s"
      - name: Run Sanity Check on Runtime Image
        if: ${{ inputs.run_sanity_check }}
        shell: bash
        run: |
          echo "Running sanity check on image: ${{ steps.calculate-target-tag.outputs.runtime_image }}"
          export WORKSPACE=/workspace
          # Disable errexit around the container run so the exit code can be
          # captured and reported with a clear message.
          set +e
          docker run --rm "${{ steps.calculate-target-tag.outputs.runtime_image }}" python ${WORKSPACE}/deploy/sanity_check.py --runtime-check --no-gpu-check
          SANITY_CHECK_EXIT_CODE=$?
          set -e
          if [ ${SANITY_CHECK_EXIT_CODE} -ne 0 ]; then
            echo "ERROR: Sanity check failed - ai-dynamo packages not properly installed"
            exit ${SANITY_CHECK_EXIT_CODE}
          else
            echo "✅ Sanity check passed"
          fi
      - name: Run CPU-only tests (parallelized)
        if: ${{ inputs.run_cpu_only_tests }}
        timeout-minutes: ${{ inputs.cpu_only_test_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.cpu_only_test_markers }}
          test_suite_name: ${{ inputs.test_suite_name }}
          test_type: "pre_merge_cpu"
          platform_arch: ${{ matrix.platform }}
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'auto'
          dind_as_sidecar: 'true'
      # GPU tests are restricted to the amd64 matrix leg.
      - name: Run GPU tests (sequential)
        if: ${{ matrix.platform == 'amd64' && inputs.run_gpu_tests }}
        timeout-minutes: ${{ inputs.gpu_test_timeout_minutes }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.calculate-target-tag.outputs.test_image }}
          pytest_marks: ${{ inputs.gpu_test_markers }}
          test_suite_name: ${{ inputs.test_suite_name }}
          test_type: "pre_merge_gpu"
          platform_arch: ${{ matrix.platform }}
          hf_token: ${{ secrets.HF_TOKEN }}
          parallel_mode: 'none'
          dind_as_sidecar: 'true'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment